├── lib ├── dict │ ├── custom │ │ ├── mydict.dic │ │ ├── ext_stopword.dic │ │ └── single_word_low_freq.dic │ ├── preposition.dic │ ├── suffix.dic │ ├── stopword.dic │ ├── surname.dic │ └── quantifier.dic ├── DictSegment.js ├── config.js ├── LexemePath.js ├── consts.js ├── HitService.js ├── utils.js ├── Hit.js ├── Lexeme.js ├── TreeSet.js ├── QuickSortSet.js ├── Segmenter.js ├── CJKSegmenter.js ├── IKArbitrator.js ├── CN_QuantifierSegmenter.js ├── LetterSegmenter.js ├── Gruntfile.js ├── AnalyzeContext.js ├── CharacterUtil.js ├── AnalyzeService.js └── Dictionary.js ├── index.js ├── .travis.yml ├── test.js ├── package.json ├── README.md └── LICENSE /lib/dict/custom/mydict.dic: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lib/dict/custom/ext_stopword.dic: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /index.js: -------------------------------------------------------------------------------- 1 | module.exports = require('./lib/Segmenter'); -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: node_js 2 | node_js: 3 | - "0.10" 4 | - "0.11" 5 | - "0.12" 6 | - "iojs" 7 | -------------------------------------------------------------------------------- /lib/DictSegment.js: -------------------------------------------------------------------------------- 1 | 2 | var DictSegment = function(){ 3 | this.childrenMap = {}; 4 | this.storeSize = 0; 5 | this.nodeState = 0; 6 | }; 7 | 8 | module.exports = DictSegment; -------------------------------------------------------------------------------- /lib/dict/preposition.dic: -------------------------------------------------------------------------------- 1 | 不 2 | 也 3 | 了 4 | 仍 5 | 从 6 | 以 7 | 使 8 | 则 9 | 却 10 | 又 11 | 及 12 | 对 13 | 就 14 | 并 15 | 很 16 | 或 17 | 把 18 | 是 19 | 的 20 | 着 21 | 给 22 | 而 23 | 被 24 | 让 25 | 但 -------------------------------------------------------------------------------- /lib/config.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | // Analyzer 扩展配置 3 | ext_dict: [/*'./dict/custom/mydict.dic', './dict/custom/single_word_low_freq.dic'*/], 4 | 5 | // 用户可以在这里配置自己的扩展停止词字典 6 | ext_stopwords: [/*'./dict/custom/ext_stopword.dic'*/] 7 | }; -------------------------------------------------------------------------------- /lib/dict/suffix.dic: -------------------------------------------------------------------------------- 1 | 乡 2 | 井 3 | 亭 4 | 党 5 | 区 6 | 厅 7 | 县 8 | 园 9 | 塔 10 | 家 11 | 寺 12 | 局 13 | 巷 14 | 市 15 | 弄 16 | 所 17 | 斯基 18 | 楼 19 | 江 20 | 河 21 | 海 22 | 湖 23 | 省 24 | 维奇 25 | 署 26 | 苑 27 | 街 28 | 觀 29 | 观 30 | 诺夫 31 | 路 32 | 部 33 | 镇 34 | 阁 35 | 山 36 | 子 37 | 娃 -------------------------------------------------------------------------------- /lib/dict/stopword.dic: -------------------------------------------------------------------------------- 1 | a 2 | an 3 | and 4 | are 5 | as 6 | at 7 | be 8 | but 9 | by 10 | for 11 | if 12 | in 13 | into 14 | is 15 | it 16 | no 17 | not 18 | of 19 | on 20 | or 21 | such 22 | that 23 | the 24 | their 25 | then 26 | there 27 | these 28 | they 29 | this 30 | to 31 | was 32 | will 33 | with -------------------------------------------------------------------------------- 
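A minimal sketch of how the extension dictionaries above could be switched on. The paths mirror the commented-out defaults in lib/config.js, and passing them through opts.Config relies on Dictionary.initial(opts) falling back to opts.Config || config (see lib/Dictionary.js later in this listing). How the library resolves these relative paths is not shown here, and the segmentation entry point itself is omitted, so only construction is illustrated.

// index.js re-exports lib/Segmenter
var Segmenter = require('./index');

// Hypothetical configuration override; field names follow lib/config.js.
var segmenter = new Segmenter({
  Config: {
    ext_dict: ['./dict/custom/mydict.dic', './dict/custom/single_word_low_freq.dic'],
    ext_stopwords: ['./dict/custom/ext_stopword.dic']
  }
});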
/lib/LexemePath.js: -------------------------------------------------------------------------------- 1 | var util = require('util'); 2 | var QuickSortSet = require('./QuickSortSet'); 3 | 4 | /** 5 | * Lexeme链(路径) 6 | */ 7 | var LexemePath = function(){ 8 | QuickSortSet.call(this); 9 | this.pathBegin = -1; //起始位置 10 | this.pathEnd = -1; //结束 11 | this.payloadLength = 0; //词元链的有效字符长度 12 | }; 13 | 14 | util.inherits(LexemePath, QuickSortSet); 15 | 16 | module.exports = LexemePath; 17 | -------------------------------------------------------------------------------- /lib/consts.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | // LexemeType常量 3 | LexemeType: { 4 | TYPE_UNKNOWN: 0, // 未知 5 | TYPE_ENGLISH: 1, // 英文 6 | TYPE_ARABIC: 2, // 数字 7 | TYPE_LETTER: 3, // 英文数字混合 8 | TYPE_CNWORD: 4, // 中文词元 9 | TYPE_CNCHAR: 64, // 中文单字 10 | TYPE_OTHER_CJK: 8, // 日韩文字 11 | TYPE_CNUM: 16, // 中文数词 12 | TYPE_COUNT: 32, // 中文量词 13 | TYPE_CQUAN: 48 // 中文数量词 14 | }, 15 | CharType: { 16 | CHAR_USELESS: 0, 17 | CHAR_ARABIC: 1, 18 | CHAR_ENGLISH: 2, 19 | CHAR_CHINESE: 4, 20 | CHAR_OTHER_CJK: 8 21 | } 22 | }; -------------------------------------------------------------------------------- /lib/HitService.js: -------------------------------------------------------------------------------- 1 | 2 | var UNMATCH = 0, 3 | MATCH = 1, 4 | PREFIX = 2; 5 | 6 | module.exports = { 7 | 8 | isMatch: function(hit){ 9 | return hit.hitState & MATCH > 0; 10 | }, 11 | 12 | setMatch: function(hit){ 13 | hit.hitState = hit.hitState | MATCH; 14 | }, 15 | 16 | isPrefix: function(hit){ 17 | return (hit.hitState & PREFIX) > 0; 18 | }, 19 | 20 | setPrefix: function(hit){ 21 | hit.hitState = hit.hitState | PREFIX; 22 | }, 23 | 24 | isUnmatch: function(hit){ 25 | return hit.hitState === UNMATCH; 26 | }, 27 | 28 | setUnmatch: function(hit){ 29 | hit.hitState = UNMATCH; 30 | } 31 | }; -------------------------------------------------------------------------------- /lib/utils.js: -------------------------------------------------------------------------------- 1 | var utils = { 2 | arrayFind: function(arr, key, val){ 3 | var found = null; 4 | for(var i=0;i 0; 19 | }; 20 | 21 | Hit.prototype.setMatch = function(){ 22 | this.hitState = this.hitState | MATCH; 23 | }; 24 | 25 | Hit.prototype.isPrefix = function(){ 26 | return (this.hitState & PREFIX) > 0; 27 | }; 28 | 29 | Hit.prototype.setPrefix = function(){ 30 | this.hitState = this.hitState | PREFIX; 31 | }; 32 | 33 | Hit.prototype.isUnmatch = function(){ 34 | return this.hitState === UNMATCH; 35 | }; 36 | 37 | Hit.prototype.setUnmatch = function(){ 38 | this.hitState = UNMATCH; 39 | };*/ 40 | -------------------------------------------------------------------------------- /lib/Lexeme.js: -------------------------------------------------------------------------------- 1 | 2 | var Lexeme = function(offset, begin, len, lexemeType){ 3 | this.offset = offset; 4 | this.begin = begin; 5 | if (len < 0){ 6 | throw new Error("Lexeme len < 0"); 7 | } 8 | this.len = len; 9 | this.lexemeType = lexemeType; 10 | }; 11 | 12 | module.exports = Lexeme; 13 | 14 | /** 15 | * 获取词元的文本内容 16 | * @return String 17 | */ 18 | /*Lexeme.prototype.getLexemeText = function() { 19 | return this.lexemeText || ''; 20 | }; 21 | */ 22 | /*Lexeme.prototype.setLexemeText = function(lexemeText) { 23 | if (!lexemeText){ 24 | this.lexemeText = ""; 25 | this.len = 0; 26 | } 27 | else{ 28 | this.lexemeText = lexemeText; 29 | this.len = lexemeText.length; 30 | } 31 | };*/ 32 | 
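/*
 * Illustration only (not part of this module): how the bit flags used by
 * Hit / HitService above combine. UNMATCH (0), MATCH (1) and PREFIX (2) are
 * independent bits, so a dictionary hit can be a complete word and, at the
 * same time, the prefix of a longer word. Because `>` binds tighter than `&`
 * in JavaScript, the bitwise tests need the parentheses used in isPrefix:
 *
 *   var MATCH = 1, PREFIX = 2;
 *   var hitState = 0;
 *   hitState |= PREFIX;            // prefix of a longer dictionary word
 *   (hitState & PREFIX) > 0;       // true  -- the intended check
 *   hitState & PREFIX > 0;         // 0     -- parsed as hitState & (PREFIX > 0)
 *
 * A Lexeme, by contrast, only records a span: new Lexeme(offset, begin, len,
 * type) covers the characters [offset + begin, offset + begin + len) of the
 * original text, as computed by LexemeService.getBeginPosition/getEndPosition
 * in lib/AnalyzeService.js.
 */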
33 | /** 34 | * 合并两个相邻的词元 35 | * @param l 36 | * @param lexemeType 37 | * @return boolean 词元是否成功合并 38 | */ 39 | /*Lexeme.prototype.append = function(l, lexemeType){ 40 | if (l && this.getEndPosition() === l.getBeginPosition()){ 41 | this.len += l.len; 42 | this.lexemeType = lexemeType; 43 | return true; 44 | } 45 | else { 46 | return false; 47 | } 48 | };*/ 49 | -------------------------------------------------------------------------------- /lib/dict/surname.dic: -------------------------------------------------------------------------------- 1 | 丁 2 | 万 3 | 万俟 4 | 上官 5 | 东方 6 | 乔 7 | 于 8 | 令狐 9 | 仲孙 10 | 任 11 | 何 12 | 余 13 | 候 14 | 傅 15 | 公冶 16 | 公孙 17 | 公羊 18 | 冯 19 | 刘 20 | 单 21 | 单于 22 | 卢 23 | 史 24 | 叶 25 | 司徒 26 | 司空 27 | 司马 28 | 吕 29 | 吴 30 | 周 31 | 唐 32 | 夏 33 | 夏侯 34 | 太叔 35 | 姚 36 | 姜 37 | 孔 38 | 孙 39 | 孟 40 | 宇文 41 | 宋 42 | 宗政 43 | 尉迟 44 | 尹 45 | 崔 46 | 常 47 | 康 48 | 廖 49 | 张 50 | 彭 51 | 徐 52 | 慕容 53 | 戴 54 | 文 55 | 方 56 | 易 57 | 曹 58 | 曾 59 | 朱 60 | 李 61 | 杜 62 | 杨 63 | 林 64 | 梁 65 | 欧阳 66 | 武 67 | 段 68 | 毛 69 | 江 70 | 汤 71 | 沈 72 | 淳于 73 | 潘 74 | 澹台 75 | 濮阳 76 | 熊 77 | 王 78 | 田 79 | 申屠 80 | 白 81 | 皇甫 82 | 石 83 | 秦 84 | 程 85 | 罗 86 | 肖 87 | 胡 88 | 苏 89 | 范 90 | 董 91 | 蒋 92 | 薛 93 | 袁 94 | 许 95 | 诸葛 96 | 谢 97 | 谭 98 | 贺 99 | 贾 100 | 赖 101 | 赫连 102 | 赵 103 | 轩辕 104 | 邓 105 | 邱 106 | 邵 107 | 邹 108 | 郑 109 | 郝 110 | 郭 111 | 金 112 | 钟 113 | 钟离 114 | 钱 115 | 长孙 116 | 闻人 117 | 闾丘 118 | 阎 119 | 陆 120 | 陈 121 | 雷 122 | 韩 123 | 顾 124 | 马 125 | 高 126 | 魏 127 | 鲜于 128 | 黄 129 | 黎 130 | 龙 131 | 龚 -------------------------------------------------------------------------------- /lib/TreeSet.js: -------------------------------------------------------------------------------- 1 | /* 2 | * 实现Java的TreeSet类 3 | 给Set集合中的元素进行元素compareTo指定方式的排序。 4 | 保证元素唯一性的方式:通过元素compareTo比较是否相同. 
5 | 底层数据结构是:二叉树。 6 | */ 7 | var AnalyzeService = require('./AnalyzeService'), 8 | LexemePathService = AnalyzeService.LexemePathService; 9 | 10 | var TreeSet = function() { 11 | this.arr = []; 12 | }; 13 | 14 | module.exports = TreeSet; 15 | 16 | TreeSet.prototype.add = function(lexemePath){ 17 | /* this.arr.push(elem); 18 | this.arr = this.arr.sort(function(x, y){ 19 | return x.compareTo(y); 20 | });*/ 21 | if (this.arr.length === 0){ 22 | this.arr.push(lexemePath); 23 | return null; 24 | } 25 | var headList = [], head, compRes; 26 | head = this.arr.shift(); 27 | compRes = LexemePathService.compare(lexemePath, head); 28 | if (compRes === 0){ // 与头部相同,不放入集合 29 | this.arr.unshift(head); 30 | return null; 31 | } 32 | else if (compRes < 0){ // 插入头部 33 | this.arr.unshift(head); 34 | this.arr.unshift(lexemePath); 35 | return null; 36 | } 37 | else{ //从头部往下插 38 | headList.push(head); 39 | var arr = this.add(lexemePath); 40 | if (!arr) { 41 | this.arr = headList.concat(this.arr); 42 | return null; 43 | } 44 | else{ 45 | headList.concat(arr); 46 | return headList; 47 | } 48 | } 49 | }; 50 | 51 | TreeSet.prototype.first = function(){ 52 | if (this.arr.length > 0){ 53 | return this.arr[0]; 54 | } 55 | else{ 56 | return null; 57 | } 58 | }; 59 | 60 | -------------------------------------------------------------------------------- /lib/QuickSortSet.js: -------------------------------------------------------------------------------- 1 | /** 2 | * IK分词器专用的Lexem快速排序集合 3 | */ 4 | 5 | 6 | var QuickSortSet = function(){ 7 | this.lexemeList = []; 8 | }; 9 | 10 | /** 11 | * 向链表集合添加词元 12 | * @param lexeme 13 | */ 14 | /*QuickSortSet.prototype.addLexeme = function(lexeme){ 15 | if (this.lexemeList.length === 0){ 16 | this.lexemeList.push(lexeme); 17 | return null; 18 | } 19 | var tailList = [], tail, compRes; 20 | tail = SortedSetService.pollLast(this); // 比 this.lexemeList.pop(); 快 21 | compRes = LexemeService.compare(tail, lexeme); 22 | if (compRes === 0){ // 词元与尾部词元相同,不放入集合 23 | this.lexemeList.push(tail); 24 | return null; 25 | } 26 | else if (compRes < 0){ // 词元接入链表尾部 27 | this.lexemeList.push(tail); 28 | this.lexemeList.push(lexeme); 29 | return null; 30 | } 31 | else{ //从尾部上逆 32 | tailList.unshift(tail); 33 | var arr = this.addLexeme(lexeme); 34 | if (!arr) { 35 | this.lexemeList.concat(tailList); 36 | return null; 37 | } 38 | else{ 39 | tailList = arr.concat(tailList); 40 | return tailList; 41 | } 42 | } 43 | };*/ 44 | 45 | /** 46 | * 返回链表头部元素 47 | * @return 48 | */ 49 | QuickSortSet.prototype.peekFirst = function(){ 50 | if (this.lexemeList.length > 0){ 51 | return this.lexemeList[0]; 52 | } 53 | return null; 54 | }; 55 | 56 | /** 57 | * 取出链表集合的第一个元素 58 | * @return Lexeme 59 | */ 60 | QuickSortSet.prototype.pollFirst = function(){ 61 | if (this.lexemeList.length > 0){ 62 | return this.lexemeList.shift(); 63 | } 64 | return null; 65 | }; 66 | 67 | /** 68 | * 返回链表尾部元素 69 | * @return 70 | */ 71 | QuickSortSet.prototype.peekLast = function(){ 72 | var idx = this.lexemeList.length - 1; 73 | if (idx >= 0){ 74 | return this.lexemeList[idx]; 75 | } 76 | return null; 77 | }; 78 | 79 | /** 80 | * 取出链表集合的最后一个元素 81 | * @return Lexeme 82 | */ 83 | QuickSortSet.prototype.pollLast = function(){ 84 | return this.lexemeList.pop(); 85 | }; 86 | 87 | module.exports = QuickSortSet; -------------------------------------------------------------------------------- /lib/Segmenter.js: -------------------------------------------------------------------------------- 1 | var Dictionary = require('./Dictionary'), 2 | AnalyzeContext 
= require('./AnalyzeContext'), 3 | IKArbitrator = require('./IKArbitrator'), 4 | LetterSegmenter = require('./LetterSegmenter'), 5 | CN_QuantifierSegmenter = require('./CN_QuantifierSegmenter'), 6 | CJKSegmenter = require('./CJKSegmenter'); 7 | 8 | var Segmenter = function(opts){ 9 | this.opts = opts || {}; 10 | 11 | this.init(); 12 | }; 13 | 14 | module.exports = Segmenter; 15 | 16 | Segmenter.prototype.init = function(){ 17 | //初始化词典单例 18 | Dictionary.initial(this.opts); 19 | //初始化分词上下文 20 | this.context = new AnalyzeContext(); 21 | //加载子分词器 22 | this.segmenters = this.loadSegmenters(); 23 | //加载歧义裁决器 24 | this.arbitrator = new IKArbitrator(); 25 | }; 26 | 27 | /** 28 | * 初始化词典,加载子分词器实现 29 | * @return List 30 | */ 31 | Segmenter.prototype.loadSegmenters = function(){ 32 | var segmenters = []; 33 | //处理字母的子分词器 34 | segmenters.push(new LetterSegmenter()); 35 | //处理中文数量词的子分词器 36 | segmenters.push(new CN_QuantifierSegmenter()); 37 | //处理中文词的子分词器 38 | segmenters.push(new CJKSegmenter()); 39 | 40 | return segmenters; 41 | }; 42 | 43 | /** 44 | * 重置分词器到初始状态 45 | * @param input 46 | */ 47 | Segmenter.prototype.reset = function(input) { 48 | this.input = input; 49 | this.context.reset(); 50 | this.context.fillBuffer(input); 51 | var segmenter; 52 | for (var i=0;i 0){ 30 | //处理词段队列 31 | var hit, tmpArray = []; 32 | for (var i=0;i 0){ 87 | c = lexemeStack.pop(); 88 | //回滚词元链 89 | this.backPath(crossPath, c/*, option*/); 90 | //从歧义词位置开始,递归,生成可选方案 91 | this.forwardPath(crossPath, c, option); 92 | pathOptions.add(option); 93 | } 94 | 95 | //返回集合中的最优方案 96 | return pathOptions.first(); 97 | }; 98 | 99 | /** 100 | * 向前遍历,添加词元,构造一个无歧义词元组合 101 | // * @param LexemePath path 102 | * @return 103 | */ 104 | IKArbitrator.prototype.forwardPath = function(crossPath, lexeme, option){ 105 | //发生冲突的Lexeme栈 106 | var conflictStack = []; 107 | var c = lexeme; 108 | //迭代遍历Lexeme链表 109 | while(c){ 110 | if (!LexemePathService.addNotCrossLexeme(option, c)){ 111 | //词元交叉,添加失败则加入lexemeStack栈 112 | conflictStack.push(c); 113 | } 114 | c = LexemePathService.getNextLexeme(crossPath, c); 115 | } 116 | return conflictStack; 117 | }; 118 | 119 | /** 120 | * 回滚词元链,直到它能够接受指定的词元 121 | // * @param lexeme 122 | * @param l 123 | */ 124 | IKArbitrator.prototype.backPath = function(crossPath, l/*, option*/){ 125 | while(LexemePathService.checkCross(crossPath, l)){ 126 | LexemePathService.removeTail(crossPath); 127 | } 128 | }; 129 | -------------------------------------------------------------------------------- /lib/CN_QuantifierSegmenter.js: -------------------------------------------------------------------------------- 1 | var Dictionary = require('./Dictionary'); 2 | 3 | var Lexeme = require('./Lexeme'), 4 | consts = require('./consts'), 5 | LexemeType = consts.LexemeType, 6 | CharType = consts.CharType, 7 | HitService = require('./HitService'), 8 | SortedSetService = require('./AnalyzeService').SortedSetService; 9 | 10 | var SEGMENTER_NAME = "QUAN_SEGMENTER", //子分词器标签 11 | Chn_Num = "0123456789〇一二两三四五六七八九十零壹贰叁肆伍陆柒捌玖拾百佰千仟万萬亿億拾佰仟萬亿億兆卅廿"; //中文数词 12 | 13 | /** 14 | * 15 | * 中文数量词子分词器 16 | */ 17 | var CN_QuantifierSegmenter = function() { 18 | // this.name = SEGMENTER_NAME; 19 | // this.ChnNumberChars = Chn_Num; 20 | 21 | /* 22 | * 词元的开始位置, 23 | * 同时作为子分词器状态标识 24 | * 当start > -1 时,标识当前的分词器正在处理字符 25 | */ 26 | this.nStart = -1; 27 | /* 28 | * 记录词元结束位置 29 | * end记录的是在词元中最后一个出现的合理的数词结束 30 | */ 31 | this.nEnd = -1; 32 | 33 | //待处理的量词hit队列 34 | this.countHits = []; 35 | }; 36 | 37 | module.exports = CN_QuantifierSegmenter; 38 | 39 | /** 40 | 
* 分词 41 | */ 42 | CN_QuantifierSegmenter.prototype.analyze = function(context) { 43 | //处理中文数词 44 | this.processCNumber(context); 45 | //处理中文量词 46 | this.processCount(context); 47 | 48 | //判断是否锁定缓冲区 49 | if (this.nStart === -1 && this.nEnd === -1 && this.countHits.length === 0){ 50 | //对缓冲区解锁 51 | context.unlockBuffer(SEGMENTER_NAME); 52 | } 53 | else{ 54 | context.lockBuffer(SEGMENTER_NAME); 55 | } 56 | }; 57 | 58 | /** 59 | * 重置子分词器状态 60 | */ 61 | CN_QuantifierSegmenter.prototype.reset = function() { 62 | this.nStart = -1; 63 | this.nEnd = -1; 64 | this.countHits = []; 65 | }; 66 | 67 | /** 68 | * 处理数词 69 | */ 70 | CN_QuantifierSegmenter.prototype.processCNumber = function(context){ 71 | var charType = context.getCurrentCharType(); 72 | if (this.nStart === -1 && this.nEnd === -1){//初始状态 73 | if ((CharType.CHAR_CHINESE === charType || CharType.CHAR_ARABIC === charType) && 74 | Chn_Num.indexOf(context.getCurrentChar()) >= 0){ 75 | //记录数词的起始、结束位置 76 | this.nStart = context.cursor; 77 | this.nEnd = context.cursor; 78 | } 79 | } 80 | else{//正在处理状态 81 | if ((CharType.CHAR_CHINESE === charType || CharType.CHAR_ARABIC === charType) && 82 | Chn_Num.indexOf(context.getCurrentChar()) >= 0){ 83 | //记录数词的结束位置 84 | this.nEnd = context.cursor; 85 | } 86 | else{ 87 | //输出数词 88 | this.outputNumLexeme(context); 89 | //重置头尾指针 90 | this.nStart = -1; 91 | this.nEnd = -1; 92 | } 93 | } 94 | 95 | //缓冲区已经用完,还有尚未输出的数词 96 | /* if (context.isBufferConsumed()){ 97 | if (this.nStart !== -1 && this.nEnd !== -1){ 98 | //输出数词 99 | this.outputNumLexeme(context); 100 | //重置头尾指针 101 | this.nStart = -1; 102 | this.nEnd = -1; 103 | } 104 | } */ 105 | }; 106 | 107 | /** 108 | * 处理中文量词 109 | * @param context 110 | */ 111 | CN_QuantifierSegmenter.prototype.processCount = function(context){ 112 | // 判断是否需要启动量词扫描 113 | if (!this.needCountScan(context)){ 114 | var l = context.orgLexemes.peekLast(); 115 | return; 116 | } 117 | 118 | if (CharType.CHAR_CHINESE === context.getCurrentCharType()){ 119 | //优先处理countHits中的hit 120 | var hit, tmpArray = []; 121 | for(var i=0;i 0){ 180 | //正在处理中文数词,或者正在处理量词 181 | return true; 182 | } 183 | else{ 184 | //找到一个相邻的数词 185 | if (context.orgLexemes.lexemeList.length > 0){ 186 | var l = context.orgLexemes.peekLast(); 187 | if (l && (LexemeType.TYPE_CNUM === l.lexemeType || LexemeType.TYPE_ARABIC === l.lexemeType)){ 188 | if (l.begin + l.len === context.cursor){ 189 | return true; 190 | } 191 | } 192 | } 193 | } 194 | return false; 195 | }; 196 | 197 | /** 198 | * 添加数词词元到结果集 199 | * @param context 200 | */ 201 | CN_QuantifierSegmenter.prototype.outputNumLexeme = function(context){ 202 | if (this.nStart > -1 && this.nEnd > -1){ 203 | //输出数词 204 | var newLexeme = new Lexeme(context.buffOffset, this.nStart, this.nEnd - this.nStart + 1, LexemeType.TYPE_CNUM); 205 | SortedSetService.addLexeme(context.orgLexemes, newLexeme); 206 | } 207 | }; 208 | -------------------------------------------------------------------------------- /lib/LetterSegmenter.js: -------------------------------------------------------------------------------- 1 | 2 | var Lexeme = require('./Lexeme'), 3 | consts = require('./consts'), 4 | LexemeType = consts.LexemeType, 5 | CharType = consts.CharType, 6 | SortedSetService = require('./AnalyzeService').SortedSetService; 7 | 8 | var SEGMENTER_NAME = "LETTER_SEGMENTER", //子分词器标签 9 | Letter_Connector = ['#', '&', '+', '-', '.', '@', '_'].sort(), //链接符号 10 | Num_Connector = [',', '.'].sort(); //数字符号 11 | 12 | /** 13 | * 14 | * 英文字符及阿拉伯数字子分词器 15 | */ 16 | var LetterSegmenter = function() { 
17 | // this.name = SEGMENTER_NAME; 18 | /* 19 | * 词元的开始位置, 20 | * 同时作为子分词器状态标识 21 | * 当start > -1 时,标识当前的分词器正在处理字符 22 | */ 23 | this.start = -1; 24 | /* 25 | * 记录词元结束位置 26 | * end记录的是在词元中最后一个出现的Letter但非Sign_Connector的字符的位置 27 | */ 28 | this.end = -1; 29 | 30 | /* 31 | * 字母起始位置 32 | */ 33 | this.englishStart = -1; 34 | 35 | /* 36 | * 字母结束位置 37 | */ 38 | this.englishEnd = -1; 39 | 40 | /* 41 | * 阿拉伯数字起始位置 42 | */ 43 | this.arabicStart = -1; 44 | 45 | /* 46 | * 阿拉伯数字结束位置 47 | */ 48 | this.arabicEnd = -1; 49 | }; 50 | 51 | module.exports = LetterSegmenter; 52 | 53 | /* (non-Javadoc) 54 | * @see org.wltea.analyzer.core.ISegmenter#analyze(org.wltea.analyzer.core.AnalyzeContext) 55 | */ 56 | LetterSegmenter.prototype.analyze = function(context) { 57 | var bufferLockFlag = false; 58 | //处理英文字母 59 | bufferLockFlag = this.processEnglishLetter(context) || bufferLockFlag; 60 | //处理阿拉伯字母 61 | bufferLockFlag = this.processArabicLetter(context) || bufferLockFlag; 62 | //处理混合字母(这个要放最后处理,可以通过QuickSortSet排除重复) 63 | bufferLockFlag = this.processMixLetter(context) || bufferLockFlag; 64 | 65 | //判断是否锁定缓冲区 66 | if (bufferLockFlag){ 67 | context.lockBuffer(SEGMENTER_NAME); 68 | } 69 | else{ 70 | //对缓冲区解锁 71 | context.unlockBuffer(SEGMENTER_NAME); 72 | } 73 | }; 74 | 75 | /* (non-Javadoc) 76 | * @see org.wltea.analyzer.core.ISegmenter#reset() 77 | */ 78 | LetterSegmenter.prototype.reset = function() { 79 | this.start = -1; 80 | this.end = -1; 81 | this.englishStart = -1; 82 | this.englishEnd = -1; 83 | this.arabicStart = -1; 84 | this.arabicEnd = -1; 85 | }; 86 | 87 | /** 88 | * 处理数字字母混合输出 89 | * 如:windos2000 | linliangyi2005@gmail.com 90 | // * @param input 91 | * @param context 92 | * @return 93 | */ 94 | LetterSegmenter.prototype.processMixLetter = function(context){ 95 | var needLock = false, charType = context.getCurrentCharType(); 96 | 97 | if (this.start === -1){//当前的分词器尚未开始处理字符 98 | if (CharType.CHAR_ARABIC === charType || 99 | CharType.CHAR_ENGLISH === charType){ 100 | //记录起始指针的位置,标明分词器进入处理状态 101 | this.start = context.cursor; 102 | this.end = this.start; 103 | } 104 | } 105 | else {//当前的分词器正在处理字符 106 | if (CharType.CHAR_ARABIC === charType || 107 | CharType.CHAR_ENGLISH === charType){ 108 | //记录下可能的结束位置 109 | this.end = context.cursor; 110 | } 111 | else if (CharType.CHAR_USELESS === charType && 112 | this.isLetterConnector(context.getCurrentChar())){ 113 | //记录下可能的结束位置 114 | this.end = context.cursor; 115 | } 116 | else{ 117 | //遇到非Letter字符,输出词元 118 | var newLexeme = new Lexeme(context.buffOffset, this.start, this.end - this.start + 1, LexemeType.TYPE_LETTER); 119 | SortedSetService.addLexeme(context.orgLexemes, newLexeme); 120 | this.start = -1; 121 | this.end = -1; 122 | } 123 | } 124 | 125 | //判断缓冲区是否已经读完 126 | /* if (context.isBufferConsumed()){ 127 | if (this.start !== -1 && this.end !== -1){ 128 | //缓冲以读完,输出词元 129 | var newLexeme = new Lexeme(context.buffOffset, this.start, this.end - this.start + 1, LexemeType.TYPE_LETTER); 130 | SortedSetService.addLexeme(context.orgLexemes, newLexeme); 131 | this.start = -1; 132 | this.end = -1; 133 | } 134 | }*/ 135 | 136 | //判断是否锁定缓冲区 137 | if (this.start === -1 && this.end === -1){ 138 | //对缓冲区解锁 139 | needLock = false; 140 | }else{ 141 | needLock = true; 142 | } 143 | return needLock; 144 | }; 145 | 146 | /** 147 | * 处理纯英文字母输出 148 | * @param context 149 | * @return 150 | */ 151 | LetterSegmenter.prototype.processEnglishLetter = function(context){ 152 | var needLock = false; 153 | 154 | if (this.englishStart === -1){//当前的分词器尚未开始处理英文字符 155 | if 
(CharType.CHAR_ENGLISH === context.getCurrentCharType()){ 156 | //记录起始指针的位置,标明分词器进入处理状态 157 | this.englishStart = context.cursor; 158 | this.englishEnd = this.englishStart; 159 | } 160 | } 161 | else {//当前的分词器正在处理英文字符 162 | if (CharType.CHAR_ENGLISH === context.getCurrentCharType()){ 163 | //记录当前指针位置为结束位置 164 | this.englishEnd = context.cursor; 165 | } 166 | else{ 167 | //遇到非English字符,输出词元 168 | var newLexeme = new Lexeme(context.buffOffset, this.englishStart, this.englishEnd - this.englishStart + 1, LexemeType.TYPE_ENGLISH); 169 | SortedSetService.addLexeme(context.orgLexemes, newLexeme); 170 | this.englishStart = -1; 171 | this.englishEnd= -1; 172 | } 173 | } 174 | 175 | //判断缓冲区是否已经读完 176 | /* if (context.isBufferConsumed()){ 177 | if (this.englishStart !== -1 && this.englishEnd !== -1){ 178 | //缓冲以读完,输出词元 179 | var newLexeme = new Lexeme(context.buffOffset, this.englishStart, this.englishEnd - this.englishStart + 1, LexemeType.TYPE_ENGLISH); 180 | SortedSetService.addLexeme(context.orgLexemes, newLexeme); 181 | 182 | this.englishStart = -1; 183 | this.englishEnd= -1; 184 | } 185 | } */ 186 | 187 | //判断是否锁定缓冲区 188 | if (this.englishStart === -1 && this.englishEnd === -1){ 189 | //对缓冲区解锁 190 | needLock = false; 191 | } 192 | else{ 193 | needLock = true; 194 | } 195 | return needLock; 196 | }; 197 | 198 | /** 199 | * 处理阿拉伯数字输出 200 | * @param context 201 | * @return 202 | */ 203 | LetterSegmenter.prototype.processArabicLetter = function(context){ 204 | var needLock = false; 205 | 206 | if (this.arabicStart === -1){//当前的分词器尚未开始处理数字字符 207 | if (CharType.CHAR_ARABIC === context.getCurrentCharType()){ 208 | //记录起始指针的位置,标明分词器进入处理状态 209 | this.arabicStart = context.cursor; 210 | this.arabicEnd = this.arabicStart; 211 | } 212 | } 213 | else {//当前的分词器正在处理数字字符 214 | if (CharType.CHAR_ARABIC === context.getCurrentCharType()){ 215 | //记录当前指针位置为结束位置 216 | this.arabicEnd = context.cursor; 217 | } 218 | else if (CharType.CHAR_USELESS === context.getCurrentCharType() && 219 | this.isNumConnector(context.getCurrentChar())){ 220 | //不输出数字,但不标记结束 221 | } 222 | else{ 223 | ////遇到非Arabic字符,输出词元 224 | var newLexeme = new Lexeme(context.buffOffset, this.arabicStart, this.arabicEnd - this.arabicStart + 1, LexemeType.TYPE_ARABIC); 225 | SortedSetService.addLexeme(context.orgLexemes, newLexeme); 226 | this.arabicStart = -1; 227 | this.arabicEnd = -1; 228 | } 229 | } 230 | 231 | //判断缓冲区是否已经读完 232 | /* if (context.isBufferConsumed()){ 233 | if (this.arabicStart !== -1 && this.arabicEnd !== -1){ 234 | //生成已切分的词元 235 | var newLexeme = new Lexeme(context.buffOffset, this.arabicStart, this.arabicEnd - this.arabicStart + 1, LexemeType.TYPE_ARABIC); 236 | SortedSetService.addLexeme(context.orgLexemes, newLexeme); 237 | this.arabicStart = -1; 238 | this.arabicEnd = -1; 239 | } 240 | }*/ 241 | 242 | //判断是否锁定缓冲区 243 | if (this.arabicStart === -1 && this.arabicEnd === -1){ 244 | //对缓冲区解锁 245 | needLock = false; 246 | } 247 | else{ 248 | needLock = true; 249 | } 250 | return needLock; 251 | }; 252 | 253 | /** 254 | * 判断是否是字母连接符号 255 | * @param input 256 | * @return 257 | */ 258 | LetterSegmenter.prototype.isLetterConnector = function(input){ 259 | var index = Letter_Connector.indexOf(input); 260 | return index >= 0; 261 | }; 262 | 263 | /** 264 | * 判断是否是数字连接符号 265 | * @param input 266 | * @return 267 | */ 268 | LetterSegmenter.prototype.isNumConnector = function(input){ 269 | var index = Num_Connector.indexOf(input); 270 | return index >= 0; 271 | }; 272 | 273 | 
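// Illustration only (not part of this module): a simplified, standalone
// version of the start/end scanning pattern used by processMixLetter above.
// start marks where a run of letters/digits begins, end advances while the
// run (including connector characters such as '@' or '.') continues, and a
// span is emitted once a non-letter character is met. This sketch ignores
// buffer locking, character regularization and the QuickSortSet
// de-duplication that the real segmenter relies on.
function sketchMixLetterSpans(text) {
  var connectors = ['#', '&', '+', '-', '.', '@', '_'];
  var spans = [], start = -1, end = -1;
  for (var i = 0; i < text.length; i++) {
    var ch = text[i];
    var isAlnum = /[0-9A-Za-z]/.test(ch);
    if (start === -1) {
      if (isAlnum) { start = i; end = i; }
    } else if (isAlnum || connectors.indexOf(ch) >= 0) {
      end = i;
    } else {
      spans.push({ begin: start, len: end - start + 1 });
      start = -1; end = -1;
    }
  }
  if (start !== -1) { spans.push({ begin: start, len: end - start + 1 }); }
  return spans;
}
// sketchMixLetterSpans('发邮件到 linliangyi2005@gmail.com 试试')
//   -> [ { begin: 5, len: 24 } ]   // the e-mail address as one LETTER-type span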
-------------------------------------------------------------------------------- /lib/Gruntfile.js: -------------------------------------------------------------------------------- 1 | module.exports = function(grunt) { 2 | 3 | // Project configuration. 4 | grunt.initConfig({ 5 | pkg: grunt.file.readJSON('package.json'), 6 | 7 | jshint: { 8 | // define the files to lint 9 | //files: ['**/*.js', '!node_modules/**/*'], 10 | files: [ 11 | './*.js', 12 | '!node_modules/**/*' 13 | ], 14 | // configure JSHint (documented at http://www.jshint.com/docs/) 15 | options: { 16 | force: true, 17 | reporter: 'checkstyle', 18 | reporterOutput: 'jshint.xml', 19 | // more options here if you want to override JSHint defaults 20 | globals: { 21 | jQuery: true, 22 | console: false, 23 | node: true, 24 | module: true, 25 | define: true, 26 | require: true, 27 | createjs: true, 28 | 29 | exports: true, 30 | $: true, 31 | window: true, 32 | pomelo: true, 33 | setImmediate: true, 34 | __filename: true, 35 | __dirname: true, 36 | setTimeout: true, 37 | setInterval: true, 38 | clearInterval: true, 39 | process: true, 40 | Buffer: true, 41 | actor: true, // pomelo-robot 42 | }, 43 | asi: false, 44 | // 如果是真,JSHint会无视没有加分号的行尾, 自动补全分号一直是Javascript很有争议的一个语法特性。默认,JSHint会要求你在每个语句后面加上分号,但是如果你认为自己理解了asi(automatic semicolon insertion),你可以抛弃JSHint对分号的检查。 45 | 46 | bitwise: false, 47 | //如果为真,JSHint会禁用位运算符 Javascript允许位运算,但是他却没有整型,位运算符要把参与运算的数字从浮点数变为整数,并在运算后再转换回来。这样他们的效率就不如在别的语言中那么高。 48 | 49 | boss: false, 50 | //很霸气的选项,如果为真,那么JSHint会允许在if,for,while里面编写赋值语句。 一般来说,我们会在循环、判断等语句中加入值的比较来做语句的运行条件,有时候会把==错写成赋值的=,通常,JSHint会把这个认定为一个错误,但是开启这个选项的化,JSHint就不会检查判断条件中的赋值 ,你是boss,你说的算:)。 51 | 52 | camelcase: false, 53 | // 强迫驼峰风格 54 | 55 | curly: true, 56 | //如果为真,JSHint会要求你在使用if和while等结构语句时加上{}来明确代码块。 Javascript允许在if等结构语句体只有一句的情况下不加括号。不过这样做可能会让你的代码读起来有些晦涩。 57 | 58 | debug: false, 59 | //如果为真,JSHint会允许代码中出现debugger的语句。不过建议你最好在检测代码前去掉debug的语句。 60 | 61 | eqeqeq: true, 62 | //如果为真,JSHint会看你在代码中是否都用了===或者是!==,而不是使用==和!=。 我们建议你在比较0,''(空字符),undefined,null,false和true的时候使用===和!===。 63 | 64 | eqnull: false, 65 | //如果为真,JSHint会允许使用"== null"作比较。 == null 通常用来判断一个变量是undefined或者是null(当时用==,null和undefined都会转化为false)。 66 | 67 | evil: false, 68 | //如果为真,JSHint会允许使用eval eval提供了访问Javascript编译器的途径,这有时很有用,但是同时也对你的代码形成了注入攻击的危险,并且会对debug造成一些困难。 记住,Function构造函数也是另一个‘eval’,另外,当传入的参数是字符串的时候,setTimeout和setInterval也会类似于eval。 69 | 70 | forin: false, 71 | //如果为真,那么,JSHint允许在for in 循环里面不出现hasOwnProperty, for in循环一般用来遍历一个对象的属性,这其中也包括他继承自原型链的属性,而hasOwnProperty可以来判断一个属性是否是对象本身的属性而不是继承得来的。 72 | 73 | immed: true, 74 | //如果为真,JSHint要求匿名函数的调用如下: 75 | 76 | //(function(){ // }()); 77 | //而不是 78 | 79 | //(function(){ //bla bla })(); 80 | 81 | // indent: 2, 82 | 83 | latedef: true, 84 | // This option prohibits the use of a variable before it was defined. JavaScript has function scope only and, in addition to that, all variables are always moved—or hoisted— to the top of the function. This behavior can lead to some very nasty bugs and that's why it is safer to always use variable only after they have been explicitly defined. 85 | // Setting this option to "nofunc" will allow function declarations to be ignored. 
86 | 87 | laxbreak: false, 88 | //如果为真,JSHint则不会检查换行。 Javascript会通过自动补充分号来修正一些错误,因此这个选项可以检查一些潜在的问题。 89 | 90 | maxerr: 10, 91 | ///设定错误的阈值,超过这个阈值jshint不再向下检查,提示错误太多。 92 | 93 | newcap: true, 94 | //如果为真,JSHint会要求每一个构造函数名都要大写字母开头。 构造器是一种使用new运算符来创建对象的一种函数,new操作符会创建新的对象,并建立这个对象自己的this,一个构造函数如果不用new运算符来运行,那么他的this会指向全局对象而导致一些问题的发生。 95 | 96 | noarg: true, 97 | //如果为真,JSHint会禁止arguments.caller和arguments.callee的使用 arguments对象是一个类数组的对象,它具有一个索引值。arguments.callee指向当前执行的函数(这个在ES5的严格模式中被禁用了),而arguments.caller指向调用当前函数的函数(如果有的话),并且,他并不是在所有的Javascript实现里面都有。 98 | 99 | noempty: true, 100 | //如果为真,JSHint会禁止出现空的代码块(没有语句的代码块)。 如果为真,JSHint会禁用构造器,以避免一些问题。 在JSLint中会主动禁用构造器的方式以避免一些潜在问题,但其实很多构造器的使用并非有害,例如如下的调用 101 | 102 | //new JsUIWindow(); //注意这个调用是没有把构造器的结果赋值给变量的 103 | //因此,我们需要使用构造器的时候可以禁用这个选项。 104 | 105 | nomen: false, 106 | //如果为真,JSHint会禁用下划线的变量名。 很多人使用_name的方式来命名他们的变量,以说明这是一个私有变量,但实际上,并不是,下划线只是做了一个标识。 如果要使用私有变量,可以使用闭包来实现。 107 | 108 | onevar: true, 109 | //如果为真,JSHint期望函数只被var的形式声明一遍。 110 | 111 | passfail: false, 112 | //如果为真,JSHint会在发现首个错误后停止检查。 113 | 114 | plusplus: false, 115 | //如果为真,JSHint会禁用自增运算和自减运算 ++和--可能会带来一些代码的阅读上的困惑。 116 | 117 | regexp: true, 118 | //如果为真,JSHint会不允许使用.和[^...]的正则, 因为这样的正则往往会匹配到你不期望的内容,并可能会应用造成一些危害。 119 | 120 | undef: true, 121 | //如果为真,JSHint会要求所有的非全局变量,在使用前都被声明。 如果你不在一个本地作用域内使用var的方式来声明变量,Javascript会把它放到全局作用域下面。这样会很容易引起错误。 122 | 123 | unused: true, 124 | 125 | sub: true, 126 | //如果为真,JSHint会允许各种形式的下标来访问对象。 通常,JSHint希望你只是用点运算符来读取对象的属性(除非这个属性名是一个保留字),如果你不希望这样可以关闭这个选项。 127 | 128 | strict: false, 129 | //如果为真,JSHint会要求你使用use strict;语法。 Strict 模式是ES5里面的一个新特性,他允许你把一个程序或者函数放在一个“严格”的作用域中。可见Resig写的一篇关于严格模式的blog 严格模式做了几件事情: 130 | 131 | //1、他可以捕获一些错误和异常 132 | 133 | //2、当我们进行一下“不安全”的操作时,他会抛异常,例如访问全局变量。 134 | 135 | //3、他会禁止你使用一些奇淫技巧,或者不良的代码编写。 136 | 137 | white: false, 138 | //如果为true,JSHint会依据严格的空白规范检查你的代码。 139 | 140 | funcscope: true, 141 | laxcomma: true, 142 | loopfunc: true, // 警告在循环内部定义函数 143 | multistr: true, // 警告多行字符串 144 | notypeof: true, // 警告不正确的错误类型,如 'function' 写成 'functin' 145 | shadow: true, // 警告多重定义变量 146 | smarttabs: true, 147 | validthis: true // 关于 this 的警告 148 | } 149 | 150 | } 151 | 152 | }); 153 | 154 | // Load the plugin that provides the "uglify" task. 155 | 156 | grunt.loadNpmTasks('grunt-contrib-jshint'); 157 | 158 | // Default task(s). 159 | grunt.registerTask('default', ['jshint']); 160 | 161 | }; 162 | 163 | 164 | //module.exports = function(grunt) { 165 | // 166 | // grunt.initConfig({ 167 | // pkg: grunt.file.readJSON('package.json'), 168 | // concat: { 169 | // options: { 170 | // separator: ';' 171 | // }, 172 | // dist: { 173 | // src: ['src/**/*.js'], 174 | // dest: 'dist/<%= pkg.name %>.js' 175 | // } 176 | // }, 177 | // uglify: { 178 | // options: { 179 | // banner: '/*! 
<%= pkg.name %> <%= grunt.template.today("dd-mm-yyyy") %> */\n' 180 | // }, 181 | // dist: { 182 | // files: { 183 | // 'dist/<%= pkg.name %>.min.js': ['<%= concat.dist.dest %>'] 184 | // } 185 | // } 186 | // }, 187 | // qunit: { 188 | // files: ['test/**/*.html'] 189 | // }, 190 | // jshint: { 191 | // files: ['gruntfile.js', 'src/**/*.js', 'test/**/*.js'], 192 | // options: { 193 | // // options here to override JSHint defaults 194 | // globals: { 195 | // jQuery: true, 196 | // console: true, 197 | // module: true, 198 | // document: true 199 | // } 200 | // } 201 | // }, 202 | // watch: { 203 | // files: ['<%= jshint.files %>'], 204 | // tasks: ['jshint', 'qunit'] 205 | // } 206 | // }); 207 | // 208 | // grunt.loadNpmTasks('grunt-contrib-uglify'); 209 | // grunt.loadNpmTasks('grunt-contrib-jshint'); 210 | // grunt.loadNpmTasks('grunt-contrib-qunit'); 211 | // grunt.loadNpmTasks('grunt-contrib-watch'); 212 | // grunt.loadNpmTasks('grunt-contrib-concat'); 213 | // 214 | // grunt.registerTask('test', ['jshint', 'qunit']); 215 | // 216 | // grunt.registerTask('default', ['jshint', 'qunit', 'concat', 'uglify']); 217 | // 218 | //}; 219 | -------------------------------------------------------------------------------- /lib/AnalyzeContext.js: -------------------------------------------------------------------------------- 1 | var Dictionary = require('./Dictionary'), 2 | QuickSortSet = require('./QuickSortSet'), 3 | AnalyzeService = require('./AnalyzeService'), 4 | SortedSetService = AnalyzeService.SortedSetService, 5 | LexemeService = AnalyzeService.LexemeService; 6 | var Lexeme = require('./Lexeme'), 7 | consts = require('./consts'), 8 | utils = require('./utils'), 9 | CharType = consts.CharType, 10 | LexemeType = consts.LexemeType; 11 | var CharacterUtil = require('./CharacterUtil'); 12 | 13 | var BUFF_EXHAUST_CRITICAL = 100; 14 | 15 | var AnalyzeContext = function(opts){ 16 | this.segmentBuff = ''; 17 | this.charTypes = []; 18 | this.buffLocker = {}; 19 | this.orgLexemes = new QuickSortSet(); 20 | this.pathMap = {}; 21 | this.results = []; 22 | this.totalReadCount = 0; // 总共读取了多少字符 23 | }; 24 | 25 | module.exports = AnalyzeContext; 26 | 27 | AnalyzeContext.prototype.getCurrentChar = function(){ 28 | return this.segmentBuff[this.cursor]; 29 | }; 30 | 31 | AnalyzeContext.prototype.getCurrentCharType = function(){ 32 | return this.charTypes[this.cursor]; 33 | }; 34 | 35 | /** 36 | * 根据context的上下文情况,填充segmentBuff 37 | * @param reader 38 | * @return 返回待分析的(有效的)字串长度 39 | */ 40 | AnalyzeContext.prototype.fillBuffer = function(txt){ 41 | this.segmentBuff = txt; 42 | this.cursor = 0; 43 | this.available = this.segmentBuff.length; 44 | return this.available; 45 | }; 46 | 47 | /** 48 | * 初始化buff指针,处理第一个字符 49 | */ 50 | AnalyzeContext.prototype.initCursor = function(){ 51 | this.cursor = 0; 52 | //this.segmentBuff[this.cursor] = CharacterUtil.regularize(this.segmentBuff[this.cursor]); 53 | //this.charTypes[this.cursor] = CharacterUtil.identifyCharType(this.segmentBuff[this.cursor]); 54 | var chr = CharacterUtil.regularize(this.segmentBuff[this.cursor]); 55 | this.charTypes[this.cursor] = CharacterUtil.identifyCharType(chr); 56 | 57 | }; 58 | 59 | /** 60 | * 指针+1 61 | * 成功返回 true; 指针已经到了buff尾部,不能前进,返回false 62 | * 并处理当前字符 63 | */ 64 | AnalyzeContext.prototype.moveCursor = function(){ 65 | if (this.cursor < this.available - 1){ 66 | this.cursor += 1; 67 | 68 | //this.segmentBuff[this.cursor] = CharacterUtil.regularize(this.segmentBuff[this.cursor]); 69 | //this.charTypes[this.cursor] = 
CharacterUtil.identifyCharType(this.segmentBuff[this.cursor]); 70 | var chr = CharacterUtil.regularize(this.segmentBuff[this.cursor]); 71 | this.charTypes[this.cursor] = CharacterUtil.identifyCharType(chr); 72 | return true; 73 | } 74 | else{ 75 | return false; 76 | } 77 | }; 78 | 79 | /** 80 | * 设置当前segmentBuff为锁定状态 81 | * 加入占用segmentBuff的子分词器名称,表示占用segmentBuff 82 | * @param segmenterName 83 | */ 84 | AnalyzeContext.prototype.lockBuffer = function(segmenterName){ 85 | this.buffLocker[segmenterName] = 1; 86 | }; 87 | 88 | /** 89 | * 移除指定的子分词器名,释放对segmentBuff的占用 90 | * @param segmenterName 91 | */ 92 | AnalyzeContext.prototype.unlockBuffer = function(segmenterName){ 93 | this.buffLocker[segmenterName] = 0; 94 | }; 95 | 96 | /** 97 | * 只要buffLocker中存在segmenterName 98 | * 则buffer被锁定 99 | * @return boolean 缓冲去是否被锁定 100 | */ 101 | AnalyzeContext.prototype.isBufferLocked = function(){ 102 | for(var k in this.buffLocker){ 103 | if (this.buffLocker[k]) { 104 | return true; 105 | } 106 | } 107 | return false; 108 | }; 109 | 110 | /** 111 | * 判断当前segmentBuff是否已经用完 112 | * 当前执针cursor移至segmentBuff末端this.available - 1 113 | * @return 114 | */ 115 | /*AnalyzeContext.prototype.isBufferConsumed = function(){ 116 | return this.cursor === this.available - 1; 117 | };*/ 118 | 119 | /** 120 | * 判断segmentBuff是否需要读取新数据 121 | * 122 | * 满足以下条件时, 123 | * 1.available == BUFF_SIZE 表示buffer满载 124 | * 2.buffIndex < available - 1 && buffIndex > available - BUFF_EXHAUST_CRITICAL表示当前指针处于临界区内 125 | * 3.!context.isBufferLocked()表示没有segmenter在占用buffer 126 | * 要中断当前循环(buffer要进行移位,并再读取数据的操作) 127 | * @return 128 | */ 129 | /*AnalyzeContext.prototype.needRefillBuffer = function(){ 130 | return (this.cursor < this.available - 1 && 131 | this.cursor > this.available - BUFF_EXHAUST_CRITICAL && 132 | !this.isBufferLocked()); 133 | };*/ 134 | 135 | /** 136 | * 累计当前的segmentBuff相对于reader起始位置的位移 137 | */ 138 | AnalyzeContext.prototype.markBufferOffset = function(){ 139 | this.buffOffset += this.cursor; 140 | }; 141 | 142 | /** 143 | * 向分词结果集添加词元 144 | * @param lexeme 145 | */ 146 | /*AnalyzeContext.prototype.addLexeme = function(lexeme){ 147 | this.orgLexemes.addLexeme(lexeme); 148 | };*/ 149 | 150 | /** 151 | * 添加分词结果路径 152 | * 路径起始位置 ---> 路径 映射表 153 | * @param path 154 | */ 155 | AnalyzeContext.prototype.addLexemePath = function(crossPath){ 156 | if (crossPath){ 157 | this.pathMap[crossPath.pathBegin] = crossPath; 158 | } 159 | }; 160 | 161 | /** 162 | * 推送分词结果到结果集合 163 | * 1.从buff头部遍历到this.cursor已处理位置 164 | * 2.将map中存在的分词结果推入results 165 | * 3.将map中不存在的CJDK字符以单字方式推入results 166 | */ 167 | AnalyzeContext.prototype.outputToResult = function(){ 168 | var lexeme; 169 | for(var index = 0;index <= this.cursor;){ 170 | 171 | //跳过非CJK字符 172 | if (CharType.CHAR_USELESS === this.charTypes[index]){ 173 | index ++; 174 | continue; 175 | } 176 | //从pathMap找出对应index位置的LexemePath 177 | var crosspath = this.pathMap[index]; 178 | if (crosspath){ 179 | //输出LexemePath中的lexeme到results集合 180 | lexeme = crosspath.pollFirst(); 181 | while (lexeme){ 182 | this.results.push(lexeme); 183 | //将index移至lexeme后 184 | index = lexeme.begin + lexeme.len; 185 | 186 | lexeme = crosspath.pollFirst(); 187 | 188 | if (lexeme){ 189 | //输出path内部,词元间遗漏的单字 190 | for(;index < lexeme.begin; index++){ 191 | this.outputSingleCJK(index); 192 | } 193 | } 194 | } 195 | } 196 | else{//pathMap中找不到index对应的LexemePath 197 | //单字输出 198 | this.outputSingleCJK(index); 199 | index++; 200 | } 201 | } 202 | //清空当前的Map 203 | this.pathMap = {}; 204 | 205 | var result = []; 206 | lexeme = 
this.results.shift(); 207 | while(lexeme){ 208 | this.compound(lexeme); 209 | result.push(this.segmentBuff.substr(lexeme.begin, lexeme.len)); 210 | 211 | lexeme = this.results.shift(); 212 | } 213 | /* this.results.forEach(function(v){ 214 | result.push(segmentBuff.substr(v.begin, v.len)); 215 | });*/ 216 | 217 | return result.join(' '); 218 | }; 219 | 220 | /** 221 | * 对CJK字符进行单字输出 222 | * @param index 223 | */ 224 | AnalyzeContext.prototype.outputSingleCJK = function(index){ 225 | if (CharType.CHAR_CHINESE === this.charTypes[index]){ 226 | var singleCharLexeme = new Lexeme(this.buffOffset, index, 1, LexemeType.TYPE_CNCHAR); 227 | this.results.push(singleCharLexeme); 228 | } 229 | else if(CharType.CHAR_OTHER_CJK === this.charTypes[index]){ 230 | var singleCharLexeme = new Lexeme(this.buffOffset , index , 1 , LexemeType.TYPE_OTHER_CJK); 231 | this.results.push(singleCharLexeme); 232 | } 233 | }; 234 | 235 | /** 236 | * 重置分词上下文状态 237 | */ 238 | AnalyzeContext.prototype.reset = function(){ 239 | this.buffLocker = {}; 240 | this.orgLexemes = new QuickSortSet(); 241 | this.available =0; 242 | this.buffOffset = 0; 243 | this.charTypes = []; 244 | this.cursor = 0; 245 | this.results = []; 246 | this.segmentBuff = []; 247 | this.pathMap = {}; 248 | }; 249 | 250 | /** 251 | * 组合词元 252 | */ 253 | AnalyzeContext.prototype.compound = function(lexeme){ 254 | //数量词合并处理 255 | var nextLexeme, appendOk = true; 256 | while (appendOk && this.results.length > 0){ 257 | appendOk = false; 258 | if (LexemeType.TYPE_ARABIC === lexeme.lexemeType){ 259 | nextLexeme = this.results[0]; 260 | if (LexemeType.TYPE_CNUM === nextLexeme.lexemeType){ 261 | //合并英文数词+中文数词 262 | appendOk = LexemeService.append(lexeme, nextLexeme, LexemeType.TYPE_CNUM); 263 | } 264 | else if (LexemeType.TYPE_COUNT === nextLexeme.lexemeType){ 265 | //合并英文数词+中文量词 266 | appendOk = LexemeService.append(lexeme, nextLexeme, LexemeType.TYPE_CQUAN); 267 | } 268 | if (appendOk){ 269 | //弹出 270 | this.results.shift(); 271 | } 272 | } 273 | 274 | //可能存在第二轮合并 275 | if (LexemeType.TYPE_CNUM === lexeme.lexemeType && this.results.length > 0){ 276 | nextLexeme = this.results[0]; 277 | appendOk = false; 278 | if (LexemeType.TYPE_COUNT == nextLexeme.lexemeType){ 279 | //合并中文数词+中文量词 280 | appendOk = LexemeService.append(lexeme, nextLexeme, LexemeType.TYPE_CQUAN); 281 | } 282 | if (appendOk){ 283 | //弹出 284 | this.results.shift(); 285 | } 286 | } 287 | } 288 | }; 289 | -------------------------------------------------------------------------------- /lib/CharacterUtil.js: -------------------------------------------------------------------------------- 1 | var CharType = require('./consts').CharType; 2 | 3 | /* 4 | 0000..007F; Basic Latin 5 | 0080..00FF; Latin-1 Supplement 6 | 0100..017F; Latin Extended-A 7 | 0180..024F; Latin Extended-B 8 | 0250..02AF; IPA Extensions 9 | 02B0..02FF; Spacing Modifier Letters 10 | 0300..036F; Combining Diacritical Marks 11 | 0370..03FF; Greek and Coptic 12 | 0400..04FF; Cyrillic 13 | 0500..052F; Cyrillic Supplement 14 | 0530..058F; Armenian 15 | 0590..05FF; Hebrew 16 | 0600..06FF; Arabic 17 | 0700..074F; Syriac 18 | 0750..077F; Arabic Supplement 19 | 0780..07BF; Thaana 20 | 07C0..07FF; NKo 21 | 0800..083F; Samaritan 22 | 0900..097F; Devanagari 23 | 0980..09FF; Bengali 24 | 0A00..0A7F; Gurmukhi 25 | 0A80..0AFF; Gujarati 26 | 0B00..0B7F; Oriya 27 | 0B80..0BFF; Tamil 28 | 0C00..0C7F; Telugu 29 | 0C80..0CFF; Kannada 30 | 0D00..0D7F; Malayalam 31 | 0D80..0DFF; Sinhala 32 | 0E00..0E7F; Thai 33 | 0E80..0EFF; Lao 34 | 0F00..0FFF; Tibetan 35 
| 1000..109F; Myanmar 36 | 10A0..10FF; Georgian 37 | 1100..11FF; Hangul Jamo 38 | 1200..137F; Ethiopic 39 | 1380..139F; Ethiopic Supplement 40 | 13A0..13FF; Cherokee 41 | 1400..167F; Unified Canadian Aboriginal Syllabics 42 | 1680..169F; Ogham 43 | 16A0..16FF; Runic 44 | 1700..171F; Tagalog 45 | 1720..173F; Hanunoo 46 | 1740..175F; Buhid 47 | 1760..177F; Tagbanwa 48 | 1780..17FF; Khmer 49 | 1800..18AF; Mongolian 50 | 18B0..18FF; Unified Canadian Aboriginal Syllabics Extended 51 | 1900..194F; Limbu 52 | 1950..197F; Tai Le 53 | 1980..19DF; New Tai Lue 54 | 19E0..19FF; Khmer Symbols 55 | 1A00..1A1F; Buginese 56 | 1A20..1AAF; Tai Tham 57 | 1B00..1B7F; Balinese 58 | 1B80..1BBF; Sundanese 59 | 1C00..1C4F; Lepcha 60 | 1C50..1C7F; Ol Chiki 61 | 1CD0..1CFF; Vedic Extensions 62 | 1D00..1D7F; Phonetic Extensions 63 | 1D80..1DBF; Phonetic Extensions Supplement 64 | 1DC0..1DFF; Combining Diacritical Marks Supplement 65 | 1E00..1EFF; Latin Extended Additional 66 | 1F00..1FFF; Greek Extended 67 | 2000..206F; General Punctuation 68 | 2070..209F; Superscripts and Subscripts 69 | 20A0..20CF; Currency Symbols 70 | 20D0..20FF; Combining Diacritical Marks for Symbols 71 | 2100..214F; Letterlike Symbols 72 | 2150..218F; Number Forms 73 | 2190..21FF; Arrows 74 | 2200..22FF; Mathematical Operators 75 | 2300..23FF; Miscellaneous Technical 76 | 2400..243F; Control Pictures 77 | 2440..245F; Optical Character Recognition 78 | 2460..24FF; Enclosed Alphanumerics 79 | 2500..257F; Box Drawing 80 | 2580..259F; Block Elements 81 | 25A0..25FF; Geometric Shapes 82 | 2600..26FF; Miscellaneous Symbols 83 | 2700..27BF; Dingbats 84 | 27C0..27EF; Miscellaneous Mathematical Symbols-A 85 | 27F0..27FF; Supplemental Arrows-A 86 | 2800..28FF; Braille Patterns 87 | 2900..297F; Supplemental Arrows-B 88 | 2980..29FF; Miscellaneous Mathematical Symbols-B 89 | 2A00..2AFF; Supplemental Mathematical Operators 90 | 2B00..2BFF; Miscellaneous Symbols and Arrows 91 | 2C00..2C5F; Glagolitic 92 | 2C60..2C7F; Latin Extended-C 93 | 2C80..2CFF; Coptic 94 | 2D00..2D2F; Georgian Supplement 95 | 2D30..2D7F; Tifinagh 96 | 2D80..2DDF; Ethiopic Extended 97 | 2DE0..2DFF; Cyrillic Extended-A 98 | 2E00..2E7F; Supplemental Punctuation 99 | 2E80..2EFF; CJK Radicals Supplement 100 | 2F00..2FDF; Kangxi Radicals 101 | 2FF0..2FFF; Ideographic Description Characters 102 | 3000..303F; CJK Symbols and Punctuation 103 | 3040..309F; Hiragana 104 | 30A0..30FF; Katakana 105 | 3100..312F; Bopomofo 106 | 3130..318F; Hangul Compatibility Jamo 107 | 3190..319F; Kanbun 108 | 31A0..31BF; Bopomofo Extended 109 | 31C0..31EF; CJK Strokes 110 | 31F0..31FF; Katakana Phonetic Extensions 111 | 3200..32FF; Enclosed CJK Letters and Months 112 | 3300..33FF; CJK Compatibility 113 | 3400..4DBF; CJK Unified Ideographs Extension A 114 | 4DC0..4DFF; Yijing Hexagram Symbols 115 | 4E00..9FFF; CJK Unified Ideographs 116 | A000..A48F; Yi Syllables 117 | A490..A4CF; Yi Radicals 118 | A4D0..A4FF; Lisu 119 | A500..A63F; Vai 120 | A640..A69F; Cyrillic Extended-B 121 | A6A0..A6FF; Bamum 122 | A700..A71F; Modifier Tone Letters 123 | A720..A7FF; Latin Extended-D 124 | A800..A82F; Syloti Nagri 125 | A830..A83F; Common Indic Number Forms 126 | A840..A87F; Phags-pa 127 | A880..A8DF; Saurashtra 128 | A8E0..A8FF; Devanagari Extended 129 | A900..A92F; Kayah Li 130 | A930..A95F; Rejang 131 | A960..A97F; Hangul Jamo Extended-A 132 | A980..A9DF; Javanese 133 | AA00..AA5F; Cham 134 | AA60..AA7F; Myanmar Extended-A 135 | AA80..AADF; Tai Viet 136 | ABC0..ABFF; Meetei Mayek 137 | AC00..D7AF; Hangul Syllables 138 | 
D7B0..D7FF; Hangul Jamo Extended-B 139 | D800..DB7F; High Surrogates 140 | DB80..DBFF; High Private Use Surrogates 141 | DC00..DFFF; Low Surrogates 142 | E000..F8FF; Private Use Area 143 | F900..FAFF; CJK Compatibility Ideographs 144 | FB00..FB4F; Alphabetic Presentation Forms 145 | FB50..FDFF; Arabic Presentation Forms-A 146 | FE00..FE0F; Variation Selectors 147 | FE10..FE1F; Vertical Forms 148 | FE20..FE2F; Combining Half Marks 149 | FE30..FE4F; CJK Compatibility Forms 150 | FE50..FE6F; Small Form Variants 151 | FE70..FEFF; Arabic Presentation Forms-B 152 | FF00..FFEF; Halfwidth and Fullwidth Forms 153 | FFF0..FFFF; Specials 154 | 10000..1007F; Linear B Syllabary 155 | 10080..100FF; Linear B Ideograms 156 | 10100..1013F; Aegean Numbers 157 | 10140..1018F; Ancient Greek Numbers 158 | 10190..101CF; Ancient Symbols 159 | 101D0..101FF; Phaistos Disc 160 | 10280..1029F; Lycian 161 | 102A0..102DF; Carian 162 | 10300..1032F; Old Italic 163 | 10330..1034F; Gothic 164 | 10380..1039F; Ugaritic 165 | 103A0..103DF; Old Persian 166 | 10400..1044F; Deseret 167 | 10450..1047F; Shavian 168 | 10480..104AF; Osmanya 169 | 10800..1083F; Cypriot Syllabary 170 | 10840..1085F; Imperial Aramaic 171 | 10900..1091F; Phoenician 172 | 10920..1093F; Lydian 173 | 10A00..10A5F; Kharoshthi 174 | 10A60..10A7F; Old South Arabian 175 | 10B00..10B3F; Avestan 176 | 10B40..10B5F; Inscriptional Parthian 177 | 10B60..10B7F; Inscriptional Pahlavi 178 | 10C00..10C4F; Old Turkic 179 | 10E60..10E7F; Rumi Numeral Symbols 180 | 11080..110CF; Kaithi 181 | 12000..123FF; Cuneiform 182 | 12400..1247F; Cuneiform Numbers and Punctuation 183 | 13000..1342F; Egyptian Hieroglyphs 184 | 1D000..1D0FF; Byzantine Musical Symbols 185 | 1D100..1D1FF; Musical Symbols 186 | 1D200..1D24F; Ancient Greek Musical Notation 187 | 1D300..1D35F; Tai Xuan Jing Symbols 188 | 1D360..1D37F; Counting Rod Numerals 189 | 1D400..1D7FF; Mathematical Alphanumeric Symbols 190 | 1F000..1F02F; Mahjong Tiles 191 | 1F030..1F09F; Domino Tiles 192 | 1F100..1F1FF; Enclosed Alphanumeric Supplement 193 | 1F200..1F2FF; Enclosed Ideographic Supplement 194 | 20000..2A6DF; CJK Unified Ideographs Extension B 195 | 2A700..2B73F; CJK Unified Ideographs Extension C 196 | 2F800..2FA1F; CJK Compatibility Ideographs Supplement 197 | E0000..E007F; Tags 198 | E0100..E01EF; Variation Selectors Supplement 199 | F0000..FFFFF; Supplementary Private Use Area-A 200 | 100000..10FFFF; Supplementary Private Use Area- 201 | */ 202 | var CharacterUtil = { 203 | /** 204 | * 识别字符类型 205 | * @param input 206 | * @return int CharacterUtil定义的字符类型常量 207 | */ 208 | identifyCharType: function(input){ 209 | var ident = CharType.CHAR_USELESS; //其他的不做处理的字符 210 | if ((input >= '0' && input <= '9')){ 211 | ident = CharType.CHAR_ARABIC; 212 | } 213 | else if ((input >= 'a' && input <= 'z') || (input >= 'A' && input <= 'Z')){ 214 | ident = CharType.CHAR_ENGLISH; 215 | } 216 | else { 217 | ident = CharType.CHAR_CHINESE; 218 | 219 | var ub = input.charCodeAt(0); 220 | if ((ub >= 0x3400 && ub <= 0x4DBF) || // CJK Unified Ideographs Extension A 221 | (ub >= 0x4E00 && ub <= 0x9FFF) || // CJK Unified Ideographs 222 | (ub >= 0xF900 && ub <= 0xFAFF)){ // CJK Compatibility Ideographs 223 | //目前已知的中文字符UTF-8集合 224 | ident = CharType.CHAR_CHINESE; 225 | } 226 | else if ((ub >= 0xFF00 && ub <= 0xFFEF) || // Halfwidth and Fullwidth Forms 全角数字字符和日韩字符 227 | //韩文字符集 228 | (ub >= 0x1100 && ub <= 0x11FF) || // Hangul Jamo 229 | (ub >= 0x3130 && ub <= 0x318F) || // Hangul Compatibility Jamo 230 | (ub >= 0xA960 && ub <= 0xA97F) || // 
Hangul Jamo Extended-A 231 | (ub >= 0xAC00 && ub <= 0xD7AF) || // Hangul Syllables 232 | (ub >= 0xD7B0 && ub <= 0xD7FF) || // Hangul Jamo Extended-B 233 | //日文字符集 234 | (ub >= 0x3040 && ub <= 0x309F) || // Hiragana 平假名 235 | (ub >= 0x30A0 && ub <= 0x30FF) || // KATAKANA 片假名 236 | (ub >= 0x31F0 && ub <= 0x31FF)){ // Katakana Phonetic Extensions 237 | ident = CharType.CHAR_OTHER_CJK; 238 | } 239 | } 240 | return ident; 241 | }, 242 | /** 243 | * 进行字符规格化(全角转半角,大写转小写处理) 244 | * @param input 245 | * @return char 246 | */ 247 | regularize: function(input){ 248 | var code = input.charCodeAt(0); 249 | if (code === 12288) { 250 | input = ' ';//32; 251 | } 252 | else if (code > 65280 && code < 65375) { 253 | input = String.fromCharCode(code - 65248); 254 | if ((input >= 'A' && input <= 'Z')) { 255 | code = input.charCodeAt(0); 256 | input = String.fromCharCode(code + 32); 257 | } 258 | } 259 | else if ((input >= 'A' && input <= 'Z')) { 260 | input = String.fromCharCode(code + 32); 261 | } 262 | 263 | return input; 264 | } 265 | }; 266 | 267 | module.exports = CharacterUtil; -------------------------------------------------------------------------------- /lib/AnalyzeService.js: -------------------------------------------------------------------------------- 1 | var Lexeme = require('./Lexeme'); 2 | 3 | var LexemeService = { 4 | /** 5 | * 合并两个相邻的词元 6 | * @param l 7 | * @param lexemeType 8 | * @return boolean 词元是否成功合并 9 | */ 10 | append: function(prev, next, lexemeType){ 11 | if (next && this.getEndPosition(prev) === this.getBeginPosition(next)){ 12 | prev.len += next.len; 13 | prev.lexemeType = lexemeType; 14 | return true; 15 | }else { 16 | return false; 17 | } 18 | }, 19 | /* 20 | * 判断词元相等算法 21 | * 起始位置偏移、起始位置、终止位置相同 22 | */ 23 | equals: function(l, o){ 24 | if (!o){ 25 | return false; 26 | } 27 | 28 | if (l === o){ 29 | return true; 30 | } 31 | 32 | if (o instanceof Lexeme){ 33 | if (l.offset === o.offset && 34 | l.begin === o.begin && 35 | l.len === o.len){ 36 | return true; 37 | } 38 | else{ 39 | return false; 40 | } 41 | } 42 | else{ 43 | return false; 44 | } 45 | }, 46 | /* 47 | * 词元在排序集合中的比较算法 48 | */ 49 | compare: function(x, y){ 50 | //起始位置优先 51 | if (x.begin < y.begin){ 52 | return -1; 53 | } 54 | else if(x.begin === y.begin){ 55 | //词元长度优先 56 | if(x.len > y.len){ 57 | return -1; 58 | } 59 | else if (x.len === y.len){ 60 | return 0; 61 | } 62 | else {//x.len < y.getLength() 63 | return 1; 64 | } 65 | } 66 | else {//x.begin > y.begin 67 | return 1; 68 | } 69 | }, 70 | /** 71 | * 获取词元在文本中的起始位置 72 | * @return int 73 | */ 74 | getBeginPosition: function(lexeme){ 75 | return lexeme.offset + lexeme.begin; 76 | }, 77 | /** 78 | * 获取词元在文本中的结束位置 79 | * @return int 80 | */ 81 | getEndPosition: function(lexeme){ 82 | return lexeme.offset + lexeme.begin + lexeme.len; 83 | } 84 | }; 85 | 86 | var SortedSetService = { 87 | /** 88 | * 向链表集合添加词元 89 | * @param lexeme 90 | */ 91 | addLexeme: function(sortedSet, lexeme){ 92 | if (sortedSet.lexemeList.length === 0){ 93 | sortedSet.lexemeList.push(lexeme); 94 | return null; 95 | } 96 | var tailList = [], tail, compRes; 97 | tail = sortedSet.pollLast(); // 比 this.lexemeList.pop(); 快 98 | compRes = LexemeService.compare(tail, lexeme); 99 | if (compRes === 0){ // 词元与尾部词元相同,不放入集合 100 | sortedSet.lexemeList.push(tail); 101 | return null; 102 | } 103 | else if (compRes < 0){ // 词元接入链表尾部 104 | sortedSet.lexemeList.push(tail); 105 | sortedSet.lexemeList.push(lexeme); 106 | return null; 107 | } 108 | else{ //从尾部上逆 109 | tailList.unshift(tail); 110 | var arr = 
this.addLexeme(sortedSet, lexeme); 111 | if (!arr) { 112 | sortedSet.lexemeList.concat(tailList); 113 | return null; 114 | } 115 | else{ 116 | tailList = arr.concat(tailList); 117 | return tailList; 118 | } 119 | } 120 | }, 121 | /** 122 | * 返回链表头部元素 123 | * @return 124 | */ 125 | peekFirst: function(sortedSet){ 126 | if (sortedSet.lexemeList.length > 0){ 127 | return sortedSet.lexemeList[0]; 128 | } 129 | return null; 130 | }, 131 | /** 132 | * 取出链表集合的第一个元素 133 | * @return Lexeme 134 | */ 135 | pollFirst: function(sortedSet){ 136 | if (sortedSet.lexemeList.length > 0){ 137 | return sortedSet.lexemeList.shift(); 138 | } 139 | return null; 140 | }, 141 | /** 142 | * 取出链表集合的最后一个元素 143 | * @return Lexeme 144 | */ 145 | pollLast: function(sortedSet){ 146 | return sortedSet.lexemeList.pop(); 147 | }, 148 | /** 149 | * 返回链表尾部元素 150 | * @return 151 | */ 152 | peekLast: function(sortedSet){ 153 | var idx = sortedSet.lexemeList.length - 1; 154 | if (idx >= 0){ 155 | return sortedSet.lexemeList[idx]; 156 | } 157 | return null; 158 | } 159 | }; 160 | 161 | var LexemePathService = { 162 | newLexemePath: function(){ 163 | 164 | }, 165 | compare: function(x, y){ 166 | //比较有效文本长度 167 | if (x.payloadLength > y.payloadLength){ 168 | return -1; 169 | } 170 | else if (x.payloadLength < y.payloadLength){ 171 | return 1; 172 | } 173 | else{ 174 | //比较词元个数,越少越好 175 | if (x.lexemeList.length < y.lexemeList.length){ 176 | return -1; 177 | } 178 | else if (x.lexemeList.length > y.lexemeList.length){ 179 | return 1; 180 | } 181 | else{ 182 | //路径跨度越大越好 183 | if (LexemePathService.getPathLength(x) > LexemePathService.getPathLength(y)){ 184 | return -1; 185 | } 186 | else if (LexemePathService.getPathLength(x) < LexemePathService.getPathLength(y)){ 187 | return 1; 188 | } 189 | else { 190 | //根据统计学结论,逆向切分概率高于正向切分,因此位置越靠后的优先 191 | if (x.pathEnd > y.pathEnd){ 192 | return -1; 193 | } 194 | else if (x.pathEnd < y.pathEnd){ 195 | return 1; 196 | } 197 | else{ 198 | //词长越平均越好 199 | if (LexemePathService.getXWeight(x) > LexemePathService.getXWeight(y)){ 200 | return -1; 201 | } 202 | else if (LexemePathService.getXWeight(x) < LexemePathService.getXWeight(y)){ 203 | return 1; 204 | } 205 | else { 206 | //词元位置权重比较 207 | if (LexemePathService.getPWeight(x) > LexemePathService.getPWeight(y)){ 208 | return -1; 209 | } 210 | else if (LexemePathService.getPWeight(x) < LexemePathService.getPWeight(y)){ 211 | return 1; 212 | } 213 | } 214 | } 215 | } 216 | } 217 | } 218 | return 0; 219 | }, 220 | /** 221 | * 获取LexemePath的路径长度 222 | * @return 223 | */ 224 | getPathLength: function(lexemePath){ 225 | return lexemePath.pathEnd - lexemePath.pathBegin; 226 | }, 227 | getNextLexeme: function(lexemePath, currLexeme){ 228 | var idx = lexemePath.lexemeList.indexOf(currLexeme); 229 | if (idx >= 0 && lexemePath.lexemeList.length > idx){ 230 | return lexemePath.lexemeList[idx+1]; 231 | } 232 | return null; 233 | }, 234 | /** 235 | * 检测词元位置交叉(有歧义的切分),有交叉时返回 true 236 | * @param lexeme 237 | * @return 238 | */ 239 | checkCross: function(lexemePath, lexeme){ 240 | return (lexeme.begin >= lexemePath.pathBegin && lexeme.begin < lexemePath.pathEnd) || 241 | (lexemePath.pathBegin >= lexeme.begin && lexemePath.pathBegin < lexeme.begin+ lexeme.len); 242 | }, 243 | /** 244 | * 移除尾部的Lexeme 245 | * @return 246 | */ 247 | removeTail: function(lexemePath){ 248 | var tail = lexemePath.pollLast(); 249 | if (lexemePath.lexemeList.length === 0){ 250 | lexemePath.pathBegin = -1; 251 | lexemePath.pathEnd = -1; 252 | lexemePath.payloadLength = 0; 253 | } 254 | 
else{ 255 | lexemePath.payloadLength -= tail.len; 256 | var newTail = lexemePath.peekLast(); 257 | lexemePath.pathEnd = newTail.begin + newTail.len; 258 | } 259 | return tail; 260 | }, 261 | /** 262 | * 向LexemePath追加相交的Lexeme 263 | * @param lexeme 264 | * @return 265 | */ 266 | addCrossLexeme: function(lexemePath, lexeme){ 267 | if (lexemePath.lexemeList.length === 0){ 268 | SortedSetService.addLexeme(lexemePath, lexeme); 269 | lexemePath.pathBegin = lexeme.begin; 270 | lexemePath.pathEnd = lexeme.begin + lexeme.len; 271 | lexemePath.payloadLength += lexeme.len; 272 | return true; 273 | } 274 | else if (LexemePathService.checkCross(lexemePath, lexeme)){ 275 | SortedSetService.addLexeme(lexemePath, lexeme); 276 | if (lexeme.begin + lexeme.len > lexemePath.pathEnd){ 277 | lexemePath.pathEnd = lexeme.begin + lexeme.len; 278 | } 279 | lexemePath.payloadLength = lexemePath.pathEnd - lexemePath.pathBegin; 280 | return true; 281 | } 282 | else{ 283 | return false; 284 | } 285 | }, 286 | /** 287 | * 向LexemePath追加不相交的Lexeme 288 | * @param lexeme 289 | * @return 290 | */ 291 | addNotCrossLexeme: function(lexemePath, lexeme){ 292 | if (lexemePath.lexemeList.length === 0){ 293 | SortedSetService.addLexeme(lexemePath, lexeme); 294 | lexemePath.pathBegin = lexeme.begin; 295 | lexemePath.pathEnd = lexeme.begin + lexeme.len; 296 | lexemePath.payloadLength += lexeme.len; 297 | return true; 298 | 299 | } 300 | else if (LexemePathService.checkCross(lexemePath, lexeme)){ 301 | return false; 302 | } 303 | else{ 304 | SortedSetService.addLexeme(lexemePath, lexeme); 305 | lexemePath.payloadLength += lexeme.len; 306 | var head = lexemePath.peekFirst(); 307 | lexemePath.pathBegin = head.begin; 308 | var tail = lexemePath.peekLast(); 309 | lexemePath.pathEnd = tail.begin + tail.len; 310 | return true; 311 | } 312 | }, 313 | /** 314 | * X权重(词元长度积) 315 | * @return 316 | */ 317 | getXWeight: function(lexemePath){ 318 | var product = 1; 319 | var c = lexemePath.peekFirst(); 320 | while(c){ 321 | product *= c.len; 322 | c = LexemePathService.getNextLexeme(lexemePath, c); 323 | } 324 | return product; 325 | }, 326 | /** 327 | * 词元位置权重 328 | * @return 329 | */ 330 | getPWeight: function(lexemePath){ 331 | var pWeight = 0; 332 | var p = 0; 333 | var c = lexemePath.peekFirst(); 334 | while(c){ 335 | p++; 336 | pWeight += p * c.len; 337 | c = LexemePathService.getNextLexeme(lexemePath, c); 338 | } 339 | return pWeight; 340 | } 341 | }; 342 | 343 | 344 | 345 | module.exports = { 346 | LexemeService: LexemeService, 347 | LexemePathService: LexemePathService, 348 | SortedSetService: SortedSetService 349 | }; -------------------------------------------------------------------------------- /lib/Dictionary.js: -------------------------------------------------------------------------------- 1 | var fs = require('fs'), 2 | config = require('./config'), 3 | Hit = require('./Hit'), 4 | HitService = require('./HitService'); 5 | 6 | var DictSegment = require('./DictSegment'); 7 | 8 | var PATH_DIC_MAIN = __dirname + "/dict/main.dic", 9 | PATH_DIC_SURNAME = __dirname + "/dict/surname.dic", 10 | PATH_DIC_QUANTIFIER = __dirname + "/dict/quantifier.dic", 11 | PATH_DIC_SUFFIX = __dirname + "/dict/suffix.dic", 12 | PATH_DIC_PREP = __dirname + "/dict/preposition.dic", 13 | PATH_DIC_STOP = __dirname + "/dict/stopword.dic"; 14 | 15 | var Dictionary = { 16 | inited: false, 17 | initial: function(opts){ 18 | this.inited = true; 19 | this.config = opts.Config || config; 20 | 21 | if (!this.config.ext_dict) { this.config.ext_dict = config.ext_dict;} 
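/*
 * Usage sketch (illustrative values, assuming the loaders below fall back to
 * the PATH_DIC_* paths defined above when no explicit path is passed):
 *   Dictionary.initial({
 *     Config: { ext_dict: ['./dict/custom/mydict.dic'], ext_stopwords: [] }
 *   });
 * A missing Config, ext_dict or ext_stopwords falls back to ./config.js, as the
 * two guards here show. Every word loaded by the load*Dict calls below is stored
 * character-by-character in a DictSegment trie (see fillSegment further down):
 * each char maps to a child node in childrenMap, storeSize counts the children,
 * and nodeState === 1 marks the end of a complete word. matchInDictSegment walks
 * the same trie, flagging a Hit as MATCH when it reaches a node with
 * nodeState === 1 and as PREFIX when that node still has children (storeSize > 0).
 */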
22 | if (!this.config.ext_stopwords) { this.config.ext_stopwords = config.ext_stopwords;} 23 | 24 | this.loadMainDict(opts.MainDictPath); 25 | 26 | this.loadSurnameDict(opts.SurnameDictPath); 27 | this.loadQuantifierDict(opts.QuantifierDictPath); 28 | this.loadSuffixDict(opts.SuffixDictPath); 29 | this.loadPrepDict(opts.PrepDictPath); 30 | this.loadStopWordDict(opts.StopWordDictPath); 31 | 32 | // todo 缓存字典 33 | //fs.writeFileSync('./dict/main.dic.json', JSON.stringify(this._MainDict, null, '\t')); 34 | }, 35 | loadExtDict: function(filepath){ 36 | //读取扩展词典文件 37 | var file = fs.readFileSync(filepath, {encoding: 'utf8'}); 38 | file = file.replace(/ |\r/g, ''); 39 | file = file.split('\n'); 40 | 41 | var theWord; 42 | for(var i=0;i 1){ 245 | //词元还没有完全加入词典树 246 | Dictionary.fillSegment(ds, charArray, begin + 1, len - 1 , enabled); 247 | } 248 | else if (len === 1){ 249 | //已经是词元的最后一个char,设置当前节点状态为enabled, 250 | //enabled=1表明一个完整的词,enabled=0表示从词典中屏蔽当前词 251 | ds.nodeState = enabled; 252 | } 253 | } 254 | }, 255 | matchInDictSegment: function(dictSeg, charArray, begin, len, searchHit) { 256 | if (!searchHit){ 257 | //如果hit为空,新建 258 | searchHit = new Hit(); 259 | //设置hit的起始文本位置 260 | searchHit.begin = begin; 261 | } 262 | else{ 263 | //否则要将HIT状态重置 264 | HitService.setUnmatch(searchHit); 265 | } 266 | //设置hit的当前处理位置 267 | searchHit.end = begin; 268 | 269 | var keyChar = charArray[begin]; 270 | //在map中查找 271 | var ds = dictSeg.childrenMap[keyChar]; 272 | 273 | //STEP2 找到DictSegment,判断词的匹配状态,是否继续递归,还是返回结果 274 | if (ds){ 275 | if (len > 1){ 276 | //词未匹配完,继续往下搜索 277 | return Dictionary.match(ds, charArray, begin + 1 , len - 1 , searchHit); 278 | } 279 | else if (len === 1){ 280 | //搜索最后一个char 281 | if(ds.nodeState === 1){ 282 | //添加HIT状态为完全匹配 283 | HitService.setMatch(searchHit); 284 | } 285 | if(ds.storeSize > 0){ 286 | //添加HIT状态为前缀匹配 287 | HitService.setPrefix(searchHit); 288 | //记录当前位置的DictSegment 289 | searchHit.matchedDictSegment = ds; 290 | } 291 | return searchHit; 292 | } 293 | } 294 | //STEP3 没有找到DictSegment, 将HIT设置为不匹配 295 | return searchHit; 296 | }, 297 | lookforSegment: function(dictSeg, keyChar, create){ 298 | //搜索Map 299 | var ds = dictSeg.childrenMap[keyChar]; 300 | if (!ds && create){ 301 | //构造新的segment 302 | ds = new DictSegment(); 303 | dictSeg.childrenMap[keyChar] = ds; 304 | //当前节点存储segment数目+1 305 | dictSeg.storeSize += 1; 306 | } 307 | 308 | return ds; 309 | } 310 | 311 | }; 312 | 313 | module.exports = Dictionary; 314 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. 
For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | 203 | -------------------------------------------------------------------------------- /lib/dict/custom/single_word_low_freq.dic: -------------------------------------------------------------------------------- 1 | 踧 2 | 覢 3 | 觓 4 | 覛 5 | 覅 6 | 覟 7 | 覗 8 | 覣 9 | 覭 10 | 覂 11 | 觡 12 | 覝 13 | 觟 14 | 褱 15 | 褰 16 | 襒 17 | 覞 18 | 袨 19 | 觏 20 | 赒 21 | 觇 22 | 謍 23 | 讙 24 | 襦 25 | 袤 26 | 誸 27 | 诮 28 | 衩 29 | 茷 30 | 趒 31 | 襌 32 | 诰 33 | 譠 34 | 袄 35 | 聱 36 | 豸 37 | 蠓 38 | 讵 39 | 袅 40 | 诂 41 | 裞 42 | 訄 43 | 荺 44 | 褂 45 | 蠡 46 | 裐 47 | 諴 48 | 芫 49 | 赧 50 | 触 51 | 跫 52 | 褫 53 | 赝 54 | 褡 55 | 衪 56 | 裎 57 | 豜 58 | 褶 59 | 裟 60 | 跏 61 | 袪 62 | 袈 63 | 觐 64 | 跄 65 | 坏 66 | 肱 67 | 裾 68 | 考 69 | 豝 70 | 踰 71 | 覃 72 | 蹓 73 | 黾 74 | 褴 75 | 轲 76 | 裨 77 | 蜇 78 | 鮆 79 | 褥 80 | 誊 81 | 貉 82 | 褊 83 | 蜉 84 | 衔 85 | 詄 86 | 豋 87 | 胼 88 | 荞 89 | 踫 90 | 谗 91 | 耦 92 | 誏 93 | 衮 94 | 胝 95 | 幔 96 | 轭 97 | 赈 98 | 贲 99 | 蓼 100 | 褛 101 | 迵 102 | 觊 103 | 蚜 104 | 讫 105 | 颢 106 | 葄 107 | 觎 108 | 诎 109 | 謢 110 | 蹧 111 | 邬 112 | 芊 113 | 赣 114 | 囱 115 | 蝎 116 | 夆 117 | 蠋 118 | 蠕 119 | 蹼 120 | 臊 121 | 蛭 122 | 颚 123 | 讴 124 | 踽 125 | 菫 126 | 臾 127 | 薮 128 | 蹒 129 | 谀 130 | 菀 131 | 佶 132 | 摀 133 | 佚 134 | 邸 135 | 跺 136 | 豊 137 | 荔 138 | 锌 139 | 诿 140 | 蕤 141 | 诳 142 | 芩 143 | 蹴 144 | 褉 145 | 觔 146 | 舴 147 | 腋 148 | 颍 149 | 膊 150 | 脯 151 | 荪 152 | 郢 153 | 坛 154 | 轫 155 | 醺 156 | 捺 157 | 姝 158 | 胭 159 | 饷 160 | 谪 161 | 驮 162 | 僮 163 | 踯 164 | 忪 165 | 驷 166 | 躅 167 | 忑 168 | 彧 169 | 衲 170 | 唠 171 | 跚 172 | 吃 173 | 诩 174 | 褓 175 | 诤 176 | 豨 177 | 诋 178 | 菈 179 | 逖 180 | 荟 181 | 裆 182 | 喋 183 | 忖 184 | 闾 185 | 诌 186 | 啻 187 | 铀 188 | 菡 189 | 胱 190 | 蹬 191 | 隹 192 | 鹬 193 | 诒 194 | 轧 195 | 萏 196 | 舶 197 | 鳅 198 | 药 199 | 酯 200 | 夯 201 | 偬 202 | 酝 203 | 跻 204 | 咤 205 | 掬 206 | 呆 207 | 蹶 208 | 踞 209 | 蝌 210 | 咋 211 | 谧 212 | 舫 213 | 啐 214 | 茸 215 | 谟 216 | 嵌 217 | 蜿 218 | 魇 219 | 帷 220 | 觑 221 | 鳍 222 | 谏 223 | 哽 224 | 乓 225 | 蚌 226 | 嗙 227 | 巿 228 | 刽 229 | 踱 230 | 腆 231 | 薏 232 | 蜃 233 | 谑 234 | 躄 235 | 鸾 236 | 齁 237 | 腼 238 | 呷 239 | 吆 240 | 荀 241 | 裱 242 | 辇 243 | 睫 244 | 伎 245 | 妲 246 | 菠 247 | 鼐 248 | 麾 249 | 芮 250 | 鲑 251 | 辉 252 | 啜 253 | 苞 254 | 踼 255 | 荃 256 | 杞 257 | 浣 258 | 沬 259 | 胤 260 | 恿 261 | 驭 262 | 逵 263 | 钛 264 | 徕 265 | 贮 266 | 蔫 267 | 锚 268 | 衙 269 | 肄 270 | 豺 271 | 闸 272 | 隋 273 | 腑 274 | 脐 275 | 脓 276 | 叱 277 | 迥 278 | 踝 279 | 馥 280 | 佣 281 | 喳 282 | 迩 283 | 贻 284 | 诙 285 | 椭 286 | 琬 287 | 赂 288 | 诧 289 | 苯 290 | 怂 291 | 蟆 292 | 龊 293 | 漳 294 | 迭 295 | 垛 296 | 铲 297 | 馊 298 | 娓 299 | 葆 300 | 赑 301 | 卍 302 | 遽 303 | 谯 304 | 賏 305 | 蛹 306 | 锤 307 | 粟 308 | 衿 309 | 渥 310 | 铳 311 | 刍 312 | 镳 313 | 匮 314 | 万 315 | 骁 316 | 酣 317 | 酉 318 | 骥 319 | 寨 320 | 蓁 321 | 诽 322 | 钡 323 | 浙 324 | 酗 325 | 跩 326 | 拗 327 | 坷 328 | 雱 329 | 闺 330 | 喈 331 | 晔 332 | 螳 333 | 谙 334 | 蹂 335 | 鞑 336 | 蔗 337 | 账 338 | 垚 339 | 瞩 340 | 谩 341 | 掳 342 | 媲 343 | 葾 344 | 鳗 345 | 钣 346 | 檀 347 | 阕 348 | 聿 349 | 蜍 350 | 仆 351 | 嗅 352 | 峥 353 | 蜈 354 | 垠 355 | 蚓 356 | 麓 357 | 殉 358 | 弩 359 | 朴 360 | 胥 361 | 瘴 362 | 篑 363 | 镍 364 | 鹂 365 | 暐 366 | 榷 367 | 咀 368 | 佯 369 | 蚣 370 | 荻 371 | 鬓 372 | 仝 373 | 裴 374 | 讷 375 | 孺 
376 | 咨 377 | 俑 378 | 遴 379 | 吽 380 | 笋 381 | 耀 382 | 霾 383 | 绎 384 | 咿 385 | 骸 386 | 霭 387 | 昕 388 | 漩 389 | 浒 390 | 轼 391 | 婿 392 | 嗳 393 | 钙 394 | 谲 395 | 蛾 396 | 跛 397 | 惺 398 | 翎 399 | 炽 400 | 晒 401 | 钳 402 | 鞘 403 | 谚 404 | 钊 405 | 背 406 | 瀛 407 | 槌 408 | 臀 409 | 跋 410 | 窒 411 | 藤 412 | 噬 413 | 蓊 414 | 褐 415 | 蔺 416 | 鲍 417 | 鲨 418 | 舔 419 | 箔 420 | 萦 421 | 诏 422 | 褔 423 | 咄 424 | 俘 425 | 彪 426 | 饪 427 | 嘱 428 | 诬 429 | 踮 430 | 囝 431 | 佢 432 | 汶 433 | 讹 434 | 踅 435 | 咐 436 | 讼 437 | 玟 438 | 迂 439 | 亵 440 | 婵 441 | 馁 442 | 崭 443 | 惦 444 | 蠹 445 | 濒 446 | 匈 447 | 蟋 448 | 谕 449 | 酪 450 | 眛 451 | 煦 452 | 甭 453 | 谄 454 | 妾 455 | 梧 456 | 芜 457 | 蛎 458 | 颐 459 | 雌 460 | 褒 461 | 臼 462 | 圳 463 | 剔 464 | 噶 465 | 耨 466 | 嗈 467 | 勋 468 | 冶 469 | 扑 470 | 膺 471 | 腺 472 | 荤 473 | 坞 474 | 羲 475 | 栾 476 | 傌 477 | 幌 478 | 噗 479 | 蛀 480 | 觞 481 | 塾 482 | 耙 483 | 枭 484 | 擞 485 | 缅 486 | 踌 487 | 蟀 488 | 侥 489 | 诣 490 | 姜 491 | 甸 492 | 俭 493 | 泠 494 | 躇 495 | 萌 496 | 虏 497 | 匕 498 | 藩 499 | 嗽 500 | 蜻 501 | 咛 502 | 艹 503 | 跎 504 | 蔬 505 | 鸠 506 | 跆 507 | 肋 508 | 巅 509 | 芯 510 | 荐 511 | 荼 512 | 慵 513 | 咸 514 | 杭 515 | 樟 516 | 夸 517 | 戮 518 | 吱 519 | 模 520 | 葔 521 | 迢 522 | 砰 523 | 须 524 | 蒜 525 | 骐 526 | 茱 527 | 痊 528 | 蛤 529 | 蜴 530 | 诟 531 | 俾 532 | 疮 533 | 悴 534 | 袒 535 | 蒹 536 | 镖 537 | 娥 538 | 鹉 539 | 婊 540 | 噫 541 | 矜 542 | 岳 543 | 鹦 544 | 葭 545 | 褚 546 | 嵩 547 | 丫 548 | 凛 549 | 峦 550 | 惚 551 | 懊 552 | 韶 553 | 憋 554 | 聋 555 | 讪 556 | 瘫 557 | 霓 558 | 哺 559 | 蝙 560 | 靥 561 | 堇 562 | 铺 563 | 趾 564 | 褪 565 | 缆 566 | 媛 567 | 胧 568 | 肛 569 | 珈 570 | 畴 571 | 驹 572 | 熔 573 | 臆 574 | 肘 575 | 豁 576 | 冕 577 | 吊 578 | 韧 579 | 炜 580 | 舱 581 | 恁 582 | 巳 583 | 舵 584 | 臻 585 | 戊 586 | 稽 587 | 诲 588 | 隽 589 | 铐 590 | 鲫 591 | 畸 592 | 饥 593 | 茉 594 | 蒲 595 | 矶 596 | 峨 597 | 蚵 598 | 蔼 599 | 诛 600 | 焰 601 | 偈 602 | 蚱 603 | 骯 604 | 盔 605 | 巩 606 | 折 607 | 偕 608 | 嗓 609 | 辙 610 | 鸶 611 | 酵 612 | 莘 613 | 耘 614 | 汹 615 | 楞 616 | 陡 617 | 裳 618 | 憎 619 | 讳 620 | 荆 621 | 笃 622 | 屉 623 | 霈 624 | 恬 625 | 蹦 626 | 扬 627 | 侃 628 | 艳 629 | 璇 630 | 韬 631 | 烬 632 | 傀 633 | 铮 634 | 曦 635 | 搂 636 | 蝠 637 | 霄 638 | 胺 639 | 遐 640 | 飨 641 | 郡 642 | 困 643 | 呎 644 | 墅 645 | 鞠 646 | 瘤 647 | 藻 648 | 咆 649 | 踹 650 | 狷 651 | 镀 652 | 桐 653 | 赘 654 | 揽 655 | 炬 656 | 氢 657 | 膛 658 | 搪 659 | 湿 660 | 唆 661 | 兑 662 | 暸 663 | 厮 664 | 懈 665 | 媳 666 | 塘 667 | 靡 668 | 鹭 669 | 祟 670 | 冀 671 | 豚 672 | 蹄 673 | 橙 674 | 阎 675 | 硫 676 | 埠 677 | 噱 678 | 妃 679 | 搓 680 | 啃 681 | 俞 682 | 龚 683 | 橄 684 | 嚎 685 | 椎 686 | 蓦 687 | 朔 688 | 痘 689 | 鳞 690 | 铠 691 | 叽 692 | 跤 693 | 裔 694 | 诃 695 | 岫 696 | 怯 697 | 讥 698 | 聂 699 | 垢 700 | 藐 701 | 濑 702 | 莒 703 | 淇 704 | 毯 705 | 礁 706 | 赃 707 | 庐 708 | 辕 709 | 瞌 710 | 锯 711 | 莓 712 | 涡 713 | 昼 714 | 捌 715 | 嗡 716 | 倌 717 | 禹 718 | 蹋 719 | 卯 720 | 粪 721 | 耽 722 | 闰 723 | 曳 724 | 苔 725 | 诵 726 | 菇 727 | 斟 728 | 芥 729 | 莅 730 | 喀 731 | 麒 732 | 颊 733 | 扛 734 | 曜 735 | 咎 736 | 缮 737 | 诫 738 | 躁 739 | 茜 740 | 缤 741 | 暧 742 | 郄 743 | 酥 744 | 僻 745 | 躬 746 | 峙 747 | 驯 748 | 噎 749 | 厦 750 | 澜 751 | 杏 752 | 樽 753 | 勘 754 | 煤 755 | 茎 756 | 嚷 757 | 昆 758 | 铸 759 | 烘 760 | 邹 761 | 廓 762 | 拚 763 | 俐 764 | 裘 765 | 饵 766 | 恃 767 | 蔓 768 | 笙 769 | 茁 770 | 楷 771 | 嚼 772 | 锻 773 | 蕊 774 | 脖 775 | 茍 776 | 壤 777 | 琮 778 | 莽 779 | 塌 780 | 蚤 781 | 膳 782 | 磋 783 | 蓓 784 | 澈 785 | 萎 786 | 擒 787 | 禄 788 | 儡 789 | 懦 790 | 瞻 791 | 虔 792 | 粥 793 | 赦 794 | 畜 795 | 彷 796 | 寥 797 | 揣 798 | 嫖 799 | 朽 800 | 挂 801 | 啄 802 | 浇 803 | 崖 804 | 棠 805 | 禽 806 | 台 807 | 邂 808 | 矫 809 | 茅 810 | 惫 811 | 吠 812 | 苟 813 | 叩 814 | 徊 815 | 巍 816 | 舆 817 | 邵 818 | 彗 819 | 萃 
820 | 拱 821 | 嘶 822 | 貂 823 | 趴 824 | 愿 825 | 脊 826 | 冗 827 | 杆 828 | 蕙 829 | 铎 830 | 囚 831 | 啼 832 | 谤 833 | 徘 834 | 芹 835 | 骆 836 | 夭 837 | 饺 838 | 馒 839 | 溺 840 | 咫 841 | 屐 842 | 绅 843 | 诅 844 | 缉 845 | 渣 846 | 敞 847 | 萱 848 | 丰 849 | 俏 850 | 螃 851 | 蜀 852 | 徽 853 | 逞 854 | 跪 855 | 虞 856 | 隙 857 | 匀 858 | 憧 859 | 辄 860 | 鸳 861 | 疵 862 | 跷 863 | 呱 864 | 穆 865 | 阑 866 | 搏 867 | 肾 868 | 靶 869 | 阱 870 | 囡 871 | 寰 872 | 庄 873 | 蟾 874 | 怠 875 | 腕 876 | 烟 877 | 巾 878 | 奢 879 | 垄 880 | 姨 881 | 躯 882 | 肺 883 | 钰 884 | 佰 885 | 阙 886 | 雏 887 | 溉 888 | 焚 889 | 丑 890 | 锥 891 | 诘 892 | 瞪 893 | 茹 894 | 绊 895 | 蚀 896 | 袱 897 | 煽 898 | 窕 899 | 掷 900 | 沮 901 | 钞 902 | 涕 903 | 浏 904 | 仄 905 | 孰 906 | 峻 907 | 皱 908 | 芦 909 | 膏 910 | 晰 911 | 衬 912 | 谍 913 | 丞 914 | 绽 915 | 蔽 916 | 呕 917 | 轿 918 | 隶 919 | 楠 920 | 匣 921 | 葵 922 | 沫 923 | 刃 924 | 禧 925 | 晦 926 | 哔 927 | 晖 928 | 绣 929 | 仟 930 | 窟 931 | 谛 932 | 瀚 933 | 黛 934 | 忿 935 | 姚 936 | 蜘 937 | 耸 938 | 捍 939 | 斐 940 | 卜 941 | 辗 942 | 刁 943 | 涅 944 | 泓 945 | 梵 946 | 扳 947 | 暇 948 | 袜 949 | 柠 950 | 傍 951 | 逮 952 | 呃 953 | 蜗 954 | 窍 955 | 琉 956 | 喃 957 | 溢 958 | 抉 959 | 旷 960 | 卅 961 | 亟 962 | 膝 963 | 伶 964 | 闇 965 | 莺 966 | 蔚 967 | 醋 968 | 瑛 969 | 拭 970 | 绮 971 | 鑫 972 | 圭 973 | 脂 974 | 酿 975 | 诈 976 | 膨 977 | 隧 978 | 惭 979 | 庚 980 | 衅 981 | 哨 982 | 凋 983 | 里 984 | 祯 985 | 撼 986 | 谭 987 | 稻 988 | 迋 989 | 碌 990 | 罕 991 | 逾 992 | 嗜 993 | 蹲 994 | 檬 995 | 肖 996 | 辖 997 | 襟 998 | 扎 999 | 槟 1000 | 缔 1001 | 袂 1002 | 敷 1003 | 腥 1004 | 喘 1005 | 簿 1006 | 鳖 1007 | 出 1008 | 噢 1009 | 炫 1010 | 佑 1011 | 贷 1012 | 粮 1013 | 荳 1014 | 桦 1015 | 颉 1016 | 哑 1017 | 倪 1018 | 颤 1019 | 御 1020 | 芽 1021 | 朦 1022 | 裹 1023 | 贬 1024 | 蕉 1025 | 蝉 1026 | 赎 1027 | 崔 1028 | 滔 1029 | 茵 1030 | 径 1031 | 克 1032 | 啤 1033 | 拯 1034 | 坟 1035 | 葱 1036 | 芋 1037 | 瞒 1038 | 掠 1039 | 绳 1040 | 蛛 1041 | 匠 1042 | 凸 1043 | 苛 1044 | 押 1045 | 楣 1046 | 芙 1047 | 酌 1048 | 俺 1049 | 掏 1050 | 倡 1051 | 唾 1052 | 瞄 1053 | 磊 1054 | 吼 1055 | 搅 1056 | 溃 1057 | 聆 1058 | 沌 1059 | 蝇 1060 | 鸥 1061 | 妒 1062 | 焕 1063 | 拙 1064 | 夷 1065 | 迄 1066 | 绰 1067 | 锵 1068 | 耿 1069 | 祺 1070 | 吶 1071 | 惶 1072 | 廊 1073 | 兜 1074 | 倩 1075 | 杖 1076 | 窄 1077 | 僚 1078 | 竖 1079 | 芷 1080 | 咚 1081 | 鲢 1082 | 沛 1083 | 挪 1084 | 柄 1085 | 顷 1086 | 璞 1087 | 裸 1088 | 鵰 1089 | 郊 1090 | 屿 1091 | 仕 1092 | 艘 1093 | 铅 1094 | 铝 1095 | 饲 1096 | 黯 1097 | 疫 1098 | 栽 1099 | 喉 1100 | 逗 1101 | 祇 1102 | 阪 1103 | 侍 1104 | 抒 1105 | 弗 1106 | 尬 1107 | 浦 1108 | 鄙 1109 | 盏 1110 | 喽 1111 | 炳 1112 | 卵 1113 | 肌 1114 | 迦 1115 | 擅 1116 | 豹 1117 | 胏 1118 | 炼 1119 | 悸 1120 | 谴 1121 | 贾 1122 | 胀 1123 | 疋 1124 | 矿 1125 | 梨 1126 | 碑 1127 | 髓 1128 | 巢 1129 | 叹 1130 | 屡 1131 | 滩 1132 | 侮 1133 | 橘 1134 | 嘲 1135 | 酬 1136 | 枚 1137 | 氓 1138 | 菌 1139 | 颁 1140 | 萝 1141 | 谘 1142 | 曝 1143 | 薯 1144 | 襄 1145 | 辽 1146 | 萄 1147 | 寇 1148 | 舜 1149 | 颂 1150 | 撰 1151 | 腻 1152 | 崩 1153 | 咕 1154 | 癌 1155 | 歇 1156 | 汰 1157 | 烁 1158 | 撇 1159 | 宴 1160 | 惩 1161 | 烛 1162 | 贰 1163 | 呻 1164 | 呒 1165 | 翩 1166 | 绑 1167 | 捞 1168 | 爹 1169 | 秉 1170 | 棉 1171 | 妓 1172 | 尉 1173 | 霍 1174 | 甫 1175 | 尝 1176 | 葡 1177 | 蒸 1178 | 鸦 1179 | 挚 1180 | 奸 1181 | 纬 1182 | 艰 1183 | 履 1184 | 葬 1185 | 滨 1186 | 耕 1187 | 婴 1188 | 醇 1189 | 堵 1190 | 钉 1191 | 喧 1192 | 遂 1193 | 锣 1194 | 垮 1195 | 蓬 1196 | 薛 1197 | 虐 1198 | 睁 1199 | 厨 1200 | 娶 1201 | 浆 1202 | 挨 1203 | 矢 1204 | 蕾 1205 | 伺 1206 | 券 1207 | 鹏 1208 | 削 1209 | 蓄 1210 | 琦 1211 | 熄 1212 | 湘 1213 | 慌 1214 | 枕 1215 | 衍 1216 | 薇 1217 | 囊 1218 | 喂 1219 | 蕴 1220 | 倘 1221 | 峡 1222 | 浊 1223 | 窃 1224 | 颈 1225 | 裙 1226 | 晕 1227 | 缚 1228 | 获 1229 | 帕 1230 | 脾 1231 | 莹 1232 | 逍 1233 | 姬 1234 | 韦 
1235 | 畔 1236 | 伐 1237 | 霞 1238 | 嘘 1239 | 盐 1240 | 摧 1241 | 债 1242 | 佩 1243 | 畏 1244 | 驴 1245 | 氧 1246 | 奴 1247 | 瘦 1248 | 菊 1249 | 廿 1250 | 狭 1251 | 赴 1252 | 碳 1253 | 坊 1254 | 盆 1255 | 趟 1256 | 匿 1257 | 肇 1258 | 溶 1259 | 揭 1260 | 剥 1261 | 沦 1262 | 秃 1263 | 郝 1264 | 唔 1265 | 锡 1266 | 娇 1267 | 抚 1268 | 屎 1269 | 甩 1270 | 娱 1271 | 表 1272 | 犬 1273 | 魁 1274 | 蒂 1275 | 皓 1276 | 祷 1277 | 瞎 1278 | 瘾 1279 | 煎 1280 | 螺 1281 | 遮 1282 | 坠 1283 | 剎 1284 | 筝 1285 | 棵 1286 | 冤 1287 | 崎 1288 | 昔 1289 | 驼 1290 | 竿 1291 | 甄 1292 | 斑 1293 | 歹 1294 | 骏 1295 | 缝 1296 | 鞭 1297 | 垫 1298 | 淹 1299 | 并 1300 | 遨 1301 | 宠 1302 | 掰 1303 | 枯 1304 | 艇 1305 | 豫 1306 | 募 1307 | 郁 1308 | 稚 1309 | 懿 1310 | 辐 1311 | 酱 1312 | 恕 1313 | 范 1314 | 涂 1315 | 滤 1316 | 肃 1317 | 膜 1318 | 佬 1319 | 哼 1320 | 慨 1321 | 穗 1322 | 辰 1323 | 雁 1324 | 瑟 1325 | 帆 1326 | 拢 1327 | 汁 1328 | 蝴 1329 | 冈 1330 | 诠 1331 | 蹈 1332 | 黏 1333 | 痞 1334 | 屑 1335 | 潇 1336 | 觅 1337 | 钧 1338 | 挣 1339 | 谐 1340 | 霜 1341 | 诊 1342 | 熬 1343 | 讽 1344 | 歧 1345 | 戈 1346 | 闯 1347 | 饶 1348 | 斤 1349 | 婉 1350 | 致 1351 | 贿 1352 | 苑 1353 | 矮 1354 | 毋 1355 | 詹 1356 | 祈 1357 | 咳 1358 | 昱 1359 | 佐 1360 | 帖 1361 | 猩 1362 | 尹 1363 | 诇 1364 | 肆 1365 | 亭 1366 | 丘 1367 | 淘 1368 | 颠 1369 | 勃 1370 | 讶 1371 | 抖 1372 | 袁 1373 | 柱 1374 | 僧 1375 | 蚊 1376 | 匹 1377 | 辣 1378 | 螂 1379 | 澡 1380 | 昧 1381 | 诡 1382 | 槽 1383 | 穴 1384 | 斩 1385 | 聘 1386 | 扶 1387 | 熙 1388 | 驰 1389 | 棍 1390 | 兆 1391 | 蟑 1392 | 矩 1393 | 谬 1394 | 贫 1395 | 鼎 1396 | 践 1397 | 盲 1398 | 眷 1399 | 尿 1400 | 伫 1401 | 饿 1402 | 砸 1403 | 妄 1404 | 荡 1405 | 炒 1406 | 冥 1407 | 偿 1408 | 墓 1409 | 骄 1410 | 毙 1411 | 淋 1412 | 芝 1413 | 胃 1414 | 宅 1415 | 董 1416 | 梭 1417 | 凑 1418 | 宰 1419 | 卑 1420 | 丛 1421 | 纠 1422 | 肢 1423 | 闽 1424 | 铜 1425 | 寺 1426 | 瞬 1427 | 澳 1428 | 庞 1429 | 腔 1430 | 泼 1431 | 昂 1432 | 梁 1433 | 躺 1434 | 姻 1435 | 潭 1436 | 吋 1437 | 撤 1438 | 殖 1439 | 轴 1440 | 颖 1441 | 冻 1442 | 琼 1443 | 恳 1444 | 衫 1445 | 譬 1446 | 猎 1447 | 衰 1448 | 桶 1449 | 辜 1450 | 筒 1451 | 赫 1452 | 仗 1453 | 膀 1454 | 乳 1455 | 嚣 1456 | 划 1457 | 玮 1458 | 卿 1459 | 枉 1460 | 埃 1461 | 跨 1462 | 粹 1463 | 猴 1464 | 愤 1465 | 壹 1466 | 卢 1467 | 尧 1468 | 翰 1469 | 叮 1470 | 媚 1471 | 钮 1472 | 袖 1473 | 斌 1474 | 卓 1475 | 粽 1476 | 雀 1477 | 谦 1478 | 傅 1479 | 殿 1480 | 睹 1481 | 菁 1482 | 桂 1483 | 诱 1484 | 舌 1485 | 惟 1486 | 岗 1487 | 衷 1488 | 屈 1489 | 陋 1490 | 陌 1491 | 宵 1492 | 麟 1493 | 魏 1494 | 贸 1495 | 几 1496 | 埔 1497 | 谎 1498 | 袍 1499 | 卸 1500 | 仓 1501 | 匪 1502 | 叛 1503 | 肠 1504 | 肝 1505 | 俄 1506 | 孕 1507 | 庙 1508 | 嫁 1509 | 肤 1510 | 拦 1511 | 羯 1512 | 匙 1513 | 咏 1514 | 蠢 1515 | 纽 1516 | 拘 1517 | 旨 1518 | 胁 1519 | 馨 1520 | 珊 1521 | 签 1522 | 赔 1523 | 秩 1524 | 喻 1525 | 谜 1526 | 翠 1527 | 芭 1528 | 摊 1529 | 侣 1530 | 灿 1531 | 寡 1532 | 罐 1533 | 贼 1534 | 叙 1535 | 谨 1536 | 体 1537 | 敲 1538 | 浴 1539 | 吻 1540 | 臂 1541 | 袭 1542 | 煮 1543 | 腹 1544 | 暮 1545 | 曹 1546 | 虹 1547 | 抑 1548 | 贩 1549 | 踩 1550 | 澎 1551 | 糖 1552 | 催 1553 | 萍 1554 | 垂 1555 | 斥 1556 | 侬 1557 | 拷 1558 | 唤 1559 | 匆 1560 | 阮 1561 | 飙 1562 | 柴 1563 | 剂 1564 | 妖 1565 | 添 1566 | 畅 1567 | 汗 1568 | 鸭 1569 | 稀 1570 | 晋 1571 | 埋 1572 | 弊 1573 | 返 1574 | 叡 1575 | 娟 1576 | 玻 1577 | 腾 1578 | 栋 1579 | 歪 1580 | 邓 1581 | 渴 1582 | 粒 1583 | 泣 1584 | 疾 1585 | 蓉 1586 | 塑 1587 | 祂 1588 | 储 1589 | 劣 1590 | 柯 1591 | 陶 1592 | 患 1593 | 蛇 1594 | 腐 1595 | 琳 1596 | 慎 1597 | 泊 1598 | 牢 1599 | 呈 1600 | 趁 1601 | 恶 1602 | 浑 1603 | 扮 1604 | 樱 1605 | 臣 1606 | 遵 1607 | 缠 1608 | 虫 1609 | 撒 1610 | 叉 1611 | 刑 1612 | 苗 1613 | 脉 1614 | 盈 1615 | 津 1616 | 愧 1617 | 摔 1618 | 盒 1619 | 丧 1620 | 鹤 1621 | 呦 1622 | 厕 1623 | 斜 1624 | 芒 1625 | 翅 1626 | 悄 1627 | 晃 1628 | 茂 1629 | 寸 
1630 | 杉 1631 | 旺 1632 | 俩 1633 | 雯 1634 | 霖 1635 | 递 1636 | 胶 1637 | 氛 1638 | 谣 1639 | 捉 1640 | 虾 1641 | 秘 1642 | 漠 1643 | 扭 1644 | 贞 1645 | 陵 1646 | 叔 1647 | 轨 1648 | 鹅 1649 | 液 1650 | 妥 1651 | 贱 1652 | 涨 1653 | 滥 1654 | 痕 1655 | 沿 1656 | 秤 1657 | 措 1658 | 巡 1659 | 丈 1660 | 魅 1661 | 欲 1662 | 缸 1663 | 鹿 1664 | 汝 1665 | 迁 1666 | 矣 1667 | 肩 1668 | 烤 1669 | 笛 1670 | 迅 1671 | 劫 1672 | 趋 1673 | 披 1674 | 荷 1675 | 卒 1676 | 丙 1677 | 碗 1678 | 伙 1679 | 椅 1680 | 赞 1681 | 侦 1682 | 灾 1683 | 秦 1684 | 蛙 1685 | 禅 1686 | 慰 1687 | 余 1688 | 朗 1689 | 辱 1690 | 征 1691 | 愚 1692 | 抛 1693 | 挺 1694 | 彭 1695 | 允 1696 | 靖 1697 | 滋 1698 | 凝 1699 | 赠 1700 | 莎 1701 | 顽 1702 | 狠 1703 | 堕 1704 | 翘 1705 | 惹 1706 | 纲 1707 | 贯 1708 | 饼 1709 | 抬 1710 | 逆 1711 | 堪 1712 | 坤 1713 | 斗 1714 | 钦 1715 | 疏 1716 | 羞 1717 | 扇 1718 | 蜂 1719 | 赌 1720 | 驻 1721 | 屏 1722 | 爵 1723 | 轰 1724 | 契 1725 | 悦 1726 | 邻 1727 | 哉 1728 | 陀 1729 | 裂 1730 | 刷 1731 | 毅 1732 | 拾 1733 | 疼 1734 | 阔 1735 | 耍 1736 | 亏 1737 | 吟 1738 | 锐 1739 | 惧 1740 | 锅 1741 | 蝶 1742 | 壳 1743 | 糕 1744 | 舟 1745 | 牧 1746 | 妮 1747 | 粗 1748 | 仇 1749 | 驶 1750 | 促 1751 | 孝 1752 | 裤 1753 | 誉 1754 | 家 1755 | 迈 1756 | 姿 1757 | 踪 1758 | 兔 1759 | 综 1760 | 旭 1761 | 韵 1762 | 齿 1763 | 乔 1764 | 怖 1765 | 晴 1766 | 闷 1767 | 墨 1768 | 咬 1769 | 侧 1770 | 狱 1771 | 琪 1772 | 梯 1773 | 宾 1774 | 枫 1775 | 锦 1776 | 瑜 1777 | 敦 1778 | 矛 1779 | 弘 1780 | 玛 1781 | 茫 1782 | 迪 1783 | 览 1784 | 挤 1785 | 雳 1786 | 岚 1787 | 卷 1788 | 黎 1789 | 薄 1790 | 柳 1791 | 咦 1792 | 廷 1793 | 瞧 1794 | 幅 1795 | 挖 1796 | 唬 1797 | 侯 1798 | 祸 1799 | 饰 1800 | 儒 1801 | 捡 1802 | 筋 1803 | 融 1804 | 耗 1805 | 铃 1806 | 奉 1807 | 鼻 1808 | 坜 1809 | 曼 1810 | 贡 1811 | 嗨 1812 | 炎 1813 | 啡 1814 | 捐 1815 | 炮 1816 | 霹 1817 | 貌 1818 | 鸣 1819 | 饱 1820 | 廉 1821 | 绘 1822 | 咪 1823 | 吝 1824 | 肚 1825 | 云 1826 | 翼 1827 | 氏 1828 | 骚 1829 | 爷 1830 | 寿 1831 | 绕 1832 | 唷 1833 | 牺 1834 | 屠 1835 | 谋 1836 | 彻 1837 | 俱 1838 | 粉 1839 | 雾 1840 | 涵 1841 | 侨 1842 | 础 1843 | 疗 1844 | 署 1845 | 稿 1846 | 涉 1847 | 稣 1848 | 誓 1849 | 箭 1850 | 涯 1851 | 锺 1852 | 迹 1853 | 抄 1854 | 踢 1855 | 贪 1856 | 咖 1857 | 莱 1858 | 夺 1859 | 勉 1860 | 焦 1861 | 蒋 1862 | 桑 1863 | 沧 1864 | 恰 1865 | 泳 1866 | 牲 1867 | 戒 1868 | 恼 1869 | 夕 1870 | 棚 1871 | 爬 1872 | 菲 1873 | 翁 1874 | 奔 1875 | 滴 1876 | 玄 1877 | 捷 1878 | 曰 1879 | 愉 1880 | 逊 1881 | 憾 1882 | 钓 1883 | 壁 1884 | 躲 1885 | 嫌 1886 | 姆 1887 | 乏 1888 | 洛 1889 | 逼 1890 | 磨 1891 | 剪 1892 | 逝 1893 | 亨 1894 | 盼 1895 | 杯 1896 | 敝 1897 | 碍 1898 | 痴 1899 | 植 1900 | 瑰 1901 | 勤 1902 | 悟 1903 | 彬 1904 | 删 1905 | 薪 1906 | 悠 1907 | 胎 1908 | 侵 1909 | 坪 1910 | 赋 1911 | 弯 1912 | 丹 1913 | 巫 1914 | 轩 1915 | 辨 1916 | 吐 1917 | 么 1918 | 盾 1919 | 扯 1920 | 割 1921 | 艾 1922 | 幼 1923 | 捕 1924 | 召 1925 | 怒 1926 | 坡 1927 | 缓 1928 | 猛 1929 | 驾 1930 | 莉 1931 | 彦 1932 | 韩 1933 | 鞋 1934 | 碧 1935 | 泽 1936 | 泉 1937 | 缴 1938 | 跃 1939 | 喇 1940 | 腿 1941 | 糟 1942 | 胆 1943 | 摘 1944 | 朵 1945 | 逛 1946 | 甜 1947 | 拔 1948 | 劲 1949 | 悉 1950 | 穷 1951 | 汤 1952 | 唐 1953 | 臭 1954 | 玲 1955 | 怡 1956 | 舍 1957 | 欺 1958 | 蜜 1959 | 耻 1960 | 坦 1961 | 叭 1962 | 亿 1963 | 忌 1964 | 鲁 1965 | 繁 1966 | 泥 1967 | 伸 1968 | 壮 1969 | 串 1970 | 圾 1971 | 币 1972 | 荒 1973 | 垃 1974 | 妇 1975 | 旦 1976 | 截 1977 | 喷 1978 | 碎 1979 | 吕 1980 | 犹 1981 | 抹 1982 | 脆 1983 | 煞 1984 | 胞 1985 | 晶 1986 | 潜 1987 | 玫 1988 | 妻 1989 | 估 1990 | 陷 1991 | 孔 1992 | 娃 1993 | 兽 1994 | 肥 1995 | 凉 1996 | 岂 1997 | 逻 1998 | 胸 1999 | 杜 2000 | 袋 2001 | 甘 2002 | 邀 2003 | 培 2004 | 龄 2005 | 辆 2006 | 廖 2007 | 冲 2008 | 渡 2009 | 羽 2010 | 秒 2011 | 辞 2012 | 倾 2013 | 窝 2014 | 柏 2015 | 淑 2016 | 诞 2017 | 漏 2018 | 姑 2019 | 托 2020 | 吾 2021 | 纷 2022 | 拆 2023 | 浩 2024 | 税 
2025 | 邱 2026 | 迟 2027 | 筹 2028 | 监 2029 | 汪 2030 | 擎 2031 | 衡 2032 | 狐 2033 | 灰 2034 | 尖 2035 | 番 2036 | 罚 2037 | 证 2038 | 盗 2039 | 祥 2040 | 毫 2041 | 彰 2042 | 扩 2043 | 幽 2044 | 阐 2045 | 喊 2046 | 菩 2047 | 赐 2048 | 奋 2049 | 鲜 2050 | 劝 2051 | 栏 2052 | 慈 2053 | 扫 2054 | 尽 2055 | 穹 2056 | 丌 2057 | 绪 2058 | 砂 2059 | 勿 2060 | 抢 2061 | 啪 2062 | 庸 2063 | 赤 2064 | 饮 2065 | 萨 2066 | 兼 2067 | 访 2068 | 舒 2069 | 裕 2070 | 逸 2071 | 宙 2072 | 丸 2073 | 准 2074 | 魂 2075 | 厚 2076 | 励 2077 | 仰 2078 | 糊 2079 | 顿 2080 | 闭 2081 | 塔 2082 | 枪 2083 | 睛 2084 | 斋 2085 | 奥 2086 | 恭 2087 | 翔 2088 | 遥 2089 | 航 2090 | 孟 2091 | 昌 2092 | 卧 2093 | 颇 2094 | 革 2095 | 邪 2096 | 阻 2097 | 蟹 2098 | 裁 2099 | 后 2100 | 函 2101 | 于 2102 | 拳 2103 | 宽 2104 | 锋 2105 | 州 2106 | 葛 2107 | 拒 2108 | 池 2109 | 镇 2110 | 芬 2111 | 岸 2112 | 寞 2113 | 凭 2114 | 姊 2115 | 殊 2116 | 板 2117 | 勒 2118 | 慕 2119 | 跌 2120 | 踏 2121 | 填 2122 | 陪 2123 | 逐 2124 | 洽 2125 | 描 2126 | 妨 2127 | 仪 2128 | 摄 2129 | 紫 2130 | 谅 2131 | 阅 2132 | 邦 2133 | 麦 2134 | 莲 2135 | 闪 2136 | 纵 2137 | 庭 2138 | 圈 2139 | 榜 2140 | 滑 2141 | 舰 2142 | 面 2143 | 献 2144 | 浅 2145 | 飘 2146 | 宋 2147 | 俗 2148 | 沟 2149 | 巷 2150 | 眠 2151 | 帽 2152 | 惑 2153 | 羊 2154 | 牵 2155 | 净 2156 | 厉 2157 | 撞 2158 | 崇 2159 | 竞 2160 | 回 2161 | 乙 2162 | 聪 2163 | 桃 2164 | 伍 2165 | 役 2166 | 潮 2167 | 损 2168 | 凯 2169 | 锁 2170 | 震 2171 | 醉 2172 | 屁 2173 | 牠 2174 | 孙 2175 | 酷 2176 | 染 2177 | 尺 2178 | 摸 2179 | 盛 2180 | 闹 2181 | 棋 2182 | 吓 2183 | 迫 2184 | 瓜 2185 | 松 2186 | 搬 2187 | 戴 2188 | 瞭 2189 | 乌 2190 | 谱 2191 | 滚 2192 | 赚 2193 | 障 2194 | 逃 2195 | 齐 2196 | 牙 2197 | 怨 2198 | 拖 2199 | 皇 2200 | 贺 2201 | 横 2202 | 塞 2203 | 摆 2204 | 农 2205 | 倍 2206 | 额 2207 | 乘 2208 | 户 2209 | 奈 2210 | 川 2211 | 徐 2212 | 井 2213 | 寝 2214 | 洞 2215 | 劳 2216 | 船 2217 | 域 2218 | 屋 2219 | 胖 2220 | 藉 2221 | 销 2222 | 拼 2223 | 桌 2224 | 忧 2225 | 违 2226 | 拟 2227 | 吵 2228 | 媒 2229 | 辩 2230 | 妙 2231 | 鸿 2232 | 恩 2233 | 映 2234 | 耳 2235 | 傻 2236 | 京 2237 | 搭 2238 | 残 2239 | 稍 2240 | 颜 2241 | 固 2242 | 眉 2243 | 龟 2244 | 哀 2245 | 发 2246 | 沈 2247 | 拨 2248 | 丁 2249 | 愁 2250 | 耐 2251 | 宪 2252 | 覆 2253 | 盟 2254 | 昭 2255 | 握 2256 | 萧 2257 | 延 2258 | 豆 2259 | 弱 2260 | 隆 2261 | 页 2262 | 烧 2263 | 遍 2264 | 距 2265 | 摩 2266 | 祖 2267 | 探 2268 | 倚 2269 | 寂 2270 | 阴 2271 | 悔 2272 | 库 2273 | 嘴 2274 | 沉 2275 | 伊 2276 | 暂 2277 | 霸 2278 | 喵 2279 | 频 2280 | 鼓 2281 | 冒 2282 | 鼠 2283 | 企 2284 | 副 2285 | 菜 2286 | 款 2287 | 忽 2288 | 尾 2289 | 租 2290 | 椰 2291 | 隔 2292 | 狼 2293 | 浮 2294 | 惠 2295 | 峰 2296 | 索 2297 | 芳 2298 | 摇 2299 | 洪 2300 | 伦 2301 | 骨 2302 | 吹 2303 | 郑 2304 | 哩 2305 | 珍 2306 | 纳 2307 | 零 2308 | 哲 2309 | 遭 2310 | 瓶 2311 | 亡 2312 | 振 2313 | 予 2314 | 村 2315 | 旅 2316 | 惨 2317 | 汽 2318 | 爸 2319 | 隐 2320 | 械 2321 | 寒 2322 | 危 2323 | 邮 2324 | 贝 2325 | 阶 2326 | 赖 2327 | 茶 2328 | 谊 2329 | 涛 2330 | 惯 2331 | 尘 2332 | 丝 2333 | 森 2334 | 询 2335 | 露 2336 | 稳 2337 | 桥 2338 | 夏 2339 | 哭 2340 | 坚 2341 | 籍 2342 | 厌 2343 | 苍 2344 | 析 2345 | 冰 2346 | 仙 2347 | 布 2348 | 箱 2349 | 脱 2350 | 贤 2351 | 途 2352 | 订 2353 | 财 2354 | 欧 2355 | 赢 2356 | 枢 2357 | 泪 2358 | 废 2359 | 钢 2360 | 渐 2361 | 泡 2362 | 刊 2363 | 肯 2364 | 恨 2365 | 砍 2366 | 抽 2367 | 股 2368 | 咧 2369 | 婆 2370 | 禁 2371 | 郎 2372 | 默 2373 | 符 2374 | 缩 2375 | 童 2376 | 绿 2377 | 骗 2378 | 辈 2379 | 尼 2380 | 届 2381 | 彼 2382 | 兮 2383 | 聚 2384 | 宇 2385 | 辛 2386 | 疯 2387 | 减 2388 | 米 2389 | 念 2390 | 降 2391 | 街 2392 | 临 2393 | 敏 2394 | 洗 2395 | 玉 2396 | 伴 2397 | 辅 2398 | 诺 2399 | 鸡 2400 | 侠 2401 | 健 2402 | 熊 2403 | 顶 2404 | 挑 2405 | 替 2406 | 豪 2407 | 掌 2408 | 饭 2409 | 银 2410 | 圆 2411 | 志 2412 | 休 2413 | 材 2414 | 灭 2415 | 烈 2416 | 爆 2417 | 透 2418 | 遗 2419 | 虚 
2420 | 醒 2421 | 货 2422 | 雅 2423 | 宏 2424 | 帅 2425 | 宫 2426 | 港 2427 | 偶 2428 | 丢 2429 | 篮 2430 | 凡 2431 | 瑞 2432 | 硕 2433 | 雪 2434 | 忠 2435 | 蔡 2436 | 插 2437 | 积 2438 | 乖 2439 | 挥 2440 | 抗 2441 | 察 2442 | 末 2443 | 盖 2444 | 厅 2445 | 移 2446 | 吸 2447 | 括 2448 | 笨 2449 | 孤 2450 | 译 2451 | 避 2452 | 秀 2453 | 富 2454 | 漂 2455 | 柔 2456 | 私 2457 | 围 2458 | 狮 2459 | 祝 2460 | 庆 2461 | 序 2462 | 拥 2463 | 洲 2464 | 徒 2465 | 借 2466 | 晓 2467 | 嘉 2468 | 诗 2469 | 淡 2470 | 束 2471 | 姓 2472 | 颗 2473 | 勇 2474 | 犯 2475 | 喝 2476 | 食 2477 | 镜 2478 | 偏 2479 | 猜 2480 | 层 2481 | 帐 2482 | 仅 2483 | 购 2484 | 衣 2485 | 申 2486 | 伯 2487 | 紧 2488 | 县 2489 | 婚 2490 | 季 2491 | 敬 2492 | 弃 2493 | 尊 2494 | 蛋 2495 | 鹰 2496 | 熟 2497 | 冠 2498 | 唯 2499 | 混 2500 | 藏 2501 | 河 2502 | 忍 2503 | 窗 2504 | 朝 2505 | 轮 2506 | 册 2507 | 乡 2508 | 敌 2509 | 散 2510 | 沙 2511 | 幻 2512 | 短 2513 | 略 2514 | 批 2515 | 游 2516 | 奖 2517 | 岛 2518 | 逢 2519 | 脸 2520 | 顾 2521 | 督 2522 | 协 2523 | 雷 2524 | 详 2525 | 穿 2526 | 慧 2527 | 巧 2528 | 罢 2529 | 呼 2530 | 暗 2531 | 贴 2532 | 纸 2533 | 歉 2534 | 郭 2535 | 努 2536 | 担 2537 | 蓝 2538 | 训 2539 | 享 2540 | 架 2541 | 济 2542 | 猪 2543 | 派 2544 | 均 2545 | 妈 2546 | 哦 2547 | 宣 2548 | 检 2549 | 鬼 2550 | 灯 2551 | 策 2552 | 梅 2553 | 启 2554 | 嘿 2555 | 洋 2556 | 伟 2557 | 萤 2558 | 磁 2559 | 啰 2560 | 付 2561 | 弄 2562 | 寄 2563 | 钟 2564 | 播 2565 | 险 2566 | 载 2567 | 赏 2568 | 汉 2569 | 块 2570 | 刀 2571 | 铭 2572 | 施 2573 | 卫 2574 | 弹 2575 | 售 2576 | 叶 2577 | 皆 2578 | 罪 2579 | 虎 2580 | 归 2581 | 毛 2582 | 昨 2583 | 荣 2584 | 律 2585 | 树 2586 | 奏 2587 | 注 2588 | 扁 2589 | 笔 2590 | 旁 2591 | 键 2592 | 制 2593 | 莫 2594 | 堆 2595 | 射 2596 | 承 2597 | 波 2598 | 皮 2599 | 释 2600 | 判 2601 | 含 2602 | 既 2603 | 退 2604 | 纪 2605 | 刻 2606 | 肉 2607 | 靠 2608 | 麻 2609 | 湖 2610 | 继 2611 | 诚 2612 | 姐 2613 | 益 2614 | 置 2615 | 惜 2616 | 艺 2617 | 尚 2618 | 纯 2619 | 骂 2620 | 琴 2621 | 漫 2622 | 援 2623 | 缺 2624 | 诸 2625 | 尤 2626 | 忆 2627 | 景 2628 | 府 2629 | 委 2630 | 刘 2631 | 绍 2632 | 虑 2633 | 暴 2634 | 草 2635 | 充 2636 | 授 2637 | 防 2638 | 素 2639 | 房 2640 | 搞 2641 | 典 2642 | 仔 2643 | 父 2644 | 吉 2645 | 招 2646 | 剑 2647 | 脚 2648 | 突 2649 | 牌 2650 | 餐 2651 | 仁 2652 | 酒 2653 | 礼 2654 | 巴 2655 | 丽 2656 | 亮 2657 | 恐 2658 | 述 2659 | 周 2660 | 杂 2661 | 旧 2662 | 套 2663 | 赵 2664 | 堂 2665 | 创 2666 | 母 2667 | 辑 2668 | 络 2669 | 俊 2670 | 毒 2671 | 威 2672 | 冷 2673 | 蛮 2674 | 普 2675 | 登 2676 | 微 2677 | 控 2678 | 爽 2679 | 香 2680 | 坐 2681 | 缘 2682 | 幕 2683 | 兰 2684 | 悲 2685 | 势 2686 | 午 2687 | 睡 2688 | 密 2689 | 垒 2690 | 警 2691 | 宗 2692 | 严 2693 | 阵 2694 | 江 2695 | 亚 2696 | 攻 2697 | 静 2698 | 抱 2699 | 啥 2700 | 急 2701 | 宿 2702 | 剧 2703 | 词 2704 | 忙 2705 | 牛 2706 | 吴 2707 | 陆 2708 | 维 2709 | 激 2710 | 增 2711 | 聊 2712 | 浪 2713 | 状 2714 | 良 --------------------------------------------------------------------------------