├── lib ├── dict │ ├── custom │ │ ├── mydict.dic │ │ ├── ext_stopword.dic │ │ └── single_word_low_freq.dic │ ├── preposition.dic │ ├── suffix.dic │ ├── stopword.dic │ ├── surname.dic │ └── quantifier.dic ├── DictSegment.js ├── config.js ├── LexemePath.js ├── consts.js ├── HitService.js ├── utils.js ├── Hit.js ├── Lexeme.js ├── TreeSet.js ├── QuickSortSet.js ├── Segmenter.js ├── CJKSegmenter.js ├── IKArbitrator.js ├── CN_QuantifierSegmenter.js ├── LetterSegmenter.js ├── Gruntfile.js ├── AnalyzeContext.js ├── CharacterUtil.js ├── AnalyzeService.js └── Dictionary.js ├── index.js ├── .travis.yml ├── test.js ├── package.json ├── README.md └── LICENSE /lib/dict/custom/mydict.dic: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lib/dict/custom/ext_stopword.dic: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /index.js: -------------------------------------------------------------------------------- 1 | module.exports = require('./lib/Segmenter'); -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: node_js 2 | node_js: 3 | - "0.10" 4 | - "0.11" 5 | - "0.12" 6 | - "iojs" 7 | -------------------------------------------------------------------------------- /lib/DictSegment.js: -------------------------------------------------------------------------------- 1 | 2 | var DictSegment = function(){ 3 | this.childrenMap = {}; 4 | this.storeSize = 0; 5 | this.nodeState = 0; 6 | }; 7 | 8 | module.exports = DictSegment; -------------------------------------------------------------------------------- /lib/dict/preposition.dic: -------------------------------------------------------------------------------- 1 | 不 2 | 也 3 | 了 4 | 仍 5 | 从 6 | 以 7 | 使 8 | 则 9 | 却 10 | 又 11 | 及 12 | 对 13 | 就 14 | 并 15 | 很 16 | 或 17 | 把 18 | 是 19 | 的 20 | 着 21 | 给 22 | 而 23 | 被 24 | 让 25 | 但 -------------------------------------------------------------------------------- /lib/config.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | // Analyzer 扩展配置 3 | ext_dict: [/*'./dict/custom/mydict.dic', './dict/custom/single_word_low_freq.dic'*/], 4 | 5 | // 用户可以在这里配置自己的扩展停止词字典 6 | ext_stopwords: [/*'./dict/custom/ext_stopword.dic'*/] 7 | }; -------------------------------------------------------------------------------- /lib/dict/suffix.dic: -------------------------------------------------------------------------------- 1 | 乡 2 | 井 3 | 亭 4 | 党 5 | 区 6 | 厅 7 | 县 8 | 园 9 | 塔 10 | 家 11 | 寺 12 | 局 13 | 巷 14 | 市 15 | 弄 16 | 所 17 | 斯基 18 | 楼 19 | 江 20 | 河 21 | 海 22 | 湖 23 | 省 24 | 维奇 25 | 署 26 | 苑 27 | 街 28 | 觀 29 | 观 30 | 诺夫 31 | 路 32 | 部 33 | 镇 34 | 阁 35 | 山 36 | 子 37 | 娃 -------------------------------------------------------------------------------- /lib/dict/stopword.dic: -------------------------------------------------------------------------------- 1 | a 2 | an 3 | and 4 | are 5 | as 6 | at 7 | be 8 | but 9 | by 10 | for 11 | if 12 | in 13 | into 14 | is 15 | it 16 | no 17 | not 18 | of 19 | on 20 | or 21 | such 22 | that 23 | the 24 | their 25 | then 26 | there 27 | these 28 | they 29 | this 30 | to 31 | was 32 | will 33 | with -------------------------------------------------------------------------------- 
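A minimal sketch of how the extension dictionaries above could be switched on. The paths mirror the commented-out defaults in lib/config.js, and passing them through opts.Config relies on Dictionary.initial(opts) falling back to opts.Config || config (see lib/Dictionary.js later in this listing). How the library resolves these relative paths is not shown here, and the segmentation entry point itself is omitted, so only construction is illustrated.

// index.js re-exports lib/Segmenter
var Segmenter = require('./index');

// Hypothetical configuration override; field names follow lib/config.js.
var segmenter = new Segmenter({
  Config: {
    ext_dict: ['./dict/custom/mydict.dic', './dict/custom/single_word_low_freq.dic'],
    ext_stopwords: ['./dict/custom/ext_stopword.dic']
  }
});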
/lib/LexemePath.js: -------------------------------------------------------------------------------- 1 | var util = require('util'); 2 | var QuickSortSet = require('./QuickSortSet'); 3 | 4 | /** 5 | * Lexeme链(路径) 6 | */ 7 | var LexemePath = function(){ 8 | QuickSortSet.call(this); 9 | this.pathBegin = -1; //起始位置 10 | this.pathEnd = -1; //结束 11 | this.payloadLength = 0; //词元链的有效字符长度 12 | }; 13 | 14 | util.inherits(LexemePath, QuickSortSet); 15 | 16 | module.exports = LexemePath; 17 | -------------------------------------------------------------------------------- /lib/consts.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | // LexemeType常量 3 | LexemeType: { 4 | TYPE_UNKNOWN: 0, // 未知 5 | TYPE_ENGLISH: 1, // 英文 6 | TYPE_ARABIC: 2, // 数字 7 | TYPE_LETTER: 3, // 英文数字混合 8 | TYPE_CNWORD: 4, // 中文词元 9 | TYPE_CNCHAR: 64, // 中文单字 10 | TYPE_OTHER_CJK: 8, // 日韩文字 11 | TYPE_CNUM: 16, // 中文数词 12 | TYPE_COUNT: 32, // 中文量词 13 | TYPE_CQUAN: 48 // 中文数量词 14 | }, 15 | CharType: { 16 | CHAR_USELESS: 0, 17 | CHAR_ARABIC: 1, 18 | CHAR_ENGLISH: 2, 19 | CHAR_CHINESE: 4, 20 | CHAR_OTHER_CJK: 8 21 | } 22 | }; -------------------------------------------------------------------------------- /lib/HitService.js: -------------------------------------------------------------------------------- 1 | 2 | var UNMATCH = 0, 3 | MATCH = 1, 4 | PREFIX = 2; 5 | 6 | module.exports = { 7 | 8 | isMatch: function(hit){ 9 | return hit.hitState & MATCH > 0; 10 | }, 11 | 12 | setMatch: function(hit){ 13 | hit.hitState = hit.hitState | MATCH; 14 | }, 15 | 16 | isPrefix: function(hit){ 17 | return (hit.hitState & PREFIX) > 0; 18 | }, 19 | 20 | setPrefix: function(hit){ 21 | hit.hitState = hit.hitState | PREFIX; 22 | }, 23 | 24 | isUnmatch: function(hit){ 25 | return hit.hitState === UNMATCH; 26 | }, 27 | 28 | setUnmatch: function(hit){ 29 | hit.hitState = UNMATCH; 30 | } 31 | }; -------------------------------------------------------------------------------- /lib/utils.js: -------------------------------------------------------------------------------- 1 | var utils = { 2 | arrayFind: function(arr, key, val){ 3 | var found = null; 4 | for(var i=0;i 0; 19 | }; 20 | 21 | Hit.prototype.setMatch = function(){ 22 | this.hitState = this.hitState | MATCH; 23 | }; 24 | 25 | Hit.prototype.isPrefix = function(){ 26 | return (this.hitState & PREFIX) > 0; 27 | }; 28 | 29 | Hit.prototype.setPrefix = function(){ 30 | this.hitState = this.hitState | PREFIX; 31 | }; 32 | 33 | Hit.prototype.isUnmatch = function(){ 34 | return this.hitState === UNMATCH; 35 | }; 36 | 37 | Hit.prototype.setUnmatch = function(){ 38 | this.hitState = UNMATCH; 39 | };*/ 40 | -------------------------------------------------------------------------------- /lib/Lexeme.js: -------------------------------------------------------------------------------- 1 | 2 | var Lexeme = function(offset, begin, len, lexemeType){ 3 | this.offset = offset; 4 | this.begin = begin; 5 | if (len < 0){ 6 | throw new Error("Lexeme len < 0"); 7 | } 8 | this.len = len; 9 | this.lexemeType = lexemeType; 10 | }; 11 | 12 | module.exports = Lexeme; 13 | 14 | /** 15 | * 获取词元的文本内容 16 | * @return String 17 | */ 18 | /*Lexeme.prototype.getLexemeText = function() { 19 | return this.lexemeText || ''; 20 | }; 21 | */ 22 | /*Lexeme.prototype.setLexemeText = function(lexemeText) { 23 | if (!lexemeText){ 24 | this.lexemeText = ""; 25 | this.len = 0; 26 | } 27 | else{ 28 | this.lexemeText = lexemeText; 29 | this.len = lexemeText.length; 30 | } 31 | };*/ 32 | 
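/*
 * Illustration only (not part of this module): how the bit flags used by
 * Hit / HitService above combine. UNMATCH (0), MATCH (1) and PREFIX (2) are
 * independent bits, so a dictionary hit can be a complete word and, at the
 * same time, the prefix of a longer word. Because `>` binds tighter than `&`
 * in JavaScript, the bitwise tests need the parentheses used in isPrefix:
 *
 *   var MATCH = 1, PREFIX = 2;
 *   var hitState = 0;
 *   hitState |= PREFIX;            // prefix of a longer dictionary word
 *   (hitState & PREFIX) > 0;       // true  -- the intended check
 *   hitState & PREFIX > 0;         // 0     -- parsed as hitState & (PREFIX > 0)
 *
 * A Lexeme, by contrast, only records a span: new Lexeme(offset, begin, len,
 * type) covers the characters [offset + begin, offset + begin + len) of the
 * original text, as computed by LexemeService.getBeginPosition/getEndPosition
 * in lib/AnalyzeService.js.
 */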
33 | /** 34 | * 合并两个相邻的词元 35 | * @param l 36 | * @param lexemeType 37 | * @return boolean 词元是否成功合并 38 | */ 39 | /*Lexeme.prototype.append = function(l, lexemeType){ 40 | if (l && this.getEndPosition() === l.getBeginPosition()){ 41 | this.len += l.len; 42 | this.lexemeType = lexemeType; 43 | return true; 44 | } 45 | else { 46 | return false; 47 | } 48 | };*/ 49 | -------------------------------------------------------------------------------- /lib/dict/surname.dic: -------------------------------------------------------------------------------- 1 | 丁 2 | 万 3 | 万俟 4 | 上官 5 | 东方 6 | 乔 7 | 于 8 | 令狐 9 | 仲孙 10 | 任 11 | 何 12 | 余 13 | 候 14 | 傅 15 | 公冶 16 | 公孙 17 | 公羊 18 | 冯 19 | 刘 20 | 单 21 | 单于 22 | 卢 23 | 史 24 | 叶 25 | 司徒 26 | 司空 27 | 司马 28 | 吕 29 | 吴 30 | 周 31 | 唐 32 | 夏 33 | 夏侯 34 | 太叔 35 | 姚 36 | 姜 37 | 孔 38 | 孙 39 | 孟 40 | 宇文 41 | 宋 42 | 宗政 43 | 尉迟 44 | 尹 45 | 崔 46 | 常 47 | 康 48 | 廖 49 | 张 50 | 彭 51 | 徐 52 | 慕容 53 | 戴 54 | 文 55 | 方 56 | 易 57 | 曹 58 | 曾 59 | 朱 60 | 李 61 | 杜 62 | 杨 63 | 林 64 | 梁 65 | 欧阳 66 | 武 67 | 段 68 | 毛 69 | 江 70 | 汤 71 | 沈 72 | 淳于 73 | 潘 74 | 澹台 75 | 濮阳 76 | 熊 77 | 王 78 | 田 79 | 申屠 80 | 白 81 | 皇甫 82 | 石 83 | 秦 84 | 程 85 | 罗 86 | 肖 87 | 胡 88 | 苏 89 | 范 90 | 董 91 | 蒋 92 | 薛 93 | 袁 94 | 许 95 | 诸葛 96 | 谢 97 | 谭 98 | 贺 99 | 贾 100 | 赖 101 | 赫连 102 | 赵 103 | 轩辕 104 | 邓 105 | 邱 106 | 邵 107 | 邹 108 | 郑 109 | 郝 110 | 郭 111 | 金 112 | 钟 113 | 钟离 114 | 钱 115 | 长孙 116 | 闻人 117 | 闾丘 118 | 阎 119 | 陆 120 | 陈 121 | 雷 122 | 韩 123 | 顾 124 | 马 125 | 高 126 | 魏 127 | 鲜于 128 | 黄 129 | 黎 130 | 龙 131 | 龚 -------------------------------------------------------------------------------- /lib/TreeSet.js: -------------------------------------------------------------------------------- 1 | /* 2 | * 实现Java的TreeSet类 3 | 给Set集合中的元素进行元素compareTo指定方式的排序。 4 | 保证元素唯一性的方式:通过元素compareTo比较是否相同. 
5 | 底层数据结构是:二叉树。 6 | */ 7 | var AnalyzeService = require('./AnalyzeService'), 8 | LexemePathService = AnalyzeService.LexemePathService; 9 | 10 | var TreeSet = function() { 11 | this.arr = []; 12 | }; 13 | 14 | module.exports = TreeSet; 15 | 16 | TreeSet.prototype.add = function(lexemePath){ 17 | /* this.arr.push(elem); 18 | this.arr = this.arr.sort(function(x, y){ 19 | return x.compareTo(y); 20 | });*/ 21 | if (this.arr.length === 0){ 22 | this.arr.push(lexemePath); 23 | return null; 24 | } 25 | var headList = [], head, compRes; 26 | head = this.arr.shift(); 27 | compRes = LexemePathService.compare(lexemePath, head); 28 | if (compRes === 0){ // 与头部相同,不放入集合 29 | this.arr.unshift(head); 30 | return null; 31 | } 32 | else if (compRes < 0){ // 插入头部 33 | this.arr.unshift(head); 34 | this.arr.unshift(lexemePath); 35 | return null; 36 | } 37 | else{ //从头部往下插 38 | headList.push(head); 39 | var arr = this.add(lexemePath); 40 | if (!arr) { 41 | this.arr = headList.concat(this.arr); 42 | return null; 43 | } 44 | else{ 45 | headList.concat(arr); 46 | return headList; 47 | } 48 | } 49 | }; 50 | 51 | TreeSet.prototype.first = function(){ 52 | if (this.arr.length > 0){ 53 | return this.arr[0]; 54 | } 55 | else{ 56 | return null; 57 | } 58 | }; 59 | 60 | -------------------------------------------------------------------------------- /lib/QuickSortSet.js: -------------------------------------------------------------------------------- 1 | /** 2 | * IK分词器专用的Lexem快速排序集合 3 | */ 4 | 5 | 6 | var QuickSortSet = function(){ 7 | this.lexemeList = []; 8 | }; 9 | 10 | /** 11 | * 向链表集合添加词元 12 | * @param lexeme 13 | */ 14 | /*QuickSortSet.prototype.addLexeme = function(lexeme){ 15 | if (this.lexemeList.length === 0){ 16 | this.lexemeList.push(lexeme); 17 | return null; 18 | } 19 | var tailList = [], tail, compRes; 20 | tail = SortedSetService.pollLast(this); // 比 this.lexemeList.pop(); 快 21 | compRes = LexemeService.compare(tail, lexeme); 22 | if (compRes === 0){ // 词元与尾部词元相同,不放入集合 23 | this.lexemeList.push(tail); 24 | return null; 25 | } 26 | else if (compRes < 0){ // 词元接入链表尾部 27 | this.lexemeList.push(tail); 28 | this.lexemeList.push(lexeme); 29 | return null; 30 | } 31 | else{ //从尾部上逆 32 | tailList.unshift(tail); 33 | var arr = this.addLexeme(lexeme); 34 | if (!arr) { 35 | this.lexemeList.concat(tailList); 36 | return null; 37 | } 38 | else{ 39 | tailList = arr.concat(tailList); 40 | return tailList; 41 | } 42 | } 43 | };*/ 44 | 45 | /** 46 | * 返回链表头部元素 47 | * @return 48 | */ 49 | QuickSortSet.prototype.peekFirst = function(){ 50 | if (this.lexemeList.length > 0){ 51 | return this.lexemeList[0]; 52 | } 53 | return null; 54 | }; 55 | 56 | /** 57 | * 取出链表集合的第一个元素 58 | * @return Lexeme 59 | */ 60 | QuickSortSet.prototype.pollFirst = function(){ 61 | if (this.lexemeList.length > 0){ 62 | return this.lexemeList.shift(); 63 | } 64 | return null; 65 | }; 66 | 67 | /** 68 | * 返回链表尾部元素 69 | * @return 70 | */ 71 | QuickSortSet.prototype.peekLast = function(){ 72 | var idx = this.lexemeList.length - 1; 73 | if (idx >= 0){ 74 | return this.lexemeList[idx]; 75 | } 76 | return null; 77 | }; 78 | 79 | /** 80 | * 取出链表集合的最后一个元素 81 | * @return Lexeme 82 | */ 83 | QuickSortSet.prototype.pollLast = function(){ 84 | return this.lexemeList.pop(); 85 | }; 86 | 87 | module.exports = QuickSortSet; -------------------------------------------------------------------------------- /lib/Segmenter.js: -------------------------------------------------------------------------------- 1 | var Dictionary = require('./Dictionary'), 2 | AnalyzeContext 
= require('./AnalyzeContext'), 3 | IKArbitrator = require('./IKArbitrator'), 4 | LetterSegmenter = require('./LetterSegmenter'), 5 | CN_QuantifierSegmenter = require('./CN_QuantifierSegmenter'), 6 | CJKSegmenter = require('./CJKSegmenter'); 7 | 8 | var Segmenter = function(opts){ 9 | this.opts = opts || {}; 10 | 11 | this.init(); 12 | }; 13 | 14 | module.exports = Segmenter; 15 | 16 | Segmenter.prototype.init = function(){ 17 | //初始化词典单例 18 | Dictionary.initial(this.opts); 19 | //初始化分词上下文 20 | this.context = new AnalyzeContext(); 21 | //加载子分词器 22 | this.segmenters = this.loadSegmenters(); 23 | //加载歧义裁决器 24 | this.arbitrator = new IKArbitrator(); 25 | }; 26 | 27 | /** 28 | * 初始化词典,加载子分词器实现 29 | * @return List 30 | */ 31 | Segmenter.prototype.loadSegmenters = function(){ 32 | var segmenters = []; 33 | //处理字母的子分词器 34 | segmenters.push(new LetterSegmenter()); 35 | //处理中文数量词的子分词器 36 | segmenters.push(new CN_QuantifierSegmenter()); 37 | //处理中文词的子分词器 38 | segmenters.push(new CJKSegmenter()); 39 | 40 | return segmenters; 41 | }; 42 | 43 | /** 44 | * 重置分词器到初始状态 45 | * @param input 46 | */ 47 | Segmenter.prototype.reset = function(input) { 48 | this.input = input; 49 | this.context.reset(); 50 | this.context.fillBuffer(input); 51 | var segmenter; 52 | for (var i=0;i 0){ 30 | //处理词段队列 31 | var hit, tmpArray = []; 32 | for (var i=0;i 0){ 87 | c = lexemeStack.pop(); 88 | //回滚词元链 89 | this.backPath(crossPath, c/*, option*/); 90 | //从歧义词位置开始,递归,生成可选方案 91 | this.forwardPath(crossPath, c, option); 92 | pathOptions.add(option); 93 | } 94 | 95 | //返回集合中的最优方案 96 | return pathOptions.first(); 97 | }; 98 | 99 | /** 100 | * 向前遍历,添加词元,构造一个无歧义词元组合 101 | // * @param LexemePath path 102 | * @return 103 | */ 104 | IKArbitrator.prototype.forwardPath = function(crossPath, lexeme, option){ 105 | //发生冲突的Lexeme栈 106 | var conflictStack = []; 107 | var c = lexeme; 108 | //迭代遍历Lexeme链表 109 | while(c){ 110 | if (!LexemePathService.addNotCrossLexeme(option, c)){ 111 | //词元交叉,添加失败则加入lexemeStack栈 112 | conflictStack.push(c); 113 | } 114 | c = LexemePathService.getNextLexeme(crossPath, c); 115 | } 116 | return conflictStack; 117 | }; 118 | 119 | /** 120 | * 回滚词元链,直到它能够接受指定的词元 121 | // * @param lexeme 122 | * @param l 123 | */ 124 | IKArbitrator.prototype.backPath = function(crossPath, l/*, option*/){ 125 | while(LexemePathService.checkCross(crossPath, l)){ 126 | LexemePathService.removeTail(crossPath); 127 | } 128 | }; 129 | -------------------------------------------------------------------------------- /lib/CN_QuantifierSegmenter.js: -------------------------------------------------------------------------------- 1 | var Dictionary = require('./Dictionary'); 2 | 3 | var Lexeme = require('./Lexeme'), 4 | consts = require('./consts'), 5 | LexemeType = consts.LexemeType, 6 | CharType = consts.CharType, 7 | HitService = require('./HitService'), 8 | SortedSetService = require('./AnalyzeService').SortedSetService; 9 | 10 | var SEGMENTER_NAME = "QUAN_SEGMENTER", //子分词器标签 11 | Chn_Num = "0123456789〇一二两三四五六七八九十零壹贰叁肆伍陆柒捌玖拾百佰千仟万萬亿億拾佰仟萬亿億兆卅廿"; //中文数词 12 | 13 | /** 14 | * 15 | * 中文数量词子分词器 16 | */ 17 | var CN_QuantifierSegmenter = function() { 18 | // this.name = SEGMENTER_NAME; 19 | // this.ChnNumberChars = Chn_Num; 20 | 21 | /* 22 | * 词元的开始位置, 23 | * 同时作为子分词器状态标识 24 | * 当start > -1 时,标识当前的分词器正在处理字符 25 | */ 26 | this.nStart = -1; 27 | /* 28 | * 记录词元结束位置 29 | * end记录的是在词元中最后一个出现的合理的数词结束 30 | */ 31 | this.nEnd = -1; 32 | 33 | //待处理的量词hit队列 34 | this.countHits = []; 35 | }; 36 | 37 | module.exports = CN_QuantifierSegmenter; 38 | 39 | /** 40 | 
* 分词 41 | */ 42 | CN_QuantifierSegmenter.prototype.analyze = function(context) { 43 | //处理中文数词 44 | this.processCNumber(context); 45 | //处理中文量词 46 | this.processCount(context); 47 | 48 | //判断是否锁定缓冲区 49 | if (this.nStart === -1 && this.nEnd === -1 && this.countHits.length === 0){ 50 | //对缓冲区解锁 51 | context.unlockBuffer(SEGMENTER_NAME); 52 | } 53 | else{ 54 | context.lockBuffer(SEGMENTER_NAME); 55 | } 56 | }; 57 | 58 | /** 59 | * 重置子分词器状态 60 | */ 61 | CN_QuantifierSegmenter.prototype.reset = function() { 62 | this.nStart = -1; 63 | this.nEnd = -1; 64 | this.countHits = []; 65 | }; 66 | 67 | /** 68 | * 处理数词 69 | */ 70 | CN_QuantifierSegmenter.prototype.processCNumber = function(context){ 71 | var charType = context.getCurrentCharType(); 72 | if (this.nStart === -1 && this.nEnd === -1){//初始状态 73 | if ((CharType.CHAR_CHINESE === charType || CharType.CHAR_ARABIC === charType) && 74 | Chn_Num.indexOf(context.getCurrentChar()) >= 0){ 75 | //记录数词的起始、结束位置 76 | this.nStart = context.cursor; 77 | this.nEnd = context.cursor; 78 | } 79 | } 80 | else{//正在处理状态 81 | if ((CharType.CHAR_CHINESE === charType || CharType.CHAR_ARABIC === charType) && 82 | Chn_Num.indexOf(context.getCurrentChar()) >= 0){ 83 | //记录数词的结束位置 84 | this.nEnd = context.cursor; 85 | } 86 | else{ 87 | //输出数词 88 | this.outputNumLexeme(context); 89 | //重置头尾指针 90 | this.nStart = -1; 91 | this.nEnd = -1; 92 | } 93 | } 94 | 95 | //缓冲区已经用完,还有尚未输出的数词 96 | /* if (context.isBufferConsumed()){ 97 | if (this.nStart !== -1 && this.nEnd !== -1){ 98 | //输出数词 99 | this.outputNumLexeme(context); 100 | //重置头尾指针 101 | this.nStart = -1; 102 | this.nEnd = -1; 103 | } 104 | } */ 105 | }; 106 | 107 | /** 108 | * 处理中文量词 109 | * @param context 110 | */ 111 | CN_QuantifierSegmenter.prototype.processCount = function(context){ 112 | // 判断是否需要启动量词扫描 113 | if (!this.needCountScan(context)){ 114 | var l = context.orgLexemes.peekLast(); 115 | return; 116 | } 117 | 118 | if (CharType.CHAR_CHINESE === context.getCurrentCharType()){ 119 | //优先处理countHits中的hit 120 | var hit, tmpArray = []; 121 | for(var i=0;i 0){ 180 | //正在处理中文数词,或者正在处理量词 181 | return true; 182 | } 183 | else{ 184 | //找到一个相邻的数词 185 | if (context.orgLexemes.lexemeList.length > 0){ 186 | var l = context.orgLexemes.peekLast(); 187 | if (l && (LexemeType.TYPE_CNUM === l.lexemeType || LexemeType.TYPE_ARABIC === l.lexemeType)){ 188 | if (l.begin + l.len === context.cursor){ 189 | return true; 190 | } 191 | } 192 | } 193 | } 194 | return false; 195 | }; 196 | 197 | /** 198 | * 添加数词词元到结果集 199 | * @param context 200 | */ 201 | CN_QuantifierSegmenter.prototype.outputNumLexeme = function(context){ 202 | if (this.nStart > -1 && this.nEnd > -1){ 203 | //输出数词 204 | var newLexeme = new Lexeme(context.buffOffset, this.nStart, this.nEnd - this.nStart + 1, LexemeType.TYPE_CNUM); 205 | SortedSetService.addLexeme(context.orgLexemes, newLexeme); 206 | } 207 | }; 208 | -------------------------------------------------------------------------------- /lib/LetterSegmenter.js: -------------------------------------------------------------------------------- 1 | 2 | var Lexeme = require('./Lexeme'), 3 | consts = require('./consts'), 4 | LexemeType = consts.LexemeType, 5 | CharType = consts.CharType, 6 | SortedSetService = require('./AnalyzeService').SortedSetService; 7 | 8 | var SEGMENTER_NAME = "LETTER_SEGMENTER", //子分词器标签 9 | Letter_Connector = ['#', '&', '+', '-', '.', '@', '_'].sort(), //链接符号 10 | Num_Connector = [',', '.'].sort(); //数字符号 11 | 12 | /** 13 | * 14 | * 英文字符及阿拉伯数字子分词器 15 | */ 16 | var LetterSegmenter = function() { 
17 | // this.name = SEGMENTER_NAME; 18 | /* 19 | * 词元的开始位置, 20 | * 同时作为子分词器状态标识 21 | * 当start > -1 时,标识当前的分词器正在处理字符 22 | */ 23 | this.start = -1; 24 | /* 25 | * 记录词元结束位置 26 | * end记录的是在词元中最后一个出现的Letter但非Sign_Connector的字符的位置 27 | */ 28 | this.end = -1; 29 | 30 | /* 31 | * 字母起始位置 32 | */ 33 | this.englishStart = -1; 34 | 35 | /* 36 | * 字母结束位置 37 | */ 38 | this.englishEnd = -1; 39 | 40 | /* 41 | * 阿拉伯数字起始位置 42 | */ 43 | this.arabicStart = -1; 44 | 45 | /* 46 | * 阿拉伯数字结束位置 47 | */ 48 | this.arabicEnd = -1; 49 | }; 50 | 51 | module.exports = LetterSegmenter; 52 | 53 | /* (non-Javadoc) 54 | * @see org.wltea.analyzer.core.ISegmenter#analyze(org.wltea.analyzer.core.AnalyzeContext) 55 | */ 56 | LetterSegmenter.prototype.analyze = function(context) { 57 | var bufferLockFlag = false; 58 | //处理英文字母 59 | bufferLockFlag = this.processEnglishLetter(context) || bufferLockFlag; 60 | //处理阿拉伯字母 61 | bufferLockFlag = this.processArabicLetter(context) || bufferLockFlag; 62 | //处理混合字母(这个要放最后处理,可以通过QuickSortSet排除重复) 63 | bufferLockFlag = this.processMixLetter(context) || bufferLockFlag; 64 | 65 | //判断是否锁定缓冲区 66 | if (bufferLockFlag){ 67 | context.lockBuffer(SEGMENTER_NAME); 68 | } 69 | else{ 70 | //对缓冲区解锁 71 | context.unlockBuffer(SEGMENTER_NAME); 72 | } 73 | }; 74 | 75 | /* (non-Javadoc) 76 | * @see org.wltea.analyzer.core.ISegmenter#reset() 77 | */ 78 | LetterSegmenter.prototype.reset = function() { 79 | this.start = -1; 80 | this.end = -1; 81 | this.englishStart = -1; 82 | this.englishEnd = -1; 83 | this.arabicStart = -1; 84 | this.arabicEnd = -1; 85 | }; 86 | 87 | /** 88 | * 处理数字字母混合输出 89 | * 如:windos2000 | linliangyi2005@gmail.com 90 | // * @param input 91 | * @param context 92 | * @return 93 | */ 94 | LetterSegmenter.prototype.processMixLetter = function(context){ 95 | var needLock = false, charType = context.getCurrentCharType(); 96 | 97 | if (this.start === -1){//当前的分词器尚未开始处理字符 98 | if (CharType.CHAR_ARABIC === charType || 99 | CharType.CHAR_ENGLISH === charType){ 100 | //记录起始指针的位置,标明分词器进入处理状态 101 | this.start = context.cursor; 102 | this.end = this.start; 103 | } 104 | } 105 | else {//当前的分词器正在处理字符 106 | if (CharType.CHAR_ARABIC === charType || 107 | CharType.CHAR_ENGLISH === charType){ 108 | //记录下可能的结束位置 109 | this.end = context.cursor; 110 | } 111 | else if (CharType.CHAR_USELESS === charType && 112 | this.isLetterConnector(context.getCurrentChar())){ 113 | //记录下可能的结束位置 114 | this.end = context.cursor; 115 | } 116 | else{ 117 | //遇到非Letter字符,输出词元 118 | var newLexeme = new Lexeme(context.buffOffset, this.start, this.end - this.start + 1, LexemeType.TYPE_LETTER); 119 | SortedSetService.addLexeme(context.orgLexemes, newLexeme); 120 | this.start = -1; 121 | this.end = -1; 122 | } 123 | } 124 | 125 | //判断缓冲区是否已经读完 126 | /* if (context.isBufferConsumed()){ 127 | if (this.start !== -1 && this.end !== -1){ 128 | //缓冲以读完,输出词元 129 | var newLexeme = new Lexeme(context.buffOffset, this.start, this.end - this.start + 1, LexemeType.TYPE_LETTER); 130 | SortedSetService.addLexeme(context.orgLexemes, newLexeme); 131 | this.start = -1; 132 | this.end = -1; 133 | } 134 | }*/ 135 | 136 | //判断是否锁定缓冲区 137 | if (this.start === -1 && this.end === -1){ 138 | //对缓冲区解锁 139 | needLock = false; 140 | }else{ 141 | needLock = true; 142 | } 143 | return needLock; 144 | }; 145 | 146 | /** 147 | * 处理纯英文字母输出 148 | * @param context 149 | * @return 150 | */ 151 | LetterSegmenter.prototype.processEnglishLetter = function(context){ 152 | var needLock = false; 153 | 154 | if (this.englishStart === -1){//当前的分词器尚未开始处理英文字符 155 | if 
(CharType.CHAR_ENGLISH === context.getCurrentCharType()){ 156 | //记录起始指针的位置,标明分词器进入处理状态 157 | this.englishStart = context.cursor; 158 | this.englishEnd = this.englishStart; 159 | } 160 | } 161 | else {//当前的分词器正在处理英文字符 162 | if (CharType.CHAR_ENGLISH === context.getCurrentCharType()){ 163 | //记录当前指针位置为结束位置 164 | this.englishEnd = context.cursor; 165 | } 166 | else{ 167 | //遇到非English字符,输出词元 168 | var newLexeme = new Lexeme(context.buffOffset, this.englishStart, this.englishEnd - this.englishStart + 1, LexemeType.TYPE_ENGLISH); 169 | SortedSetService.addLexeme(context.orgLexemes, newLexeme); 170 | this.englishStart = -1; 171 | this.englishEnd= -1; 172 | } 173 | } 174 | 175 | //判断缓冲区是否已经读完 176 | /* if (context.isBufferConsumed()){ 177 | if (this.englishStart !== -1 && this.englishEnd !== -1){ 178 | //缓冲以读完,输出词元 179 | var newLexeme = new Lexeme(context.buffOffset, this.englishStart, this.englishEnd - this.englishStart + 1, LexemeType.TYPE_ENGLISH); 180 | SortedSetService.addLexeme(context.orgLexemes, newLexeme); 181 | 182 | this.englishStart = -1; 183 | this.englishEnd= -1; 184 | } 185 | } */ 186 | 187 | //判断是否锁定缓冲区 188 | if (this.englishStart === -1 && this.englishEnd === -1){ 189 | //对缓冲区解锁 190 | needLock = false; 191 | } 192 | else{ 193 | needLock = true; 194 | } 195 | return needLock; 196 | }; 197 | 198 | /** 199 | * 处理阿拉伯数字输出 200 | * @param context 201 | * @return 202 | */ 203 | LetterSegmenter.prototype.processArabicLetter = function(context){ 204 | var needLock = false; 205 | 206 | if (this.arabicStart === -1){//当前的分词器尚未开始处理数字字符 207 | if (CharType.CHAR_ARABIC === context.getCurrentCharType()){ 208 | //记录起始指针的位置,标明分词器进入处理状态 209 | this.arabicStart = context.cursor; 210 | this.arabicEnd = this.arabicStart; 211 | } 212 | } 213 | else {//当前的分词器正在处理数字字符 214 | if (CharType.CHAR_ARABIC === context.getCurrentCharType()){ 215 | //记录当前指针位置为结束位置 216 | this.arabicEnd = context.cursor; 217 | } 218 | else if (CharType.CHAR_USELESS === context.getCurrentCharType() && 219 | this.isNumConnector(context.getCurrentChar())){ 220 | //不输出数字,但不标记结束 221 | } 222 | else{ 223 | ////遇到非Arabic字符,输出词元 224 | var newLexeme = new Lexeme(context.buffOffset, this.arabicStart, this.arabicEnd - this.arabicStart + 1, LexemeType.TYPE_ARABIC); 225 | SortedSetService.addLexeme(context.orgLexemes, newLexeme); 226 | this.arabicStart = -1; 227 | this.arabicEnd = -1; 228 | } 229 | } 230 | 231 | //判断缓冲区是否已经读完 232 | /* if (context.isBufferConsumed()){ 233 | if (this.arabicStart !== -1 && this.arabicEnd !== -1){ 234 | //生成已切分的词元 235 | var newLexeme = new Lexeme(context.buffOffset, this.arabicStart, this.arabicEnd - this.arabicStart + 1, LexemeType.TYPE_ARABIC); 236 | SortedSetService.addLexeme(context.orgLexemes, newLexeme); 237 | this.arabicStart = -1; 238 | this.arabicEnd = -1; 239 | } 240 | }*/ 241 | 242 | //判断是否锁定缓冲区 243 | if (this.arabicStart === -1 && this.arabicEnd === -1){ 244 | //对缓冲区解锁 245 | needLock = false; 246 | } 247 | else{ 248 | needLock = true; 249 | } 250 | return needLock; 251 | }; 252 | 253 | /** 254 | * 判断是否是字母连接符号 255 | * @param input 256 | * @return 257 | */ 258 | LetterSegmenter.prototype.isLetterConnector = function(input){ 259 | var index = Letter_Connector.indexOf(input); 260 | return index >= 0; 261 | }; 262 | 263 | /** 264 | * 判断是否是数字连接符号 265 | * @param input 266 | * @return 267 | */ 268 | LetterSegmenter.prototype.isNumConnector = function(input){ 269 | var index = Num_Connector.indexOf(input); 270 | return index >= 0; 271 | }; 272 | 273 | 
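// Illustration only (not part of this module): a simplified, standalone
// version of the start/end scanning pattern used by processMixLetter above.
// start marks where a run of letters/digits begins, end advances while the
// run (including connector characters such as '@' or '.') continues, and a
// span is emitted once a non-letter character is met. This sketch ignores
// buffer locking, character regularization and the QuickSortSet
// de-duplication that the real segmenter relies on.
function sketchMixLetterSpans(text) {
  var connectors = ['#', '&', '+', '-', '.', '@', '_'];
  var spans = [], start = -1, end = -1;
  for (var i = 0; i < text.length; i++) {
    var ch = text[i];
    var isAlnum = /[0-9A-Za-z]/.test(ch);
    if (start === -1) {
      if (isAlnum) { start = i; end = i; }
    } else if (isAlnum || connectors.indexOf(ch) >= 0) {
      end = i;
    } else {
      spans.push({ begin: start, len: end - start + 1 });
      start = -1; end = -1;
    }
  }
  if (start !== -1) { spans.push({ begin: start, len: end - start + 1 }); }
  return spans;
}
// sketchMixLetterSpans('发邮件到 linliangyi2005@gmail.com 试试')
//   -> [ { begin: 5, len: 24 } ]   // the e-mail address as one LETTER-type span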
-------------------------------------------------------------------------------- /lib/Gruntfile.js: -------------------------------------------------------------------------------- 1 | module.exports = function(grunt) { 2 | 3 | // Project configuration. 4 | grunt.initConfig({ 5 | pkg: grunt.file.readJSON('package.json'), 6 | 7 | jshint: { 8 | // define the files to lint 9 | //files: ['**/*.js', '!node_modules/**/*'], 10 | files: [ 11 | './*.js', 12 | '!node_modules/**/*' 13 | ], 14 | // configure JSHint (documented at http://www.jshint.com/docs/) 15 | options: { 16 | force: true, 17 | reporter: 'checkstyle', 18 | reporterOutput: 'jshint.xml', 19 | // more options here if you want to override JSHint defaults 20 | globals: { 21 | jQuery: true, 22 | console: false, 23 | node: true, 24 | module: true, 25 | define: true, 26 | require: true, 27 | createjs: true, 28 | 29 | exports: true, 30 | $: true, 31 | window: true, 32 | pomelo: true, 33 | setImmediate: true, 34 | __filename: true, 35 | __dirname: true, 36 | setTimeout: true, 37 | setInterval: true, 38 | clearInterval: true, 39 | process: true, 40 | Buffer: true, 41 | actor: true, // pomelo-robot 42 | }, 43 | asi: false, 44 | // 如果是真,JSHint会无视没有加分号的行尾, 自动补全分号一直是Javascript很有争议的一个语法特性。默认,JSHint会要求你在每个语句后面加上分号,但是如果你认为自己理解了asi(automatic semicolon insertion),你可以抛弃JSHint对分号的检查。 45 | 46 | bitwise: false, 47 | //如果为真,JSHint会禁用位运算符 Javascript允许位运算,但是他却没有整型,位运算符要把参与运算的数字从浮点数变为整数,并在运算后再转换回来。这样他们的效率就不如在别的语言中那么高。 48 | 49 | boss: false, 50 | //很霸气的选项,如果为真,那么JSHint会允许在if,for,while里面编写赋值语句。 一般来说,我们会在循环、判断等语句中加入值的比较来做语句的运行条件,有时候会把==错写成赋值的=,通常,JSHint会把这个认定为一个错误,但是开启这个选项的化,JSHint就不会检查判断条件中的赋值 ,你是boss,你说的算:)。 51 | 52 | camelcase: false, 53 | // 强迫驼峰风格 54 | 55 | curly: true, 56 | //如果为真,JSHint会要求你在使用if和while等结构语句时加上{}来明确代码块。 Javascript允许在if等结构语句体只有一句的情况下不加括号。不过这样做可能会让你的代码读起来有些晦涩。 57 | 58 | debug: false, 59 | //如果为真,JSHint会允许代码中出现debugger的语句。不过建议你最好在检测代码前去掉debug的语句。 60 | 61 | eqeqeq: true, 62 | //如果为真,JSHint会看你在代码中是否都用了===或者是!==,而不是使用==和!=。 我们建议你在比较0,''(空字符),undefined,null,false和true的时候使用===和!===。 63 | 64 | eqnull: false, 65 | //如果为真,JSHint会允许使用"== null"作比较。 == null 通常用来判断一个变量是undefined或者是null(当时用==,null和undefined都会转化为false)。 66 | 67 | evil: false, 68 | //如果为真,JSHint会允许使用eval eval提供了访问Javascript编译器的途径,这有时很有用,但是同时也对你的代码形成了注入攻击的危险,并且会对debug造成一些困难。 记住,Function构造函数也是另一个‘eval’,另外,当传入的参数是字符串的时候,setTimeout和setInterval也会类似于eval。 69 | 70 | forin: false, 71 | //如果为真,那么,JSHint允许在for in 循环里面不出现hasOwnProperty, for in循环一般用来遍历一个对象的属性,这其中也包括他继承自原型链的属性,而hasOwnProperty可以来判断一个属性是否是对象本身的属性而不是继承得来的。 72 | 73 | immed: true, 74 | //如果为真,JSHint要求匿名函数的调用如下: 75 | 76 | //(function(){ // }()); 77 | //而不是 78 | 79 | //(function(){ //bla bla })(); 80 | 81 | // indent: 2, 82 | 83 | latedef: true, 84 | // This option prohibits the use of a variable before it was defined. JavaScript has function scope only and, in addition to that, all variables are always moved—or hoisted— to the top of the function. This behavior can lead to some very nasty bugs and that's why it is safer to always use variable only after they have been explicitly defined. 85 | // Setting this option to "nofunc" will allow function declarations to be ignored. 
86 | 87 | laxbreak: false, 88 | //如果为真,JSHint则不会检查换行。 Javascript会通过自动补充分号来修正一些错误,因此这个选项可以检查一些潜在的问题。 89 | 90 | maxerr: 10, 91 | ///设定错误的阈值,超过这个阈值jshint不再向下检查,提示错误太多。 92 | 93 | newcap: true, 94 | //如果为真,JSHint会要求每一个构造函数名都要大写字母开头。 构造器是一种使用new运算符来创建对象的一种函数,new操作符会创建新的对象,并建立这个对象自己的this,一个构造函数如果不用new运算符来运行,那么他的this会指向全局对象而导致一些问题的发生。 95 | 96 | noarg: true, 97 | //如果为真,JSHint会禁止arguments.caller和arguments.callee的使用 arguments对象是一个类数组的对象,它具有一个索引值。arguments.callee指向当前执行的函数(这个在ES5的严格模式中被禁用了),而arguments.caller指向调用当前函数的函数(如果有的话),并且,他并不是在所有的Javascript实现里面都有。 98 | 99 | noempty: true, 100 | //如果为真,JSHint会禁止出现空的代码块(没有语句的代码块)。 如果为真,JSHint会禁用构造器,以避免一些问题。 在JSLint中会主动禁用构造器的方式以避免一些潜在问题,但其实很多构造器的使用并非有害,例如如下的调用 101 | 102 | //new JsUIWindow(); //注意这个调用是没有把构造器的结果赋值给变量的 103 | //因此,我们需要使用构造器的时候可以禁用这个选项。 104 | 105 | nomen: false, 106 | //如果为真,JSHint会禁用下划线的变量名。 很多人使用_name的方式来命名他们的变量,以说明这是一个私有变量,但实际上,并不是,下划线只是做了一个标识。 如果要使用私有变量,可以使用闭包来实现。 107 | 108 | onevar: true, 109 | //如果为真,JSHint期望函数只被var的形式声明一遍。 110 | 111 | passfail: false, 112 | //如果为真,JSHint会在发现首个错误后停止检查。 113 | 114 | plusplus: false, 115 | //如果为真,JSHint会禁用自增运算和自减运算 ++和--可能会带来一些代码的阅读上的困惑。 116 | 117 | regexp: true, 118 | //如果为真,JSHint会不允许使用.和[^...]的正则, 因为这样的正则往往会匹配到你不期望的内容,并可能会应用造成一些危害。 119 | 120 | undef: true, 121 | //如果为真,JSHint会要求所有的非全局变量,在使用前都被声明。 如果你不在一个本地作用域内使用var的方式来声明变量,Javascript会把它放到全局作用域下面。这样会很容易引起错误。 122 | 123 | unused: true, 124 | 125 | sub: true, 126 | //如果为真,JSHint会允许各种形式的下标来访问对象。 通常,JSHint希望你只是用点运算符来读取对象的属性(除非这个属性名是一个保留字),如果你不希望这样可以关闭这个选项。 127 | 128 | strict: false, 129 | //如果为真,JSHint会要求你使用use strict;语法。 Strict 模式是ES5里面的一个新特性,他允许你把一个程序或者函数放在一个“严格”的作用域中。可见Resig写的一篇关于严格模式的blog 严格模式做了几件事情: 130 | 131 | //1、他可以捕获一些错误和异常 132 | 133 | //2、当我们进行一下“不安全”的操作时,他会抛异常,例如访问全局变量。 134 | 135 | //3、他会禁止你使用一些奇淫技巧,或者不良的代码编写。 136 | 137 | white: false, 138 | //如果为true,JSHint会依据严格的空白规范检查你的代码。 139 | 140 | funcscope: true, 141 | laxcomma: true, 142 | loopfunc: true, // 警告在循环内部定义函数 143 | multistr: true, // 警告多行字符串 144 | notypeof: true, // 警告不正确的错误类型,如 'function' 写成 'functin' 145 | shadow: true, // 警告多重定义变量 146 | smarttabs: true, 147 | validthis: true // 关于 this 的警告 148 | } 149 | 150 | } 151 | 152 | }); 153 | 154 | // Load the plugin that provides the "uglify" task. 155 | 156 | grunt.loadNpmTasks('grunt-contrib-jshint'); 157 | 158 | // Default task(s). 159 | grunt.registerTask('default', ['jshint']); 160 | 161 | }; 162 | 163 | 164 | //module.exports = function(grunt) { 165 | // 166 | // grunt.initConfig({ 167 | // pkg: grunt.file.readJSON('package.json'), 168 | // concat: { 169 | // options: { 170 | // separator: ';' 171 | // }, 172 | // dist: { 173 | // src: ['src/**/*.js'], 174 | // dest: 'dist/<%= pkg.name %>.js' 175 | // } 176 | // }, 177 | // uglify: { 178 | // options: { 179 | // banner: '/*! 
<%= pkg.name %> <%= grunt.template.today("dd-mm-yyyy") %> */\n' 180 | // }, 181 | // dist: { 182 | // files: { 183 | // 'dist/<%= pkg.name %>.min.js': ['<%= concat.dist.dest %>'] 184 | // } 185 | // } 186 | // }, 187 | // qunit: { 188 | // files: ['test/**/*.html'] 189 | // }, 190 | // jshint: { 191 | // files: ['gruntfile.js', 'src/**/*.js', 'test/**/*.js'], 192 | // options: { 193 | // // options here to override JSHint defaults 194 | // globals: { 195 | // jQuery: true, 196 | // console: true, 197 | // module: true, 198 | // document: true 199 | // } 200 | // } 201 | // }, 202 | // watch: { 203 | // files: ['<%= jshint.files %>'], 204 | // tasks: ['jshint', 'qunit'] 205 | // } 206 | // }); 207 | // 208 | // grunt.loadNpmTasks('grunt-contrib-uglify'); 209 | // grunt.loadNpmTasks('grunt-contrib-jshint'); 210 | // grunt.loadNpmTasks('grunt-contrib-qunit'); 211 | // grunt.loadNpmTasks('grunt-contrib-watch'); 212 | // grunt.loadNpmTasks('grunt-contrib-concat'); 213 | // 214 | // grunt.registerTask('test', ['jshint', 'qunit']); 215 | // 216 | // grunt.registerTask('default', ['jshint', 'qunit', 'concat', 'uglify']); 217 | // 218 | //}; 219 | -------------------------------------------------------------------------------- /lib/AnalyzeContext.js: -------------------------------------------------------------------------------- 1 | var Dictionary = require('./Dictionary'), 2 | QuickSortSet = require('./QuickSortSet'), 3 | AnalyzeService = require('./AnalyzeService'), 4 | SortedSetService = AnalyzeService.SortedSetService, 5 | LexemeService = AnalyzeService.LexemeService; 6 | var Lexeme = require('./Lexeme'), 7 | consts = require('./consts'), 8 | utils = require('./utils'), 9 | CharType = consts.CharType, 10 | LexemeType = consts.LexemeType; 11 | var CharacterUtil = require('./CharacterUtil'); 12 | 13 | var BUFF_EXHAUST_CRITICAL = 100; 14 | 15 | var AnalyzeContext = function(opts){ 16 | this.segmentBuff = ''; 17 | this.charTypes = []; 18 | this.buffLocker = {}; 19 | this.orgLexemes = new QuickSortSet(); 20 | this.pathMap = {}; 21 | this.results = []; 22 | this.totalReadCount = 0; // 总共读取了多少字符 23 | }; 24 | 25 | module.exports = AnalyzeContext; 26 | 27 | AnalyzeContext.prototype.getCurrentChar = function(){ 28 | return this.segmentBuff[this.cursor]; 29 | }; 30 | 31 | AnalyzeContext.prototype.getCurrentCharType = function(){ 32 | return this.charTypes[this.cursor]; 33 | }; 34 | 35 | /** 36 | * 根据context的上下文情况,填充segmentBuff 37 | * @param reader 38 | * @return 返回待分析的(有效的)字串长度 39 | */ 40 | AnalyzeContext.prototype.fillBuffer = function(txt){ 41 | this.segmentBuff = txt; 42 | this.cursor = 0; 43 | this.available = this.segmentBuff.length; 44 | return this.available; 45 | }; 46 | 47 | /** 48 | * 初始化buff指针,处理第一个字符 49 | */ 50 | AnalyzeContext.prototype.initCursor = function(){ 51 | this.cursor = 0; 52 | //this.segmentBuff[this.cursor] = CharacterUtil.regularize(this.segmentBuff[this.cursor]); 53 | //this.charTypes[this.cursor] = CharacterUtil.identifyCharType(this.segmentBuff[this.cursor]); 54 | var chr = CharacterUtil.regularize(this.segmentBuff[this.cursor]); 55 | this.charTypes[this.cursor] = CharacterUtil.identifyCharType(chr); 56 | 57 | }; 58 | 59 | /** 60 | * 指针+1 61 | * 成功返回 true; 指针已经到了buff尾部,不能前进,返回false 62 | * 并处理当前字符 63 | */ 64 | AnalyzeContext.prototype.moveCursor = function(){ 65 | if (this.cursor < this.available - 1){ 66 | this.cursor += 1; 67 | 68 | //this.segmentBuff[this.cursor] = CharacterUtil.regularize(this.segmentBuff[this.cursor]); 69 | //this.charTypes[this.cursor] = 
CharacterUtil.identifyCharType(this.segmentBuff[this.cursor]); 70 | var chr = CharacterUtil.regularize(this.segmentBuff[this.cursor]); 71 | this.charTypes[this.cursor] = CharacterUtil.identifyCharType(chr); 72 | return true; 73 | } 74 | else{ 75 | return false; 76 | } 77 | }; 78 | 79 | /** 80 | * 设置当前segmentBuff为锁定状态 81 | * 加入占用segmentBuff的子分词器名称,表示占用segmentBuff 82 | * @param segmenterName 83 | */ 84 | AnalyzeContext.prototype.lockBuffer = function(segmenterName){ 85 | this.buffLocker[segmenterName] = 1; 86 | }; 87 | 88 | /** 89 | * 移除指定的子分词器名,释放对segmentBuff的占用 90 | * @param segmenterName 91 | */ 92 | AnalyzeContext.prototype.unlockBuffer = function(segmenterName){ 93 | this.buffLocker[segmenterName] = 0; 94 | }; 95 | 96 | /** 97 | * 只要buffLocker中存在segmenterName 98 | * 则buffer被锁定 99 | * @return boolean 缓冲去是否被锁定 100 | */ 101 | AnalyzeContext.prototype.isBufferLocked = function(){ 102 | for(var k in this.buffLocker){ 103 | if (this.buffLocker[k]) { 104 | return true; 105 | } 106 | } 107 | return false; 108 | }; 109 | 110 | /** 111 | * 判断当前segmentBuff是否已经用完 112 | * 当前执针cursor移至segmentBuff末端this.available - 1 113 | * @return 114 | */ 115 | /*AnalyzeContext.prototype.isBufferConsumed = function(){ 116 | return this.cursor === this.available - 1; 117 | };*/ 118 | 119 | /** 120 | * 判断segmentBuff是否需要读取新数据 121 | * 122 | * 满足以下条件时, 123 | * 1.available == BUFF_SIZE 表示buffer满载 124 | * 2.buffIndex < available - 1 && buffIndex > available - BUFF_EXHAUST_CRITICAL表示当前指针处于临界区内 125 | * 3.!context.isBufferLocked()表示没有segmenter在占用buffer 126 | * 要中断当前循环(buffer要进行移位,并再读取数据的操作) 127 | * @return 128 | */ 129 | /*AnalyzeContext.prototype.needRefillBuffer = function(){ 130 | return (this.cursor < this.available - 1 && 131 | this.cursor > this.available - BUFF_EXHAUST_CRITICAL && 132 | !this.isBufferLocked()); 133 | };*/ 134 | 135 | /** 136 | * 累计当前的segmentBuff相对于reader起始位置的位移 137 | */ 138 | AnalyzeContext.prototype.markBufferOffset = function(){ 139 | this.buffOffset += this.cursor; 140 | }; 141 | 142 | /** 143 | * 向分词结果集添加词元 144 | * @param lexeme 145 | */ 146 | /*AnalyzeContext.prototype.addLexeme = function(lexeme){ 147 | this.orgLexemes.addLexeme(lexeme); 148 | };*/ 149 | 150 | /** 151 | * 添加分词结果路径 152 | * 路径起始位置 ---> 路径 映射表 153 | * @param path 154 | */ 155 | AnalyzeContext.prototype.addLexemePath = function(crossPath){ 156 | if (crossPath){ 157 | this.pathMap[crossPath.pathBegin] = crossPath; 158 | } 159 | }; 160 | 161 | /** 162 | * 推送分词结果到结果集合 163 | * 1.从buff头部遍历到this.cursor已处理位置 164 | * 2.将map中存在的分词结果推入results 165 | * 3.将map中不存在的CJDK字符以单字方式推入results 166 | */ 167 | AnalyzeContext.prototype.outputToResult = function(){ 168 | var lexeme; 169 | for(var index = 0;index <= this.cursor;){ 170 | 171 | //跳过非CJK字符 172 | if (CharType.CHAR_USELESS === this.charTypes[index]){ 173 | index ++; 174 | continue; 175 | } 176 | //从pathMap找出对应index位置的LexemePath 177 | var crosspath = this.pathMap[index]; 178 | if (crosspath){ 179 | //输出LexemePath中的lexeme到results集合 180 | lexeme = crosspath.pollFirst(); 181 | while (lexeme){ 182 | this.results.push(lexeme); 183 | //将index移至lexeme后 184 | index = lexeme.begin + lexeme.len; 185 | 186 | lexeme = crosspath.pollFirst(); 187 | 188 | if (lexeme){ 189 | //输出path内部,词元间遗漏的单字 190 | for(;index < lexeme.begin; index++){ 191 | this.outputSingleCJK(index); 192 | } 193 | } 194 | } 195 | } 196 | else{//pathMap中找不到index对应的LexemePath 197 | //单字输出 198 | this.outputSingleCJK(index); 199 | index++; 200 | } 201 | } 202 | //清空当前的Map 203 | this.pathMap = {}; 204 | 205 | var result = []; 206 | lexeme = 
this.results.shift(); 207 | while(lexeme){ 208 | this.compound(lexeme); 209 | result.push(this.segmentBuff.substr(lexeme.begin, lexeme.len)); 210 | 211 | lexeme = this.results.shift(); 212 | } 213 | /* this.results.forEach(function(v){ 214 | result.push(segmentBuff.substr(v.begin, v.len)); 215 | });*/ 216 | 217 | return result.join(' '); 218 | }; 219 | 220 | /** 221 | * 对CJK字符进行单字输出 222 | * @param index 223 | */ 224 | AnalyzeContext.prototype.outputSingleCJK = function(index){ 225 | if (CharType.CHAR_CHINESE === this.charTypes[index]){ 226 | var singleCharLexeme = new Lexeme(this.buffOffset, index, 1, LexemeType.TYPE_CNCHAR); 227 | this.results.push(singleCharLexeme); 228 | } 229 | else if(CharType.CHAR_OTHER_CJK === this.charTypes[index]){ 230 | var singleCharLexeme = new Lexeme(this.buffOffset , index , 1 , LexemeType.TYPE_OTHER_CJK); 231 | this.results.push(singleCharLexeme); 232 | } 233 | }; 234 | 235 | /** 236 | * 重置分词上下文状态 237 | */ 238 | AnalyzeContext.prototype.reset = function(){ 239 | this.buffLocker = {}; 240 | this.orgLexemes = new QuickSortSet(); 241 | this.available =0; 242 | this.buffOffset = 0; 243 | this.charTypes = []; 244 | this.cursor = 0; 245 | this.results = []; 246 | this.segmentBuff = []; 247 | this.pathMap = {}; 248 | }; 249 | 250 | /** 251 | * 组合词元 252 | */ 253 | AnalyzeContext.prototype.compound = function(lexeme){ 254 | //数量词合并处理 255 | var nextLexeme, appendOk = true; 256 | while (appendOk && this.results.length > 0){ 257 | appendOk = false; 258 | if (LexemeType.TYPE_ARABIC === lexeme.lexemeType){ 259 | nextLexeme = this.results[0]; 260 | if (LexemeType.TYPE_CNUM === nextLexeme.lexemeType){ 261 | //合并英文数词+中文数词 262 | appendOk = LexemeService.append(lexeme, nextLexeme, LexemeType.TYPE_CNUM); 263 | } 264 | else if (LexemeType.TYPE_COUNT === nextLexeme.lexemeType){ 265 | //合并英文数词+中文量词 266 | appendOk = LexemeService.append(lexeme, nextLexeme, LexemeType.TYPE_CQUAN); 267 | } 268 | if (appendOk){ 269 | //弹出 270 | this.results.shift(); 271 | } 272 | } 273 | 274 | //可能存在第二轮合并 275 | if (LexemeType.TYPE_CNUM === lexeme.lexemeType && this.results.length > 0){ 276 | nextLexeme = this.results[0]; 277 | appendOk = false; 278 | if (LexemeType.TYPE_COUNT == nextLexeme.lexemeType){ 279 | //合并中文数词+中文量词 280 | appendOk = LexemeService.append(lexeme, nextLexeme, LexemeType.TYPE_CQUAN); 281 | } 282 | if (appendOk){ 283 | //弹出 284 | this.results.shift(); 285 | } 286 | } 287 | } 288 | }; 289 | -------------------------------------------------------------------------------- /lib/CharacterUtil.js: -------------------------------------------------------------------------------- 1 | var CharType = require('./consts').CharType; 2 | 3 | /* 4 | 0000..007F; Basic Latin 5 | 0080..00FF; Latin-1 Supplement 6 | 0100..017F; Latin Extended-A 7 | 0180..024F; Latin Extended-B 8 | 0250..02AF; IPA Extensions 9 | 02B0..02FF; Spacing Modifier Letters 10 | 0300..036F; Combining Diacritical Marks 11 | 0370..03FF; Greek and Coptic 12 | 0400..04FF; Cyrillic 13 | 0500..052F; Cyrillic Supplement 14 | 0530..058F; Armenian 15 | 0590..05FF; Hebrew 16 | 0600..06FF; Arabic 17 | 0700..074F; Syriac 18 | 0750..077F; Arabic Supplement 19 | 0780..07BF; Thaana 20 | 07C0..07FF; NKo 21 | 0800..083F; Samaritan 22 | 0900..097F; Devanagari 23 | 0980..09FF; Bengali 24 | 0A00..0A7F; Gurmukhi 25 | 0A80..0AFF; Gujarati 26 | 0B00..0B7F; Oriya 27 | 0B80..0BFF; Tamil 28 | 0C00..0C7F; Telugu 29 | 0C80..0CFF; Kannada 30 | 0D00..0D7F; Malayalam 31 | 0D80..0DFF; Sinhala 32 | 0E00..0E7F; Thai 33 | 0E80..0EFF; Lao 34 | 0F00..0FFF; Tibetan 35 
| 1000..109F; Myanmar 36 | 10A0..10FF; Georgian 37 | 1100..11FF; Hangul Jamo 38 | 1200..137F; Ethiopic 39 | 1380..139F; Ethiopic Supplement 40 | 13A0..13FF; Cherokee 41 | 1400..167F; Unified Canadian Aboriginal Syllabics 42 | 1680..169F; Ogham 43 | 16A0..16FF; Runic 44 | 1700..171F; Tagalog 45 | 1720..173F; Hanunoo 46 | 1740..175F; Buhid 47 | 1760..177F; Tagbanwa 48 | 1780..17FF; Khmer 49 | 1800..18AF; Mongolian 50 | 18B0..18FF; Unified Canadian Aboriginal Syllabics Extended 51 | 1900..194F; Limbu 52 | 1950..197F; Tai Le 53 | 1980..19DF; New Tai Lue 54 | 19E0..19FF; Khmer Symbols 55 | 1A00..1A1F; Buginese 56 | 1A20..1AAF; Tai Tham 57 | 1B00..1B7F; Balinese 58 | 1B80..1BBF; Sundanese 59 | 1C00..1C4F; Lepcha 60 | 1C50..1C7F; Ol Chiki 61 | 1CD0..1CFF; Vedic Extensions 62 | 1D00..1D7F; Phonetic Extensions 63 | 1D80..1DBF; Phonetic Extensions Supplement 64 | 1DC0..1DFF; Combining Diacritical Marks Supplement 65 | 1E00..1EFF; Latin Extended Additional 66 | 1F00..1FFF; Greek Extended 67 | 2000..206F; General Punctuation 68 | 2070..209F; Superscripts and Subscripts 69 | 20A0..20CF; Currency Symbols 70 | 20D0..20FF; Combining Diacritical Marks for Symbols 71 | 2100..214F; Letterlike Symbols 72 | 2150..218F; Number Forms 73 | 2190..21FF; Arrows 74 | 2200..22FF; Mathematical Operators 75 | 2300..23FF; Miscellaneous Technical 76 | 2400..243F; Control Pictures 77 | 2440..245F; Optical Character Recognition 78 | 2460..24FF; Enclosed Alphanumerics 79 | 2500..257F; Box Drawing 80 | 2580..259F; Block Elements 81 | 25A0..25FF; Geometric Shapes 82 | 2600..26FF; Miscellaneous Symbols 83 | 2700..27BF; Dingbats 84 | 27C0..27EF; Miscellaneous Mathematical Symbols-A 85 | 27F0..27FF; Supplemental Arrows-A 86 | 2800..28FF; Braille Patterns 87 | 2900..297F; Supplemental Arrows-B 88 | 2980..29FF; Miscellaneous Mathematical Symbols-B 89 | 2A00..2AFF; Supplemental Mathematical Operators 90 | 2B00..2BFF; Miscellaneous Symbols and Arrows 91 | 2C00..2C5F; Glagolitic 92 | 2C60..2C7F; Latin Extended-C 93 | 2C80..2CFF; Coptic 94 | 2D00..2D2F; Georgian Supplement 95 | 2D30..2D7F; Tifinagh 96 | 2D80..2DDF; Ethiopic Extended 97 | 2DE0..2DFF; Cyrillic Extended-A 98 | 2E00..2E7F; Supplemental Punctuation 99 | 2E80..2EFF; CJK Radicals Supplement 100 | 2F00..2FDF; Kangxi Radicals 101 | 2FF0..2FFF; Ideographic Description Characters 102 | 3000..303F; CJK Symbols and Punctuation 103 | 3040..309F; Hiragana 104 | 30A0..30FF; Katakana 105 | 3100..312F; Bopomofo 106 | 3130..318F; Hangul Compatibility Jamo 107 | 3190..319F; Kanbun 108 | 31A0..31BF; Bopomofo Extended 109 | 31C0..31EF; CJK Strokes 110 | 31F0..31FF; Katakana Phonetic Extensions 111 | 3200..32FF; Enclosed CJK Letters and Months 112 | 3300..33FF; CJK Compatibility 113 | 3400..4DBF; CJK Unified Ideographs Extension A 114 | 4DC0..4DFF; Yijing Hexagram Symbols 115 | 4E00..9FFF; CJK Unified Ideographs 116 | A000..A48F; Yi Syllables 117 | A490..A4CF; Yi Radicals 118 | A4D0..A4FF; Lisu 119 | A500..A63F; Vai 120 | A640..A69F; Cyrillic Extended-B 121 | A6A0..A6FF; Bamum 122 | A700..A71F; Modifier Tone Letters 123 | A720..A7FF; Latin Extended-D 124 | A800..A82F; Syloti Nagri 125 | A830..A83F; Common Indic Number Forms 126 | A840..A87F; Phags-pa 127 | A880..A8DF; Saurashtra 128 | A8E0..A8FF; Devanagari Extended 129 | A900..A92F; Kayah Li 130 | A930..A95F; Rejang 131 | A960..A97F; Hangul Jamo Extended-A 132 | A980..A9DF; Javanese 133 | AA00..AA5F; Cham 134 | AA60..AA7F; Myanmar Extended-A 135 | AA80..AADF; Tai Viet 136 | ABC0..ABFF; Meetei Mayek 137 | AC00..D7AF; Hangul Syllables 138 | 
D7B0..D7FF; Hangul Jamo Extended-B 139 | D800..DB7F; High Surrogates 140 | DB80..DBFF; High Private Use Surrogates 141 | DC00..DFFF; Low Surrogates 142 | E000..F8FF; Private Use Area 143 | F900..FAFF; CJK Compatibility Ideographs 144 | FB00..FB4F; Alphabetic Presentation Forms 145 | FB50..FDFF; Arabic Presentation Forms-A 146 | FE00..FE0F; Variation Selectors 147 | FE10..FE1F; Vertical Forms 148 | FE20..FE2F; Combining Half Marks 149 | FE30..FE4F; CJK Compatibility Forms 150 | FE50..FE6F; Small Form Variants 151 | FE70..FEFF; Arabic Presentation Forms-B 152 | FF00..FFEF; Halfwidth and Fullwidth Forms 153 | FFF0..FFFF; Specials 154 | 10000..1007F; Linear B Syllabary 155 | 10080..100FF; Linear B Ideograms 156 | 10100..1013F; Aegean Numbers 157 | 10140..1018F; Ancient Greek Numbers 158 | 10190..101CF; Ancient Symbols 159 | 101D0..101FF; Phaistos Disc 160 | 10280..1029F; Lycian 161 | 102A0..102DF; Carian 162 | 10300..1032F; Old Italic 163 | 10330..1034F; Gothic 164 | 10380..1039F; Ugaritic 165 | 103A0..103DF; Old Persian 166 | 10400..1044F; Deseret 167 | 10450..1047F; Shavian 168 | 10480..104AF; Osmanya 169 | 10800..1083F; Cypriot Syllabary 170 | 10840..1085F; Imperial Aramaic 171 | 10900..1091F; Phoenician 172 | 10920..1093F; Lydian 173 | 10A00..10A5F; Kharoshthi 174 | 10A60..10A7F; Old South Arabian 175 | 10B00..10B3F; Avestan 176 | 10B40..10B5F; Inscriptional Parthian 177 | 10B60..10B7F; Inscriptional Pahlavi 178 | 10C00..10C4F; Old Turkic 179 | 10E60..10E7F; Rumi Numeral Symbols 180 | 11080..110CF; Kaithi 181 | 12000..123FF; Cuneiform 182 | 12400..1247F; Cuneiform Numbers and Punctuation 183 | 13000..1342F; Egyptian Hieroglyphs 184 | 1D000..1D0FF; Byzantine Musical Symbols 185 | 1D100..1D1FF; Musical Symbols 186 | 1D200..1D24F; Ancient Greek Musical Notation 187 | 1D300..1D35F; Tai Xuan Jing Symbols 188 | 1D360..1D37F; Counting Rod Numerals 189 | 1D400..1D7FF; Mathematical Alphanumeric Symbols 190 | 1F000..1F02F; Mahjong Tiles 191 | 1F030..1F09F; Domino Tiles 192 | 1F100..1F1FF; Enclosed Alphanumeric Supplement 193 | 1F200..1F2FF; Enclosed Ideographic Supplement 194 | 20000..2A6DF; CJK Unified Ideographs Extension B 195 | 2A700..2B73F; CJK Unified Ideographs Extension C 196 | 2F800..2FA1F; CJK Compatibility Ideographs Supplement 197 | E0000..E007F; Tags 198 | E0100..E01EF; Variation Selectors Supplement 199 | F0000..FFFFF; Supplementary Private Use Area-A 200 | 100000..10FFFF; Supplementary Private Use Area- 201 | */ 202 | var CharacterUtil = { 203 | /** 204 | * 识别字符类型 205 | * @param input 206 | * @return int CharacterUtil定义的字符类型常量 207 | */ 208 | identifyCharType: function(input){ 209 | var ident = CharType.CHAR_USELESS; //其他的不做处理的字符 210 | if ((input >= '0' && input <= '9')){ 211 | ident = CharType.CHAR_ARABIC; 212 | } 213 | else if ((input >= 'a' && input <= 'z') || (input >= 'A' && input <= 'Z')){ 214 | ident = CharType.CHAR_ENGLISH; 215 | } 216 | else { 217 | ident = CharType.CHAR_CHINESE; 218 | 219 | var ub = input.charCodeAt(0); 220 | if ((ub >= 0x3400 && ub <= 0x4DBF) || // CJK Unified Ideographs Extension A 221 | (ub >= 0x4E00 && ub <= 0x9FFF) || // CJK Unified Ideographs 222 | (ub >= 0xF900 && ub <= 0xFAFF)){ // CJK Compatibility Ideographs 223 | //目前已知的中文字符UTF-8集合 224 | ident = CharType.CHAR_CHINESE; 225 | } 226 | else if ((ub >= 0xFF00 && ub <= 0xFFEF) || // Halfwidth and Fullwidth Forms 全角数字字符和日韩字符 227 | //韩文字符集 228 | (ub >= 0x1100 && ub <= 0x11FF) || // Hangul Jamo 229 | (ub >= 0x3130 && ub <= 0x318F) || // Hangul Compatibility Jamo 230 | (ub >= 0xA960 && ub <= 0xA97F) || // 
Hangul Jamo Extended-A 231 | (ub >= 0xAC00 && ub <= 0xD7AF) || // Hangul Syllables 232 | (ub >= 0xD7B0 && ub <= 0xD7FF) || // Hangul Jamo Extended-B 233 | //日文字符集 234 | (ub >= 0x3040 && ub <= 0x309F) || // Hiragana 平假名 235 | (ub >= 0x30A0 && ub <= 0x30FF) || // KATAKANA 片假名 236 | (ub >= 0x31F0 && ub <= 0x31FF)){ // Katakana Phonetic Extensions 237 | ident = CharType.CHAR_OTHER_CJK; 238 | } 239 | } 240 | return ident; 241 | }, 242 | /** 243 | * 进行字符规格化(全角转半角,大写转小写处理) 244 | * @param input 245 | * @return char 246 | */ 247 | regularize: function(input){ 248 | var code = input.charCodeAt(0); 249 | if (code === 12288) { 250 | input = ' ';//32; 251 | } 252 | else if (code > 65280 && code < 65375) { 253 | input = String.fromCharCode(code - 65248); 254 | if ((input >= 'A' && input <= 'Z')) { 255 | code = input.charCodeAt(0); 256 | input = String.fromCharCode(code + 32); 257 | } 258 | } 259 | else if ((input >= 'A' && input <= 'Z')) { 260 | input = String.fromCharCode(code + 32); 261 | } 262 | 263 | return input; 264 | } 265 | }; 266 | 267 | module.exports = CharacterUtil; -------------------------------------------------------------------------------- /lib/AnalyzeService.js: -------------------------------------------------------------------------------- 1 | var Lexeme = require('./Lexeme'); 2 | 3 | var LexemeService = { 4 | /** 5 | * 合并两个相邻的词元 6 | * @param l 7 | * @param lexemeType 8 | * @return boolean 词元是否成功合并 9 | */ 10 | append: function(prev, next, lexemeType){ 11 | if (next && this.getEndPosition(prev) === this.getBeginPosition(next)){ 12 | prev.len += next.len; 13 | prev.lexemeType = lexemeType; 14 | return true; 15 | }else { 16 | return false; 17 | } 18 | }, 19 | /* 20 | * 判断词元相等算法 21 | * 起始位置偏移、起始位置、终止位置相同 22 | */ 23 | equals: function(l, o){ 24 | if (!o){ 25 | return false; 26 | } 27 | 28 | if (l === o){ 29 | return true; 30 | } 31 | 32 | if (o instanceof Lexeme){ 33 | if (l.offset === o.offset && 34 | l.begin === o.begin && 35 | l.len === o.len){ 36 | return true; 37 | } 38 | else{ 39 | return false; 40 | } 41 | } 42 | else{ 43 | return false; 44 | } 45 | }, 46 | /* 47 | * 词元在排序集合中的比较算法 48 | */ 49 | compare: function(x, y){ 50 | //起始位置优先 51 | if (x.begin < y.begin){ 52 | return -1; 53 | } 54 | else if(x.begin === y.begin){ 55 | //词元长度优先 56 | if(x.len > y.len){ 57 | return -1; 58 | } 59 | else if (x.len === y.len){ 60 | return 0; 61 | } 62 | else {//x.len < y.getLength() 63 | return 1; 64 | } 65 | } 66 | else {//x.begin > y.begin 67 | return 1; 68 | } 69 | }, 70 | /** 71 | * 获取词元在文本中的起始位置 72 | * @return int 73 | */ 74 | getBeginPosition: function(lexeme){ 75 | return lexeme.offset + lexeme.begin; 76 | }, 77 | /** 78 | * 获取词元在文本中的结束位置 79 | * @return int 80 | */ 81 | getEndPosition: function(lexeme){ 82 | return lexeme.offset + lexeme.begin + lexeme.len; 83 | } 84 | }; 85 | 86 | var SortedSetService = { 87 | /** 88 | * 向链表集合添加词元 89 | * @param lexeme 90 | */ 91 | addLexeme: function(sortedSet, lexeme){ 92 | if (sortedSet.lexemeList.length === 0){ 93 | sortedSet.lexemeList.push(lexeme); 94 | return null; 95 | } 96 | var tailList = [], tail, compRes; 97 | tail = sortedSet.pollLast(); // 比 this.lexemeList.pop(); 快 98 | compRes = LexemeService.compare(tail, lexeme); 99 | if (compRes === 0){ // 词元与尾部词元相同,不放入集合 100 | sortedSet.lexemeList.push(tail); 101 | return null; 102 | } 103 | else if (compRes < 0){ // 词元接入链表尾部 104 | sortedSet.lexemeList.push(tail); 105 | sortedSet.lexemeList.push(lexeme); 106 | return null; 107 | } 108 | else{ //从尾部上逆 109 | tailList.unshift(tail); 110 | var arr = 
this.addLexeme(sortedSet, lexeme); 111 | if (!arr) { 112 | sortedSet.lexemeList.concat(tailList); 113 | return null; 114 | } 115 | else{ 116 | tailList = arr.concat(tailList); 117 | return tailList; 118 | } 119 | } 120 | }, 121 | /** 122 | * 返回链表头部元素 123 | * @return 124 | */ 125 | peekFirst: function(sortedSet){ 126 | if (sortedSet.lexemeList.length > 0){ 127 | return sortedSet.lexemeList[0]; 128 | } 129 | return null; 130 | }, 131 | /** 132 | * 取出链表集合的第一个元素 133 | * @return Lexeme 134 | */ 135 | pollFirst: function(sortedSet){ 136 | if (sortedSet.lexemeList.length > 0){ 137 | return sortedSet.lexemeList.shift(); 138 | } 139 | return null; 140 | }, 141 | /** 142 | * 取出链表集合的最后一个元素 143 | * @return Lexeme 144 | */ 145 | pollLast: function(sortedSet){ 146 | return sortedSet.lexemeList.pop(); 147 | }, 148 | /** 149 | * 返回链表尾部元素 150 | * @return 151 | */ 152 | peekLast: function(sortedSet){ 153 | var idx = sortedSet.lexemeList.length - 1; 154 | if (idx >= 0){ 155 | return sortedSet.lexemeList[idx]; 156 | } 157 | return null; 158 | } 159 | }; 160 | 161 | var LexemePathService = { 162 | newLexemePath: function(){ 163 | 164 | }, 165 | compare: function(x, y){ 166 | //比较有效文本长度 167 | if (x.payloadLength > y.payloadLength){ 168 | return -1; 169 | } 170 | else if (x.payloadLength < y.payloadLength){ 171 | return 1; 172 | } 173 | else{ 174 | //比较词元个数,越少越好 175 | if (x.lexemeList.length < y.lexemeList.length){ 176 | return -1; 177 | } 178 | else if (x.lexemeList.length > y.lexemeList.length){ 179 | return 1; 180 | } 181 | else{ 182 | //路径跨度越大越好 183 | if (LexemePathService.getPathLength(x) > LexemePathService.getPathLength(y)){ 184 | return -1; 185 | } 186 | else if (LexemePathService.getPathLength(x) < LexemePathService.getPathLength(y)){ 187 | return 1; 188 | } 189 | else { 190 | //根据统计学结论,逆向切分概率高于正向切分,因此位置越靠后的优先 191 | if (x.pathEnd > y.pathEnd){ 192 | return -1; 193 | } 194 | else if (x.pathEnd < y.pathEnd){ 195 | return 1; 196 | } 197 | else{ 198 | //词长越平均越好 199 | if (LexemePathService.getXWeight(x) > LexemePathService.getXWeight(y)){ 200 | return -1; 201 | } 202 | else if (LexemePathService.getXWeight(x) < LexemePathService.getXWeight(y)){ 203 | return 1; 204 | } 205 | else { 206 | //词元位置权重比较 207 | if (LexemePathService.getPWeight(x) > LexemePathService.getPWeight(y)){ 208 | return -1; 209 | } 210 | else if (LexemePathService.getPWeight(x) < LexemePathService.getPWeight(y)){ 211 | return 1; 212 | } 213 | } 214 | } 215 | } 216 | } 217 | } 218 | return 0; 219 | }, 220 | /** 221 | * 获取LexemePath的路径长度 222 | * @return 223 | */ 224 | getPathLength: function(lexemePath){ 225 | return lexemePath.pathEnd - lexemePath.pathBegin; 226 | }, 227 | getNextLexeme: function(lexemePath, currLexeme){ 228 | var idx = lexemePath.lexemeList.indexOf(currLexeme); 229 | if (idx >= 0 && lexemePath.lexemeList.length > idx){ 230 | return lexemePath.lexemeList[idx+1]; 231 | } 232 | return null; 233 | }, 234 | /** 235 | * 检测词元位置交叉(有歧义的切分),有交叉时返回 true 236 | * @param lexeme 237 | * @return 238 | */ 239 | checkCross: function(lexemePath, lexeme){ 240 | return (lexeme.begin >= lexemePath.pathBegin && lexeme.begin < lexemePath.pathEnd) || 241 | (lexemePath.pathBegin >= lexeme.begin && lexemePath.pathBegin < lexeme.begin+ lexeme.len); 242 | }, 243 | /** 244 | * 移除尾部的Lexeme 245 | * @return 246 | */ 247 | removeTail: function(lexemePath){ 248 | var tail = lexemePath.pollLast(); 249 | if (lexemePath.lexemeList.length === 0){ 250 | lexemePath.pathBegin = -1; 251 | lexemePath.pathEnd = -1; 252 | lexemePath.payloadLength = 0; 253 | } 254 | 
else{ 255 | lexemePath.payloadLength -= tail.len; 256 | var newTail = lexemePath.peekLast(); 257 | lexemePath.pathEnd = newTail.begin + newTail.len; 258 | } 259 | return tail; 260 | }, 261 | /** 262 | * 向LexemePath追加相交的Lexeme 263 | * @param lexeme 264 | * @return 265 | */ 266 | addCrossLexeme: function(lexemePath, lexeme){ 267 | if (lexemePath.lexemeList.length === 0){ 268 | SortedSetService.addLexeme(lexemePath, lexeme); 269 | lexemePath.pathBegin = lexeme.begin; 270 | lexemePath.pathEnd = lexeme.begin + lexeme.len; 271 | lexemePath.payloadLength += lexeme.len; 272 | return true; 273 | } 274 | else if (LexemePathService.checkCross(lexemePath, lexeme)){ 275 | SortedSetService.addLexeme(lexemePath, lexeme); 276 | if (lexeme.begin + lexeme.len > lexemePath.pathEnd){ 277 | lexemePath.pathEnd = lexeme.begin + lexeme.len; 278 | } 279 | lexemePath.payloadLength = lexemePath.pathEnd - lexemePath.pathBegin; 280 | return true; 281 | } 282 | else{ 283 | return false; 284 | } 285 | }, 286 | /** 287 | * 向LexemePath追加不相交的Lexeme 288 | * @param lexeme 289 | * @return 290 | */ 291 | addNotCrossLexeme: function(lexemePath, lexeme){ 292 | if (lexemePath.lexemeList.length === 0){ 293 | SortedSetService.addLexeme(lexemePath, lexeme); 294 | lexemePath.pathBegin = lexeme.begin; 295 | lexemePath.pathEnd = lexeme.begin + lexeme.len; 296 | lexemePath.payloadLength += lexeme.len; 297 | return true; 298 | 299 | } 300 | else if (LexemePathService.checkCross(lexemePath, lexeme)){ 301 | return false; 302 | } 303 | else{ 304 | SortedSetService.addLexeme(lexemePath, lexeme); 305 | lexemePath.payloadLength += lexeme.len; 306 | var head = lexemePath.peekFirst(); 307 | lexemePath.pathBegin = head.begin; 308 | var tail = lexemePath.peekLast(); 309 | lexemePath.pathEnd = tail.begin + tail.len; 310 | return true; 311 | } 312 | }, 313 | /** 314 | * X权重(词元长度积) 315 | * @return 316 | */ 317 | getXWeight: function(lexemePath){ 318 | var product = 1; 319 | var c = lexemePath.peekFirst(); 320 | while(c){ 321 | product *= c.len; 322 | c = LexemePathService.getNextLexeme(lexemePath, c); 323 | } 324 | return product; 325 | }, 326 | /** 327 | * 词元位置权重 328 | * @return 329 | */ 330 | getPWeight: function(lexemePath){ 331 | var pWeight = 0; 332 | var p = 0; 333 | var c = lexemePath.peekFirst(); 334 | while(c){ 335 | p++; 336 | pWeight += p * c.len; 337 | c = LexemePathService.getNextLexeme(lexemePath, c); 338 | } 339 | return pWeight; 340 | } 341 | }; 342 | 343 | 344 | 345 | module.exports = { 346 | LexemeService: LexemeService, 347 | LexemePathService: LexemePathService, 348 | SortedSetService: SortedSetService 349 | }; -------------------------------------------------------------------------------- /lib/Dictionary.js: -------------------------------------------------------------------------------- 1 | var fs = require('fs'), 2 | config = require('./config'), 3 | Hit = require('./Hit'), 4 | HitService = require('./HitService'); 5 | 6 | var DictSegment = require('./DictSegment'); 7 | 8 | var PATH_DIC_MAIN = __dirname + "/dict/main.dic", 9 | PATH_DIC_SURNAME = __dirname + "/dict/surname.dic", 10 | PATH_DIC_QUANTIFIER = __dirname + "/dict/quantifier.dic", 11 | PATH_DIC_SUFFIX = __dirname + "/dict/suffix.dic", 12 | PATH_DIC_PREP = __dirname + "/dict/preposition.dic", 13 | PATH_DIC_STOP = __dirname + "/dict/stopword.dic"; 14 | 15 | var Dictionary = { 16 | inited: false, 17 | initial: function(opts){ 18 | this.inited = true; 19 | this.config = opts.Config || config; 20 | 21 | if (!this.config.ext_dict) { this.config.ext_dict = config.ext_dict;} 
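/*
 * Usage sketch (illustrative values, assuming the loaders below fall back to
 * the PATH_DIC_* paths defined above when no explicit path is passed):
 *   Dictionary.initial({
 *     Config: { ext_dict: ['./dict/custom/mydict.dic'], ext_stopwords: [] }
 *   });
 * A missing Config, ext_dict or ext_stopwords falls back to ./config.js, as the
 * two guards here show. Every word loaded by the load*Dict calls below is stored
 * character-by-character in a DictSegment trie (see fillSegment further down):
 * each char maps to a child node in childrenMap, storeSize counts the children,
 * and nodeState === 1 marks the end of a complete word. matchInDictSegment walks
 * the same trie, flagging a Hit as MATCH when it reaches a node with
 * nodeState === 1 and as PREFIX when that node still has children (storeSize > 0).
 */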
22 | if (!this.config.ext_stopwords) { this.config.ext_stopwords = config.ext_stopwords;} 23 | 24 | this.loadMainDict(opts.MainDictPath); 25 | 26 | this.loadSurnameDict(opts.SurnameDictPath); 27 | this.loadQuantifierDict(opts.QuantifierDictPath); 28 | this.loadSuffixDict(opts.SuffixDictPath); 29 | this.loadPrepDict(opts.PrepDictPath); 30 | this.loadStopWordDict(opts.StopWordDictPath); 31 | 32 | // todo 缓存字典 33 | //fs.writeFileSync('./dict/main.dic.json', JSON.stringify(this._MainDict, null, '\t')); 34 | }, 35 | loadExtDict: function(filepath){ 36 | //读取扩展词典文件 37 | var file = fs.readFileSync(filepath, {encoding: 'utf8'}); 38 | file = file.replace(/ |\r/g, ''); 39 | file = file.split('\n'); 40 | 41 | var theWord; 42 | for(var i=0;i 1){ 245 | //词元还没有完全加入词典树 246 | Dictionary.fillSegment(ds, charArray, begin + 1, len - 1 , enabled); 247 | } 248 | else if (len === 1){ 249 | //已经是词元的最后一个char,设置当前节点状态为enabled, 250 | //enabled=1表明一个完整的词,enabled=0表示从词典中屏蔽当前词 251 | ds.nodeState = enabled; 252 | } 253 | } 254 | }, 255 | matchInDictSegment: function(dictSeg, charArray, begin, len, searchHit) { 256 | if (!searchHit){ 257 | //如果hit为空,新建 258 | searchHit = new Hit(); 259 | //设置hit的起始文本位置 260 | searchHit.begin = begin; 261 | } 262 | else{ 263 | //否则要将HIT状态重置 264 | HitService.setUnmatch(searchHit); 265 | } 266 | //设置hit的当前处理位置 267 | searchHit.end = begin; 268 | 269 | var keyChar = charArray[begin]; 270 | //在map中查找 271 | var ds = dictSeg.childrenMap[keyChar]; 272 | 273 | //STEP2 找到DictSegment,判断词的匹配状态,是否继续递归,还是返回结果 274 | if (ds){ 275 | if (len > 1){ 276 | //词未匹配完,继续往下搜索 277 | return Dictionary.match(ds, charArray, begin + 1 , len - 1 , searchHit); 278 | } 279 | else if (len === 1){ 280 | //搜索最后一个char 281 | if(ds.nodeState === 1){ 282 | //添加HIT状态为完全匹配 283 | HitService.setMatch(searchHit); 284 | } 285 | if(ds.storeSize > 0){ 286 | //添加HIT状态为前缀匹配 287 | HitService.setPrefix(searchHit); 288 | //记录当前位置的DictSegment 289 | searchHit.matchedDictSegment = ds; 290 | } 291 | return searchHit; 292 | } 293 | } 294 | //STEP3 没有找到DictSegment, 将HIT设置为不匹配 295 | return searchHit; 296 | }, 297 | lookforSegment: function(dictSeg, keyChar, create){ 298 | //搜索Map 299 | var ds = dictSeg.childrenMap[keyChar]; 300 | if (!ds && create){ 301 | //构造新的segment 302 | ds = new DictSegment(); 303 | dictSeg.childrenMap[keyChar] = ds; 304 | //当前节点存储segment数目+1 305 | dictSeg.storeSize += 1; 306 | } 307 | 308 | return ds; 309 | } 310 | 311 | }; 312 | 313 | module.exports = Dictionary; 314 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. 
For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | 203 | -------------------------------------------------------------------------------- /lib/dict/custom/single_word_low_freq.dic: -------------------------------------------------------------------------------- 1 | 踧 2 | 覢 3 | 觓 4 | 覛 5 | 覅 6 | 覟 7 | 覗 8 | 覣 9 | 覭 10 | 覂 11 | 觡 12 | 覝 13 | 觟 14 | 褱 15 | 褰 16 | 襒 17 | 覞 18 | 袨 19 | 觏 20 | 赒 21 | 觇 22 | 謍 23 | 讙 24 | 襦 25 | 袤 26 | 誸 27 | 诮 28 | 衩 29 | 茷 30 | 趒 31 | 襌 32 | 诰 33 | 譠 34 | 袄 35 | 聱 36 | 豸 37 | 蠓 38 | 讵 39 | 袅 40 | 诂 41 | 裞 42 | 訄 43 | 荺 44 | 褂 45 | 蠡 46 | 裐 47 | 諴 48 | 芫 49 | 赧 50 | 触 51 | 跫 52 | 褫 53 | 赝 54 | 褡 55 | 衪 56 | 裎 57 | 豜 58 | 褶 59 | 裟 60 | 跏 61 | 袪 62 | 袈 63 | 觐 64 | 跄 65 | 坏 66 | 肱 67 | 裾 68 | 考 69 | 豝 70 | 踰 71 | 覃 72 | 蹓 73 | 黾 74 | 褴 75 | 轲 76 | 裨 77 | 蜇 78 | 鮆 79 | 褥 80 | 誊 81 | 貉 82 | 褊 83 | 蜉 84 | 衔 85 | 詄 86 | 豋 87 | 胼 88 | 荞 89 | 踫 90 | 谗 91 | 耦 92 | 誏 93 | 衮 94 | 胝 95 | 幔 96 | 轭 97 | 赈 98 | 贲 99 | 蓼 100 | 褛 101 | 迵 102 | 觊 103 | 蚜 104 | 讫 105 | 颢 106 | 葄 107 | 觎 108 | 诎 109 | 謢 110 | 蹧 111 | 邬 112 | 芊 113 | 赣 114 | 囱 115 | 蝎 116 | 夆 117 | 蠋 118 | 蠕 119 | 蹼 120 | 臊 121 | 蛭 122 | 颚 123 | 讴 124 | 踽 125 | 菫 126 | 臾 127 | 薮 128 | 蹒 129 | 谀 130 | 菀 131 | 佶 132 | 摀 133 | 佚 134 | 邸 135 | 跺 136 | 豊 137 | 荔 138 | 锌 139 | 诿 140 | 蕤 141 | 诳 142 | 芩 143 | 蹴 144 | 褉 145 | 觔 146 | 舴 147 | 腋 148 | 颍 149 | 膊 150 | 脯 151 | 荪 152 | 郢 153 | 坛 154 | 轫 155 | 醺 156 | 捺 157 | 姝 158 | 胭 159 | 饷 160 | 谪 161 | 驮 162 | 僮 163 | 踯 164 | 忪 165 | 驷 166 | 躅 167 | 忑 168 | 彧 169 | 衲 170 | 唠 171 | 跚 172 | 吃 173 | 诩 174 | 褓 175 | 诤 176 | 豨 177 | 诋 178 | 菈 179 | 逖 180 | 荟 181 | 裆 182 | 喋 183 | 忖 184 | 闾 185 | 诌 186 | 啻 187 | 铀 188 | 菡 189 | 胱 190 | 蹬 191 | 隹 192 | 鹬 193 | 诒 194 | 轧 195 | 萏 196 | 舶 197 | 鳅 198 | 药 199 | 酯 200 | 夯 201 | 偬 202 | 酝 203 | 跻 204 | 咤 205 | 掬 206 | 呆 207 | 蹶 208 | 踞 209 | 蝌 210 | 咋 211 | 谧 212 | 舫 213 | 啐 214 | 茸 215 | 谟 216 | 嵌 217 | 蜿 218 | 魇 219 | 帷 220 | 觑 221 | 鳍 222 | 谏 223 | 哽 224 | 乓 225 | 蚌 226 | 嗙 227 | 巿 228 | 刽 229 | 踱 230 | 腆 231 | 薏 232 | 蜃 233 | 谑 234 | 躄 235 | 鸾 236 | 齁 237 | 腼 238 | 呷 239 | 吆 240 | 荀 241 | 裱 242 | 辇 243 | 睫 244 | 伎 245 | 妲 246 | 菠 247 | 鼐 248 | 麾 249 | 芮 250 | 鲑 251 | 辉 252 | 啜 253 | 苞 254 | 踼 255 | 荃 256 | 杞 257 | 浣 258 | 沬 259 | 胤 260 | 恿 261 | 驭 262 | 逵 263 | 钛 264 | 徕 265 | 贮 266 | 蔫 267 | 锚 268 | 衙 269 | 肄 270 | 豺 271 | 闸 272 | 隋 273 | 腑 274 | 脐 275 | 脓 276 | 叱 277 | 迥 278 | 踝 279 | 馥 280 | 佣 281 | 喳 282 | 迩 283 | 贻 284 | 诙 285 | 椭 286 | 琬 287 | 赂 288 | 诧 289 | 苯 290 | 怂 291 | 蟆 292 | 龊 293 | 漳 294 | 迭 295 | 垛 296 | 铲 297 | 馊 298 | 娓 299 | 葆 300 | 赑 301 | 卍 302 | 遽 303 | 谯 304 | 賏 305 | 蛹 306 | 锤 307 | 粟 308 | 衿 309 | 渥 310 | 铳 311 | 刍 312 | 镳 313 | 匮 314 | 万 315 | 骁 316 | 酣 317 | 酉 318 | 骥 319 | 寨 320 | 蓁 321 | 诽 322 | 钡 323 | 浙 324 | 酗 325 | 跩 326 | 拗 327 | 坷 328 | 雱 329 | 闺 330 | 喈 331 | 晔 332 | 螳 333 | 谙 334 | 蹂 335 | 鞑 336 | 蔗 337 | 账 338 | 垚 339 | 瞩 340 | 谩 341 | 掳 342 | 媲 343 | 葾 344 | 鳗 345 | 钣 346 | 檀 347 | 阕 348 | 聿 349 | 蜍 350 | 仆 351 | 嗅 352 | 峥 353 | 蜈 354 | 垠 355 | 蚓 356 | 麓 357 | 殉 358 | 弩 359 | 朴 360 | 胥 361 | 瘴 362 | 篑 363 | 镍 364 | 鹂 365 | 暐 366 | 榷 367 | 咀 368 | 佯 369 | 蚣 370 | 荻 371 | 鬓 372 | 仝 373 | 裴 374 | 讷 375 | 孺 
376 | 咨 377 | 俑 378 | 遴 379 | 吽 380 | 笋 381 | 耀 382 | 霾 383 | 绎 384 | 咿 385 | 骸 386 | 霭 387 | 昕 388 | 漩 389 | 浒 390 | 轼 391 | 婿 392 | 嗳 393 | 钙 394 | 谲 395 | 蛾 396 | 跛 397 | 惺 398 | 翎 399 | 炽 400 | 晒 401 | 钳 402 | 鞘 403 | 谚 404 | 钊 405 | 背 406 | 瀛 407 | 槌 408 | 臀 409 | 跋 410 | 窒 411 | 藤 412 | 噬 413 | 蓊 414 | 褐 415 | 蔺 416 | 鲍 417 | 鲨 418 | 舔 419 | 箔 420 | 萦 421 | 诏 422 | 褔 423 | 咄 424 | 俘 425 | 彪 426 | 饪 427 | 嘱 428 | 诬 429 | 踮 430 | 囝 431 | 佢 432 | 汶 433 | 讹 434 | 踅 435 | 咐 436 | 讼 437 | 玟 438 | 迂 439 | 亵 440 | 婵 441 | 馁 442 | 崭 443 | 惦 444 | 蠹 445 | 濒 446 | 匈 447 | 蟋 448 | 谕 449 | 酪 450 | 眛 451 | 煦 452 | 甭 453 | 谄 454 | 妾 455 | 梧 456 | 芜 457 | 蛎 458 | 颐 459 | 雌 460 | 褒 461 | 臼 462 | 圳 463 | 剔 464 | 噶 465 | 耨 466 | 嗈 467 | 勋 468 | 冶 469 | 扑 470 | 膺 471 | 腺 472 | 荤 473 | 坞 474 | 羲 475 | 栾 476 | 傌 477 | 幌 478 | 噗 479 | 蛀 480 | 觞 481 | 塾 482 | 耙 483 | 枭 484 | 擞 485 | 缅 486 | 踌 487 | 蟀 488 | 侥 489 | 诣 490 | 姜 491 | 甸 492 | 俭 493 | 泠 494 | 躇 495 | 萌 496 | 虏 497 | 匕 498 | 藩 499 | 嗽 500 | 蜻 501 | 咛 502 | 艹 503 | 跎 504 | 蔬 505 | 鸠 506 | 跆 507 | 肋 508 | 巅 509 | 芯 510 | 荐 511 | 荼 512 | 慵 513 | 咸 514 | 杭 515 | 樟 516 | 夸 517 | 戮 518 | 吱 519 | 模 520 | 葔 521 | 迢 522 | 砰 523 | 须 524 | 蒜 525 | 骐 526 | 茱 527 | 痊 528 | 蛤 529 | 蜴 530 | 诟 531 | 俾 532 | 疮 533 | 悴 534 | 袒 535 | 蒹 536 | 镖 537 | 娥 538 | 鹉 539 | 婊 540 | 噫 541 | 矜 542 | 岳 543 | 鹦 544 | 葭 545 | 褚 546 | 嵩 547 | 丫 548 | 凛 549 | 峦 550 | 惚 551 | 懊 552 | 韶 553 | 憋 554 | 聋 555 | 讪 556 | 瘫 557 | 霓 558 | 哺 559 | 蝙 560 | 靥 561 | 堇 562 | 铺 563 | 趾 564 | 褪 565 | 缆 566 | 媛 567 | 胧 568 | 肛 569 | 珈 570 | 畴 571 | 驹 572 | 熔 573 | 臆 574 | 肘 575 | 豁 576 | 冕 577 | 吊 578 | 韧 579 | 炜 580 | 舱 581 | 恁 582 | 巳 583 | 舵 584 | 臻 585 | 戊 586 | 稽 587 | 诲 588 | 隽 589 | 铐 590 | 鲫 591 | 畸 592 | 饥 593 | 茉 594 | 蒲 595 | 矶 596 | 峨 597 | 蚵 598 | 蔼 599 | 诛 600 | 焰 601 | 偈 602 | 蚱 603 | 骯 604 | 盔 605 | 巩 606 | 折 607 | 偕 608 | 嗓 609 | 辙 610 | 鸶 611 | 酵 612 | 莘 613 | 耘 614 | 汹 615 | 楞 616 | 陡 617 | 裳 618 | 憎 619 | 讳 620 | 荆 621 | 笃 622 | 屉 623 | 霈 624 | 恬 625 | 蹦 626 | 扬 627 | 侃 628 | 艳 629 | 璇 630 | 韬 631 | 烬 632 | 傀 633 | 铮 634 | 曦 635 | 搂 636 | 蝠 637 | 霄 638 | 胺 639 | 遐 640 | 飨 641 | 郡 642 | 困 643 | 呎 644 | 墅 645 | 鞠 646 | 瘤 647 | 藻 648 | 咆 649 | 踹 650 | 狷 651 | 镀 652 | 桐 653 | 赘 654 | 揽 655 | 炬 656 | 氢 657 | 膛 658 | 搪 659 | 湿 660 | 唆 661 | 兑 662 | 暸 663 | 厮 664 | 懈 665 | 媳 666 | 塘 667 | 靡 668 | 鹭 669 | 祟 670 | 冀 671 | 豚 672 | 蹄 673 | 橙 674 | 阎 675 | 硫 676 | 埠 677 | 噱 678 | 妃 679 | 搓 680 | 啃 681 | 俞 682 | 龚 683 | 橄 684 | 嚎 685 | 椎 686 | 蓦 687 | 朔 688 | 痘 689 | 鳞 690 | 铠 691 | 叽 692 | 跤 693 | 裔 694 | 诃 695 | 岫 696 | 怯 697 | 讥 698 | 聂 699 | 垢 700 | 藐 701 | 濑 702 | 莒 703 | 淇 704 | 毯 705 | 礁 706 | 赃 707 | 庐 708 | 辕 709 | 瞌 710 | 锯 711 | 莓 712 | 涡 713 | 昼 714 | 捌 715 | 嗡 716 | 倌 717 | 禹 718 | 蹋 719 | 卯 720 | 粪 721 | 耽 722 | 闰 723 | 曳 724 | 苔 725 | 诵 726 | 菇 727 | 斟 728 | 芥 729 | 莅 730 | 喀 731 | 麒 732 | 颊 733 | 扛 734 | 曜 735 | 咎 736 | 缮 737 | 诫 738 | 躁 739 | 茜 740 | 缤 741 | 暧 742 | 郄 743 | 酥 744 | 僻 745 | 躬 746 | 峙 747 | 驯 748 | 噎 749 | 厦 750 | 澜 751 | 杏 752 | 樽 753 | 勘 754 | 煤 755 | 茎 756 | 嚷 757 | 昆 758 | 铸 759 | 烘 760 | 邹 761 | 廓 762 | 拚 763 | 俐 764 | 裘 765 | 饵 766 | 恃 767 | 蔓 768 | 笙 769 | 茁 770 | 楷 771 | 嚼 772 | 锻 773 | 蕊 774 | 脖 775 | 茍 776 | 壤 777 | 琮 778 | 莽 779 | 塌 780 | 蚤 781 | 膳 782 | 磋 783 | 蓓 784 | 澈 785 | 萎 786 | 擒 787 | 禄 788 | 儡 789 | 懦 790 | 瞻 791 | 虔 792 | 粥 793 | 赦 794 | 畜 795 | 彷 796 | 寥 797 | 揣 798 | 嫖 799 | 朽 800 | 挂 801 | 啄 802 | 浇 803 | 崖 804 | 棠 805 | 禽 806 | 台 807 | 邂 808 | 矫 809 | 茅 810 | 惫 811 | 吠 812 | 苟 813 | 叩 814 | 徊 815 | 巍 816 | 舆 817 | 邵 818 | 彗 819 | 萃 
820 | 拱 821 | 嘶 822 | 貂 823 | 趴 824 | 愿 825 | 脊 826 | 冗 827 | 杆 828 | 蕙 829 | 铎 830 | 囚 831 | 啼 832 | 谤 833 | 徘 834 | 芹 835 | 骆 836 | 夭 837 | 饺 838 | 馒 839 | 溺 840 | 咫 841 | 屐 842 | 绅 843 | 诅 844 | 缉 845 | 渣 846 | 敞 847 | 萱 848 | 丰 849 | 俏 850 | 螃 851 | 蜀 852 | 徽 853 | 逞 854 | 跪 855 | 虞 856 | 隙 857 | 匀 858 | 憧 859 | 辄 860 | 鸳 861 | 疵 862 | 跷 863 | 呱 864 | 穆 865 | 阑 866 | 搏 867 | 肾 868 | 靶 869 | 阱 870 | 囡 871 | 寰 872 | 庄 873 | 蟾 874 | 怠 875 | 腕 876 | 烟 877 | 巾 878 | 奢 879 | 垄 880 | 姨 881 | 躯 882 | 肺 883 | 钰 884 | 佰 885 | 阙 886 | 雏 887 | 溉 888 | 焚 889 | 丑 890 | 锥 891 | 诘 892 | 瞪 893 | 茹 894 | 绊 895 | 蚀 896 | 袱 897 | 煽 898 | 窕 899 | 掷 900 | 沮 901 | 钞 902 | 涕 903 | 浏 904 | 仄 905 | 孰 906 | 峻 907 | 皱 908 | 芦 909 | 膏 910 | 晰 911 | 衬 912 | 谍 913 | 丞 914 | 绽 915 | 蔽 916 | 呕 917 | 轿 918 | 隶 919 | 楠 920 | 匣 921 | 葵 922 | 沫 923 | 刃 924 | 禧 925 | 晦 926 | 哔 927 | 晖 928 | 绣 929 | 仟 930 | 窟 931 | 谛 932 | 瀚 933 | 黛 934 | 忿 935 | 姚 936 | 蜘 937 | 耸 938 | 捍 939 | 斐 940 | 卜 941 | 辗 942 | 刁 943 | 涅 944 | 泓 945 | 梵 946 | 扳 947 | 暇 948 | 袜 949 | 柠 950 | 傍 951 | 逮 952 | 呃 953 | 蜗 954 | 窍 955 | 琉 956 | 喃 957 | 溢 958 | 抉 959 | 旷 960 | 卅 961 | 亟 962 | 膝 963 | 伶 964 | 闇 965 | 莺 966 | 蔚 967 | 醋 968 | 瑛 969 | 拭 970 | 绮 971 | 鑫 972 | 圭 973 | 脂 974 | 酿 975 | 诈 976 | 膨 977 | 隧 978 | 惭 979 | 庚 980 | 衅 981 | 哨 982 | 凋 983 | 里 984 | 祯 985 | 撼 986 | 谭 987 | 稻 988 | 迋 989 | 碌 990 | 罕 991 | 逾 992 | 嗜 993 | 蹲 994 | 檬 995 | 肖 996 | 辖 997 | 襟 998 | 扎 999 | 槟 1000 | 缔 1001 | 袂 1002 | 敷 1003 | 腥 1004 | 喘 1005 | 簿 1006 | 鳖 1007 | 出 1008 | 噢 1009 | 炫 1010 | 佑 1011 | 贷 1012 | 粮 1013 | 荳 1014 | 桦 1015 | 颉 1016 | 哑 1017 | 倪 1018 | 颤 1019 | 御 1020 | 芽 1021 | 朦 1022 | 裹 1023 | 贬 1024 | 蕉 1025 | 蝉 1026 | 赎 1027 | 崔 1028 | 滔 1029 | 茵 1030 | 径 1031 | 克 1032 | 啤 1033 | 拯 1034 | 坟 1035 | 葱 1036 | 芋 1037 | 瞒 1038 | 掠 1039 | 绳 1040 | 蛛 1041 | 匠 1042 | 凸 1043 | 苛 1044 | 押 1045 | 楣 1046 | 芙 1047 | 酌 1048 | 俺 1049 | 掏 1050 | 倡 1051 | 唾 1052 | 瞄 1053 | 磊 1054 | 吼 1055 | 搅 1056 | 溃 1057 | 聆 1058 | 沌 1059 | 蝇 1060 | 鸥 1061 | 妒 1062 | 焕 1063 | 拙 1064 | 夷 1065 | 迄 1066 | 绰 1067 | 锵 1068 | 耿 1069 | 祺 1070 | 吶 1071 | 惶 1072 | 廊 1073 | 兜 1074 | 倩 1075 | 杖 1076 | 窄 1077 | 僚 1078 | 竖 1079 | 芷 1080 | 咚 1081 | 鲢 1082 | 沛 1083 | 挪 1084 | 柄 1085 | 顷 1086 | 璞 1087 | 裸 1088 | 鵰 1089 | 郊 1090 | 屿 1091 | 仕 1092 | 艘 1093 | 铅 1094 | 铝 1095 | 饲 1096 | 黯 1097 | 疫 1098 | 栽 1099 | 喉 1100 | 逗 1101 | 祇 1102 | 阪 1103 | 侍 1104 | 抒 1105 | 弗 1106 | 尬 1107 | 浦 1108 | 鄙 1109 | 盏 1110 | 喽 1111 | 炳 1112 | 卵 1113 | 肌 1114 | 迦 1115 | 擅 1116 | 豹 1117 | 胏 1118 | 炼 1119 | 悸 1120 | 谴 1121 | 贾 1122 | 胀 1123 | 疋 1124 | 矿 1125 | 梨 1126 | 碑 1127 | 髓 1128 | 巢 1129 | 叹 1130 | 屡 1131 | 滩 1132 | 侮 1133 | 橘 1134 | 嘲 1135 | 酬 1136 | 枚 1137 | 氓 1138 | 菌 1139 | 颁 1140 | 萝 1141 | 谘 1142 | 曝 1143 | 薯 1144 | 襄 1145 | 辽 1146 | 萄 1147 | 寇 1148 | 舜 1149 | 颂 1150 | 撰 1151 | 腻 1152 | 崩 1153 | 咕 1154 | 癌 1155 | 歇 1156 | 汰 1157 | 烁 1158 | 撇 1159 | 宴 1160 | 惩 1161 | 烛 1162 | 贰 1163 | 呻 1164 | 呒 1165 | 翩 1166 | 绑 1167 | 捞 1168 | 爹 1169 | 秉 1170 | 棉 1171 | 妓 1172 | 尉 1173 | 霍 1174 | 甫 1175 | 尝 1176 | 葡 1177 | 蒸 1178 | 鸦 1179 | 挚 1180 | 奸 1181 | 纬 1182 | 艰 1183 | 履 1184 | 葬 1185 | 滨 1186 | 耕 1187 | 婴 1188 | 醇 1189 | 堵 1190 | 钉 1191 | 喧 1192 | 遂 1193 | 锣 1194 | 垮 1195 | 蓬 1196 | 薛 1197 | 虐 1198 | 睁 1199 | 厨 1200 | 娶 1201 | 浆 1202 | 挨 1203 | 矢 1204 | 蕾 1205 | 伺 1206 | 券 1207 | 鹏 1208 | 削 1209 | 蓄 1210 | 琦 1211 | 熄 1212 | 湘 1213 | 慌 1214 | 枕 1215 | 衍 1216 | 薇 1217 | 囊 1218 | 喂 1219 | 蕴 1220 | 倘 1221 | 峡 1222 | 浊 1223 | 窃 1224 | 颈 1225 | 裙 1226 | 晕 1227 | 缚 1228 | 获 1229 | 帕 1230 | 脾 1231 | 莹 1232 | 逍 1233 | 姬 1234 | 韦 
1235 | 畔 1236 | 伐 1237 | 霞 1238 | 嘘 1239 | 盐 1240 | 摧 1241 | 债 1242 | 佩 1243 | 畏 1244 | 驴 1245 | 氧 1246 | 奴 1247 | 瘦 1248 | 菊 1249 | 廿 1250 | 狭 1251 | 赴 1252 | 碳 1253 | 坊 1254 | 盆 1255 | 趟 1256 | 匿 1257 | 肇 1258 | 溶 1259 | 揭 1260 | 剥 1261 | 沦 1262 | 秃 1263 | 郝 1264 | 唔 1265 | 锡 1266 | 娇 1267 | 抚 1268 | 屎 1269 | 甩 1270 | 娱 1271 | 表 1272 | 犬 1273 | 魁 1274 | 蒂 1275 | 皓 1276 | 祷 1277 | 瞎 1278 | 瘾 1279 | 煎 1280 | 螺 1281 | 遮 1282 | 坠 1283 | 剎 1284 | 筝 1285 | 棵 1286 | 冤 1287 | 崎 1288 | 昔 1289 | 驼 1290 | 竿 1291 | 甄 1292 | 斑 1293 | 歹 1294 | 骏 1295 | 缝 1296 | 鞭 1297 | 垫 1298 | 淹 1299 | 并 1300 | 遨 1301 | 宠 1302 | 掰 1303 | 枯 1304 | 艇 1305 | 豫 1306 | 募 1307 | 郁 1308 | 稚 1309 | 懿 1310 | 辐 1311 | 酱 1312 | 恕 1313 | 范 1314 | 涂 1315 | 滤 1316 | 肃 1317 | 膜 1318 | 佬 1319 | 哼 1320 | 慨 1321 | 穗 1322 | 辰 1323 | 雁 1324 | 瑟 1325 | 帆 1326 | 拢 1327 | 汁 1328 | 蝴 1329 | 冈 1330 | 诠 1331 | 蹈 1332 | 黏 1333 | 痞 1334 | 屑 1335 | 潇 1336 | 觅 1337 | 钧 1338 | 挣 1339 | 谐 1340 | 霜 1341 | 诊 1342 | 熬 1343 | 讽 1344 | 歧 1345 | 戈 1346 | 闯 1347 | 饶 1348 | 斤 1349 | 婉 1350 | 致 1351 | 贿 1352 | 苑 1353 | 矮 1354 | 毋 1355 | 詹 1356 | 祈 1357 | 咳 1358 | 昱 1359 | 佐 1360 | 帖 1361 | 猩 1362 | 尹 1363 | 诇 1364 | 肆 1365 | 亭 1366 | 丘 1367 | 淘 1368 | 颠 1369 | 勃 1370 | 讶 1371 | 抖 1372 | 袁 1373 | 柱 1374 | 僧 1375 | 蚊 1376 | 匹 1377 | 辣 1378 | 螂 1379 | 澡 1380 | 昧 1381 | 诡 1382 | 槽 1383 | 穴 1384 | 斩 1385 | 聘 1386 | 扶 1387 | 熙 1388 | 驰 1389 | 棍 1390 | 兆 1391 | 蟑 1392 | 矩 1393 | 谬 1394 | 贫 1395 | 鼎 1396 | 践 1397 | 盲 1398 | 眷 1399 | 尿 1400 | 伫 1401 | 饿 1402 | 砸 1403 | 妄 1404 | 荡 1405 | 炒 1406 | 冥 1407 | 偿 1408 | 墓 1409 | 骄 1410 | 毙 1411 | 淋 1412 | 芝 1413 | 胃 1414 | 宅 1415 | 董 1416 | 梭 1417 | 凑 1418 | 宰 1419 | 卑 1420 | 丛 1421 | 纠 1422 | 肢 1423 | 闽 1424 | 铜 1425 | 寺 1426 | 瞬 1427 | 澳 1428 | 庞 1429 | 腔 1430 | 泼 1431 | 昂 1432 | 梁 1433 | 躺 1434 | 姻 1435 | 潭 1436 | 吋 1437 | 撤 1438 | 殖 1439 | 轴 1440 | 颖 1441 | 冻 1442 | 琼 1443 | 恳 1444 | 衫 1445 | 譬 1446 | 猎 1447 | 衰 1448 | 桶 1449 | 辜 1450 | 筒 1451 | 赫 1452 | 仗 1453 | 膀 1454 | 乳 1455 | 嚣 1456 | 划 1457 | 玮 1458 | 卿 1459 | 枉 1460 | 埃 1461 | 跨 1462 | 粹 1463 | 猴 1464 | 愤 1465 | 壹 1466 | 卢 1467 | 尧 1468 | 翰 1469 | 叮 1470 | 媚 1471 | 钮 1472 | 袖 1473 | 斌 1474 | 卓 1475 | 粽 1476 | 雀 1477 | 谦 1478 | 傅 1479 | 殿 1480 | 睹 1481 | 菁 1482 | 桂 1483 | 诱 1484 | 舌 1485 | 惟 1486 | 岗 1487 | 衷 1488 | 屈 1489 | 陋 1490 | 陌 1491 | 宵 1492 | 麟 1493 | 魏 1494 | 贸 1495 | 几 1496 | 埔 1497 | 谎 1498 | 袍 1499 | 卸 1500 | 仓 1501 | 匪 1502 | 叛 1503 | 肠 1504 | 肝 1505 | 俄 1506 | 孕 1507 | 庙 1508 | 嫁 1509 | 肤 1510 | 拦 1511 | 羯 1512 | 匙 1513 | 咏 1514 | 蠢 1515 | 纽 1516 | 拘 1517 | 旨 1518 | 胁 1519 | 馨 1520 | 珊 1521 | 签 1522 | 赔 1523 | 秩 1524 | 喻 1525 | 谜 1526 | 翠 1527 | 芭 1528 | 摊 1529 | 侣 1530 | 灿 1531 | 寡 1532 | 罐 1533 | 贼 1534 | 叙 1535 | 谨 1536 | 体 1537 | 敲 1538 | 浴 1539 | 吻 1540 | 臂 1541 | 袭 1542 | 煮 1543 | 腹 1544 | 暮 1545 | 曹 1546 | 虹 1547 | 抑 1548 | 贩 1549 | 踩 1550 | 澎 1551 | 糖 1552 | 催 1553 | 萍 1554 | 垂 1555 | 斥 1556 | 侬 1557 | 拷 1558 | 唤 1559 | 匆 1560 | 阮 1561 | 飙 1562 | 柴 1563 | 剂 1564 | 妖 1565 | 添 1566 | 畅 1567 | 汗 1568 | 鸭 1569 | 稀 1570 | 晋 1571 | 埋 1572 | 弊 1573 | 返 1574 | 叡 1575 | 娟 1576 | 玻 1577 | 腾 1578 | 栋 1579 | 歪 1580 | 邓 1581 | 渴 1582 | 粒 1583 | 泣 1584 | 疾 1585 | 蓉 1586 | 塑 1587 | 祂 1588 | 储 1589 | 劣 1590 | 柯 1591 | 陶 1592 | 患 1593 | 蛇 1594 | 腐 1595 | 琳 1596 | 慎 1597 | 泊 1598 | 牢 1599 | 呈 1600 | 趁 1601 | 恶 1602 | 浑 1603 | 扮 1604 | 樱 1605 | 臣 1606 | 遵 1607 | 缠 1608 | 虫 1609 | 撒 1610 | 叉 1611 | 刑 1612 | 苗 1613 | 脉 1614 | 盈 1615 | 津 1616 | 愧 1617 | 摔 1618 | 盒 1619 | 丧 1620 | 鹤 1621 | 呦 1622 | 厕 1623 | 斜 1624 | 芒 1625 | 翅 1626 | 悄 1627 | 晃 1628 | 茂 1629 | 寸 
1630 | 杉 1631 | 旺 1632 | 俩 1633 | 雯 1634 | 霖 1635 | 递 1636 | 胶 1637 | 氛 1638 | 谣 1639 | 捉 1640 | 虾 1641 | 秘 1642 | 漠 1643 | 扭 1644 | 贞 1645 | 陵 1646 | 叔 1647 | 轨 1648 | 鹅 1649 | 液 1650 | 妥 1651 | 贱 1652 | 涨 1653 | 滥 1654 | 痕 1655 | 沿 1656 | 秤 1657 | 措 1658 | 巡 1659 | 丈 1660 | 魅 1661 | 欲 1662 | 缸 1663 | 鹿 1664 | 汝 1665 | 迁 1666 | 矣 1667 | 肩 1668 | 烤 1669 | 笛 1670 | 迅 1671 | 劫 1672 | 趋 1673 | 披 1674 | 荷 1675 | 卒 1676 | 丙 1677 | 碗 1678 | 伙 1679 | 椅 1680 | 赞 1681 | 侦 1682 | 灾 1683 | 秦 1684 | 蛙 1685 | 禅 1686 | 慰 1687 | 余 1688 | 朗 1689 | 辱 1690 | 征 1691 | 愚 1692 | 抛 1693 | 挺 1694 | 彭 1695 | 允 1696 | 靖 1697 | 滋 1698 | 凝 1699 | 赠 1700 | 莎 1701 | 顽 1702 | 狠 1703 | 堕 1704 | 翘 1705 | 惹 1706 | 纲 1707 | 贯 1708 | 饼 1709 | 抬 1710 | 逆 1711 | 堪 1712 | 坤 1713 | 斗 1714 | 钦 1715 | 疏 1716 | 羞 1717 | 扇 1718 | 蜂 1719 | 赌 1720 | 驻 1721 | 屏 1722 | 爵 1723 | 轰 1724 | 契 1725 | 悦 1726 | 邻 1727 | 哉 1728 | 陀 1729 | 裂 1730 | 刷 1731 | 毅 1732 | 拾 1733 | 疼 1734 | 阔 1735 | 耍 1736 | 亏 1737 | 吟 1738 | 锐 1739 | 惧 1740 | 锅 1741 | 蝶 1742 | 壳 1743 | 糕 1744 | 舟 1745 | 牧 1746 | 妮 1747 | 粗 1748 | 仇 1749 | 驶 1750 | 促 1751 | 孝 1752 | 裤 1753 | 誉 1754 | 家 1755 | 迈 1756 | 姿 1757 | 踪 1758 | 兔 1759 | 综 1760 | 旭 1761 | 韵 1762 | 齿 1763 | 乔 1764 | 怖 1765 | 晴 1766 | 闷 1767 | 墨 1768 | 咬 1769 | 侧 1770 | 狱 1771 | 琪 1772 | 梯 1773 | 宾 1774 | 枫 1775 | 锦 1776 | 瑜 1777 | 敦 1778 | 矛 1779 | 弘 1780 | 玛 1781 | 茫 1782 | 迪 1783 | 览 1784 | 挤 1785 | 雳 1786 | 岚 1787 | 卷 1788 | 黎 1789 | 薄 1790 | 柳 1791 | 咦 1792 | 廷 1793 | 瞧 1794 | 幅 1795 | 挖 1796 | 唬 1797 | 侯 1798 | 祸 1799 | 饰 1800 | 儒 1801 | 捡 1802 | 筋 1803 | 融 1804 | 耗 1805 | 铃 1806 | 奉 1807 | 鼻 1808 | 坜 1809 | 曼 1810 | 贡 1811 | 嗨 1812 | 炎 1813 | 啡 1814 | 捐 1815 | 炮 1816 | 霹 1817 | 貌 1818 | 鸣 1819 | 饱 1820 | 廉 1821 | 绘 1822 | 咪 1823 | 吝 1824 | 肚 1825 | 云 1826 | 翼 1827 | 氏 1828 | 骚 1829 | 爷 1830 | 寿 1831 | 绕 1832 | 唷 1833 | 牺 1834 | 屠 1835 | 谋 1836 | 彻 1837 | 俱 1838 | 粉 1839 | 雾 1840 | 涵 1841 | 侨 1842 | 础 1843 | 疗 1844 | 署 1845 | 稿 1846 | 涉 1847 | 稣 1848 | 誓 1849 | 箭 1850 | 涯 1851 | 锺 1852 | 迹 1853 | 抄 1854 | 踢 1855 | 贪 1856 | 咖 1857 | 莱 1858 | 夺 1859 | 勉 1860 | 焦 1861 | 蒋 1862 | 桑 1863 | 沧 1864 | 恰 1865 | 泳 1866 | 牲 1867 | 戒 1868 | 恼 1869 | 夕 1870 | 棚 1871 | 爬 1872 | 菲 1873 | 翁 1874 | 奔 1875 | 滴 1876 | 玄 1877 | 捷 1878 | 曰 1879 | 愉 1880 | 逊 1881 | 憾 1882 | 钓 1883 | 壁 1884 | 躲 1885 | 嫌 1886 | 姆 1887 | 乏 1888 | 洛 1889 | 逼 1890 | 磨 1891 | 剪 1892 | 逝 1893 | 亨 1894 | 盼 1895 | 杯 1896 | 敝 1897 | 碍 1898 | 痴 1899 | 植 1900 | 瑰 1901 | 勤 1902 | 悟 1903 | 彬 1904 | 删 1905 | 薪 1906 | 悠 1907 | 胎 1908 | 侵 1909 | 坪 1910 | 赋 1911 | 弯 1912 | 丹 1913 | 巫 1914 | 轩 1915 | 辨 1916 | 吐 1917 | 么 1918 | 盾 1919 | 扯 1920 | 割 1921 | 艾 1922 | 幼 1923 | 捕 1924 | 召 1925 | 怒 1926 | 坡 1927 | 缓 1928 | 猛 1929 | 驾 1930 | 莉 1931 | 彦 1932 | 韩 1933 | 鞋 1934 | 碧 1935 | 泽 1936 | 泉 1937 | 缴 1938 | 跃 1939 | 喇 1940 | 腿 1941 | 糟 1942 | 胆 1943 | 摘 1944 | 朵 1945 | 逛 1946 | 甜 1947 | 拔 1948 | 劲 1949 | 悉 1950 | 穷 1951 | 汤 1952 | 唐 1953 | 臭 1954 | 玲 1955 | 怡 1956 | 舍 1957 | 欺 1958 | 蜜 1959 | 耻 1960 | 坦 1961 | 叭 1962 | 亿 1963 | 忌 1964 | 鲁 1965 | 繁 1966 | 泥 1967 | 伸 1968 | 壮 1969 | 串 1970 | 圾 1971 | 币 1972 | 荒 1973 | 垃 1974 | 妇 1975 | 旦 1976 | 截 1977 | 喷 1978 | 碎 1979 | 吕 1980 | 犹 1981 | 抹 1982 | 脆 1983 | 煞 1984 | 胞 1985 | 晶 1986 | 潜 1987 | 玫 1988 | 妻 1989 | 估 1990 | 陷 1991 | 孔 1992 | 娃 1993 | 兽 1994 | 肥 1995 | 凉 1996 | 岂 1997 | 逻 1998 | 胸 1999 | 杜 2000 | 袋 2001 | 甘 2002 | 邀 2003 | 培 2004 | 龄 2005 | 辆 2006 | 廖 2007 | 冲 2008 | 渡 2009 | 羽 2010 | 秒 2011 | 辞 2012 | 倾 2013 | 窝 2014 | 柏 2015 | 淑 2016 | 诞 2017 | 漏 2018 | 姑 2019 | 托 2020 | 吾 2021 | 纷 2022 | 拆 2023 | 浩 2024 | 税 
2025 | 邱 2026 | 迟 2027 | 筹 2028 | 监 2029 | 汪 2030 | 擎 2031 | 衡 2032 | 狐 2033 | 灰 2034 | 尖 2035 | 番 2036 | 罚 2037 | 证 2038 | 盗 2039 | 祥 2040 | 毫 2041 | 彰 2042 | 扩 2043 | 幽 2044 | 阐 2045 | 喊 2046 | 菩 2047 | 赐 2048 | 奋 2049 | 鲜 2050 | 劝 2051 | 栏 2052 | 慈 2053 | 扫 2054 | 尽 2055 | 穹 2056 | 丌 2057 | 绪 2058 | 砂 2059 | 勿 2060 | 抢 2061 | 啪 2062 | 庸 2063 | 赤 2064 | 饮 2065 | 萨 2066 | 兼 2067 | 访 2068 | 舒 2069 | 裕 2070 | 逸 2071 | 宙 2072 | 丸 2073 | 准 2074 | 魂 2075 | 厚 2076 | 励 2077 | 仰 2078 | 糊 2079 | 顿 2080 | 闭 2081 | 塔 2082 | 枪 2083 | 睛 2084 | 斋 2085 | 奥 2086 | 恭 2087 | 翔 2088 | 遥 2089 | 航 2090 | 孟 2091 | 昌 2092 | 卧 2093 | 颇 2094 | 革 2095 | 邪 2096 | 阻 2097 | 蟹 2098 | 裁 2099 | 后 2100 | 函 2101 | 于 2102 | 拳 2103 | 宽 2104 | 锋 2105 | 州 2106 | 葛 2107 | 拒 2108 | 池 2109 | 镇 2110 | 芬 2111 | 岸 2112 | 寞 2113 | 凭 2114 | 姊 2115 | 殊 2116 | 板 2117 | 勒 2118 | 慕 2119 | 跌 2120 | 踏 2121 | 填 2122 | 陪 2123 | 逐 2124 | 洽 2125 | 描 2126 | 妨 2127 | 仪 2128 | 摄 2129 | 紫 2130 | 谅 2131 | 阅 2132 | 邦 2133 | 麦 2134 | 莲 2135 | 闪 2136 | 纵 2137 | 庭 2138 | 圈 2139 | 榜 2140 | 滑 2141 | 舰 2142 | 面 2143 | 献 2144 | 浅 2145 | 飘 2146 | 宋 2147 | 俗 2148 | 沟 2149 | 巷 2150 | 眠 2151 | 帽 2152 | 惑 2153 | 羊 2154 | 牵 2155 | 净 2156 | 厉 2157 | 撞 2158 | 崇 2159 | 竞 2160 | 回 2161 | 乙 2162 | 聪 2163 | 桃 2164 | 伍 2165 | 役 2166 | 潮 2167 | 损 2168 | 凯 2169 | 锁 2170 | 震 2171 | 醉 2172 | 屁 2173 | 牠 2174 | 孙 2175 | 酷 2176 | 染 2177 | 尺 2178 | 摸 2179 | 盛 2180 | 闹 2181 | 棋 2182 | 吓 2183 | 迫 2184 | 瓜 2185 | 松 2186 | 搬 2187 | 戴 2188 | 瞭 2189 | 乌 2190 | 谱 2191 | 滚 2192 | 赚 2193 | 障 2194 | 逃 2195 | 齐 2196 | 牙 2197 | 怨 2198 | 拖 2199 | 皇 2200 | 贺 2201 | 横 2202 | 塞 2203 | 摆 2204 | 农 2205 | 倍 2206 | 额 2207 | 乘 2208 | 户 2209 | 奈 2210 | 川 2211 | 徐 2212 | 井 2213 | 寝 2214 | 洞 2215 | 劳 2216 | 船 2217 | 域 2218 | 屋 2219 | 胖 2220 | 藉 2221 | 销 2222 | 拼 2223 | 桌 2224 | 忧 2225 | 违 2226 | 拟 2227 | 吵 2228 | 媒 2229 | 辩 2230 | 妙 2231 | 鸿 2232 | 恩 2233 | 映 2234 | 耳 2235 | 傻 2236 | 京 2237 | 搭 2238 | 残 2239 | 稍 2240 | 颜 2241 | 固 2242 | 眉 2243 | 龟 2244 | 哀 2245 | 发 2246 | 沈 2247 | 拨 2248 | 丁 2249 | 愁 2250 | 耐 2251 | 宪 2252 | 覆 2253 | 盟 2254 | 昭 2255 | 握 2256 | 萧 2257 | 延 2258 | 豆 2259 | 弱 2260 | 隆 2261 | 页 2262 | 烧 2263 | 遍 2264 | 距 2265 | 摩 2266 | 祖 2267 | 探 2268 | 倚 2269 | 寂 2270 | 阴 2271 | 悔 2272 | 库 2273 | 嘴 2274 | 沉 2275 | 伊 2276 | 暂 2277 | 霸 2278 | 喵 2279 | 频 2280 | 鼓 2281 | 冒 2282 | 鼠 2283 | 企 2284 | 副 2285 | 菜 2286 | 款 2287 | 忽 2288 | 尾 2289 | 租 2290 | 椰 2291 | 隔 2292 | 狼 2293 | 浮 2294 | 惠 2295 | 峰 2296 | 索 2297 | 芳 2298 | 摇 2299 | 洪 2300 | 伦 2301 | 骨 2302 | 吹 2303 | 郑 2304 | 哩 2305 | 珍 2306 | 纳 2307 | 零 2308 | 哲 2309 | 遭 2310 | 瓶 2311 | 亡 2312 | 振 2313 | 予 2314 | 村 2315 | 旅 2316 | 惨 2317 | 汽 2318 | 爸 2319 | 隐 2320 | 械 2321 | 寒 2322 | 危 2323 | 邮 2324 | 贝 2325 | 阶 2326 | 赖 2327 | 茶 2328 | 谊 2329 | 涛 2330 | 惯 2331 | 尘 2332 | 丝 2333 | 森 2334 | 询 2335 | 露 2336 | 稳 2337 | 桥 2338 | 夏 2339 | 哭 2340 | 坚 2341 | 籍 2342 | 厌 2343 | 苍 2344 | 析 2345 | 冰 2346 | 仙 2347 | 布 2348 | 箱 2349 | 脱 2350 | 贤 2351 | 途 2352 | 订 2353 | 财 2354 | 欧 2355 | 赢 2356 | 枢 2357 | 泪 2358 | 废 2359 | 钢 2360 | 渐 2361 | 泡 2362 | 刊 2363 | 肯 2364 | 恨 2365 | 砍 2366 | 抽 2367 | 股 2368 | 咧 2369 | 婆 2370 | 禁 2371 | 郎 2372 | 默 2373 | 符 2374 | 缩 2375 | 童 2376 | 绿 2377 | 骗 2378 | 辈 2379 | 尼 2380 | 届 2381 | 彼 2382 | 兮 2383 | 聚 2384 | 宇 2385 | 辛 2386 | 疯 2387 | 减 2388 | 米 2389 | 念 2390 | 降 2391 | 街 2392 | 临 2393 | 敏 2394 | 洗 2395 | 玉 2396 | 伴 2397 | 辅 2398 | 诺 2399 | 鸡 2400 | 侠 2401 | 健 2402 | 熊 2403 | 顶 2404 | 挑 2405 | 替 2406 | 豪 2407 | 掌 2408 | 饭 2409 | 银 2410 | 圆 2411 | 志 2412 | 休 2413 | 材 2414 | 灭 2415 | 烈 2416 | 爆 2417 | 透 2418 | 遗 2419 | 虚 
2420 | 醒 2421 | 货 2422 | 雅 2423 | 宏 2424 | 帅 2425 | 宫 2426 | 港 2427 | 偶 2428 | 丢 2429 | 篮 2430 | 凡 2431 | 瑞 2432 | 硕 2433 | 雪 2434 | 忠 2435 | 蔡 2436 | 插 2437 | 积 2438 | 乖 2439 | 挥 2440 | 抗 2441 | 察 2442 | 末 2443 | 盖 2444 | 厅 2445 | 移 2446 | 吸 2447 | 括 2448 | 笨 2449 | 孤 2450 | 译 2451 | 避 2452 | 秀 2453 | 富 2454 | 漂 2455 | 柔 2456 | 私 2457 | 围 2458 | 狮 2459 | 祝 2460 | 庆 2461 | 序 2462 | 拥 2463 | 洲 2464 | 徒 2465 | 借 2466 | 晓 2467 | 嘉 2468 | 诗 2469 | 淡 2470 | 束 2471 | 姓 2472 | 颗 2473 | 勇 2474 | 犯 2475 | 喝 2476 | 食 2477 | 镜 2478 | 偏 2479 | 猜 2480 | 层 2481 | 帐 2482 | 仅 2483 | 购 2484 | 衣 2485 | 申 2486 | 伯 2487 | 紧 2488 | 县 2489 | 婚 2490 | 季 2491 | 敬 2492 | 弃 2493 | 尊 2494 | 蛋 2495 | 鹰 2496 | 熟 2497 | 冠 2498 | 唯 2499 | 混 2500 | 藏 2501 | 河 2502 | 忍 2503 | 窗 2504 | 朝 2505 | 轮 2506 | 册 2507 | 乡 2508 | 敌 2509 | 散 2510 | 沙 2511 | 幻 2512 | 短 2513 | 略 2514 | 批 2515 | 游 2516 | 奖 2517 | 岛 2518 | 逢 2519 | 脸 2520 | 顾 2521 | 督 2522 | 协 2523 | 雷 2524 | 详 2525 | 穿 2526 | 慧 2527 | 巧 2528 | 罢 2529 | 呼 2530 | 暗 2531 | 贴 2532 | 纸 2533 | 歉 2534 | 郭 2535 | 努 2536 | 担 2537 | 蓝 2538 | 训 2539 | 享 2540 | 架 2541 | 济 2542 | 猪 2543 | 派 2544 | 均 2545 | 妈 2546 | 哦 2547 | 宣 2548 | 检 2549 | 鬼 2550 | 灯 2551 | 策 2552 | 梅 2553 | 启 2554 | 嘿 2555 | 洋 2556 | 伟 2557 | 萤 2558 | 磁 2559 | 啰 2560 | 付 2561 | 弄 2562 | 寄 2563 | 钟 2564 | 播 2565 | 险 2566 | 载 2567 | 赏 2568 | 汉 2569 | 块 2570 | 刀 2571 | 铭 2572 | 施 2573 | 卫 2574 | 弹 2575 | 售 2576 | 叶 2577 | 皆 2578 | 罪 2579 | 虎 2580 | 归 2581 | 毛 2582 | 昨 2583 | 荣 2584 | 律 2585 | 树 2586 | 奏 2587 | 注 2588 | 扁 2589 | 笔 2590 | 旁 2591 | 键 2592 | 制 2593 | 莫 2594 | 堆 2595 | 射 2596 | 承 2597 | 波 2598 | 皮 2599 | 释 2600 | 判 2601 | 含 2602 | 既 2603 | 退 2604 | 纪 2605 | 刻 2606 | 肉 2607 | 靠 2608 | 麻 2609 | 湖 2610 | 继 2611 | 诚 2612 | 姐 2613 | 益 2614 | 置 2615 | 惜 2616 | 艺 2617 | 尚 2618 | 纯 2619 | 骂 2620 | 琴 2621 | 漫 2622 | 援 2623 | 缺 2624 | 诸 2625 | 尤 2626 | 忆 2627 | 景 2628 | 府 2629 | 委 2630 | 刘 2631 | 绍 2632 | 虑 2633 | 暴 2634 | 草 2635 | 充 2636 | 授 2637 | 防 2638 | 素 2639 | 房 2640 | 搞 2641 | 典 2642 | 仔 2643 | 父 2644 | 吉 2645 | 招 2646 | 剑 2647 | 脚 2648 | 突 2649 | 牌 2650 | 餐 2651 | 仁 2652 | 酒 2653 | 礼 2654 | 巴 2655 | 丽 2656 | 亮 2657 | 恐 2658 | 述 2659 | 周 2660 | 杂 2661 | 旧 2662 | 套 2663 | 赵 2664 | 堂 2665 | 创 2666 | 母 2667 | 辑 2668 | 络 2669 | 俊 2670 | 毒 2671 | 威 2672 | 冷 2673 | 蛮 2674 | 普 2675 | 登 2676 | 微 2677 | 控 2678 | 爽 2679 | 香 2680 | 坐 2681 | 缘 2682 | 幕 2683 | 兰 2684 | 悲 2685 | 势 2686 | 午 2687 | 睡 2688 | 密 2689 | 垒 2690 | 警 2691 | 宗 2692 | 严 2693 | 阵 2694 | 江 2695 | 亚 2696 | 攻 2697 | 静 2698 | 抱 2699 | 啥 2700 | 急 2701 | 宿 2702 | 剧 2703 | 词 2704 | 忙 2705 | 牛 2706 | 吴 2707 | 陆 2708 | 维 2709 | 激 2710 | 增 2711 | 聊 2712 | 浪 2713 | 状 2714 | 良 --------------------------------------------------------------------------------