├── README.md ├── scripts ├── data │ ├── dict.js │ ├── dict.txt.big │ └── dictionary.js ├── finalseg.js ├── finalseg │ ├── prob_emit.js │ ├── prob_start.js │ └── prob_trans.js ├── main.js └── require.js └── test.html /README.md: -------------------------------------------------------------------------------- 1 | jieba-js 2 | ======== 3 | 4 | A JavaScript Chinese word segmentation tool based on Python Jieba 5 | 6 | To run, import `dictionary.js` and `main.js`, and then call `jieba.cut("我的中文東西。")` to get a result `["我", "的", "中文", "東西", "。"]` -------------------------------------------------------------------------------- /scripts/data/dict.js: -------------------------------------------------------------------------------- 1 | var dict = [ 2 | "1号店", 3, "n", 3 | "1號店", 3, "n", 4 | "4S店", 3, "n", 5 | "4s店", 3, "n", 6 | "AA制", 3, "n", 7 | "AB型", 3, "n", 8 | "AT&T", 3, "nz", 9 | "A型", 3, "n", 10 | "A座", 3, "n", 11 | "A股", 3, "n", 12 | "A輪", 3, "n", 13 | "A轮", 3, "n", 14 | "BB机", 3, "n", 15 | "BB機", 3, "n", 16 | "BP机", 3, "n", 17 | "BP機", 3, "n", 18 | "B型", 3, "n", 19 | "B座", 3, "n", 20 | "B股", 3, "n", 21 | "B超", 3, "n", 22 | "B輪", 3, "n", 23 | "B轮", 3, "n", 24 | "C#", 3, "nz", 25 | "C++", 3, "nz", 26 | "CALL机", 3, "n", 27 | "CALL機", 3, "n", 28 | "CD机", 3, "n", 29 | "CD機", 3, "n", 30 | "CD盒", 3, "n", 31 | "C座", 3, "n", 32 | "C盘", 3, "n", 33 | "C盤", 3, "n", 34 | "C語言", 3, "n", 35 | "C语言", 3, "n", 36 | "D座", 3, "n", 37 | "D版", 3, "n", 38 | "D盘", 3, "n", 39 | "D盤", 3, "n", 40 | "E化", 3, "n", 41 | "E座", 3, "n", 42 | "E盘", 3, "n", 43 | "E盤", 3, "n", 44 | "E通", 3, "n", 45 | "F座", 3, "n", 46 | "F盘", 3, "n", 47 | "F盤", 3, "n", 48 | "G盘", 3, "n", 49 | "G盤", 3, "n", 50 | "H盘", 3, "n", 51 | "H盤", 3, "n", 52 | "H股", 3, "n", 53 | "IC卡", 3, "n", 54 | "IP卡", 3, "n", 55 | "IP地址", 3, "n", 56 | "IP电话", 3, "n", 57 | "IP電話", 3, "n", 58 | "I盘", 3, "n", 59 | "I盤", 3, "n", 60 | "K党", 3, "n", 61 | "K歌之王", 3, "n", 62 | "K黨", 3, "n", 63 | "N年", 3, "n", 64 | "O型", 3, "n", 65 | "PC机", 3, "n", 66 | "PC機", 3, "n", 67 | "PH值", 3, "n", 68 | "QQ号", 3, "n", 69 | "QQ號", 3, "n", 70 | "Q版", 3, "n", 71 | "RSS訂閱", 3, "n", 72 | "RSS订阅", 3, "n", 73 | "SIM卡", 3, "n", 74 | "T台", 3, "n", 75 | "T型台", 3, "n", 76 | "T型臺", 3, "n", 77 | "T恤", 4, "n", 78 | "T恤衫", 3, "n", 79 | "T盘", 3, "n", 80 | "T盤", 3, "n", 81 | "T臺", 3, "n", 82 | "U盘", 3, "n", 83 | "U盤", 3, "n", 84 | "VISA卡", 3, "n", 85 | "X光", 3, "n", 86 | "X光線", 3, "n", 87 | "X光线", 3, "n", 88 | "X射線", 3, "n", 89 | "X射线", 3, "n", 90 | "Z盘", 3, "n", 91 | "Z盤", 3, "n", 92 | "c#", 3, "nz", 93 | "c++", 3, "nz", 94 | "γ射線", 3, "n", 95 | "γ射线", 3, "n", 96 | "䰾", 7, "zg", 97 | "䲁", 17, "zg", 98 | "䴉", 22, "zg", 99 | "一", 217830, "m", 100 | "一一", 1670, "m", 101 | "一一二", 11, "m", 102 | "一一例", 3, "m", 103 | "一一分", 8, "m", 104 | "一一列举", 34, "i", 105 | "一一列舉", 34, "i", 106 | "一一对", 9, "m", 107 | "一一对应", 43, "l", 108 | "一一對", 9, "m", 109 | "一一對應", 43, "l", 110 | "一一記", 2, "m", 111 | "一一记", 2, "m", 112 | "一一道來", 4, "l", 113 | "一一道来", 4, "l", 114 | "一丁", 18, "d", 115 | "一丁不識", 3, "i" 116 | ]; 117 | 118 | define(function (require) { 119 | return dict; 120 | }); -------------------------------------------------------------------------------- /scripts/finalseg.js: -------------------------------------------------------------------------------- 1 | require(["finalseg/prob_emit", "finalseg/prob_start", "finalseg/prob_trans"], function(prob_emit, prob_trans, prob_emit) { 2 | 3 | 4 | } 5 | 6 | define(function (require) { 7 | var re_han = /([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)/, 8 | re_skip = /(\r\n|\s)/; 9 | 10 | def viterbi(obs, states, start_p, trans_p, emit_p): 11 | V = [{}] #tabular 12 | path = {} 13 | for y in states: #init 14 | V[0][y] = start_p[y] + emit_p[y].get(obs[0],MIN_FLOAT) 15 | path[y] = [y] 16 | for t in range(1,len(obs)): 17 | V.append({}) 18 | newpath = {} 19 | for y in states: 20 | em_p = emit_p[y].get(obs[t],MIN_FLOAT) 21 | (prob,state ) = max([(V[t-1][y0] + trans_p[y0].get(y,MIN_FLOAT) + em_p ,y0) for y0 in PrevStatus[y] ]) 22 | V[t][y] =prob 23 | newpath[y] = path[state] + [y] 24 | path = newpath 25 | 26 | (prob, state) = max([(V[len(obs) - 1][y], y) for y in ('E','S')]) 27 | 28 | return (prob, path[state]) 29 | 30 | 31 | def __cut(sentence): 32 | global emit_P 33 | prob, pos_list = viterbi(sentence,('B','M','E','S'), start_P, trans_P, emit_P) 34 | begin, next = 0,0 35 | #print pos_list, sentence 36 | for i,char in enumerate(sentence): 37 | pos = pos_list[i] 38 | if pos=='B': 39 | begin = i 40 | elif pos=='E': 41 | yield sentence[begin:i+1] 42 | next = i+1 43 | elif pos=='S': 44 | yield char 45 | next = i+1 46 | if next= N) { 83 | i += 1; 84 | j = i; 85 | p = trie; 86 | } 87 | } 88 | else { 89 | p = trie; 90 | i += 1; 91 | j = i; 92 | } 93 | } 94 | for (i = 0; i < sentence.length; i++) { 95 | if (!(i in DAG)) { 96 | DAG[i] = [i]; 97 | } 98 | } 99 | return DAG; 100 | } 101 | 102 | var calc = function( sentence, DAG, idx, route ) { 103 | var N = sentence.length; 104 | route[N] = [0.0, '']; 105 | for (idx = N - 1; idx > -1; idx--) { 106 | candidates = []; 107 | candidates_x = []; 108 | for (xi in DAG[idx]) { 109 | var x = DAG[idx][xi]; 110 | var f = ((sentence.substring(idx, x+1) in FREQ) ? FREQ[sentence.substring(idx, x+1)] : min_freq); 111 | candidates.push(f + route[x+1][0]); 112 | candidates_x.push(x); 113 | } 114 | var m = max_of_array(candidates); 115 | console.log('max is', m); 116 | route[idx] = [m, candidates_x[candidates.indexOf(m)]]; 117 | } 118 | } 119 | 120 | var __cut_DAG = function(sentence) { 121 | // finalseg is still to be implemented, 122 | // so this is also unfinished. Use __cut_DAG_NO_HMM 123 | // for now 124 | 125 | var DAG = get_DAG(sentence); 126 | var route = {}; 127 | var yieldValues = []; 128 | 129 | calc(sentence, DAG, 0, route); 130 | 131 | var x = 0, 132 | buf = '', 133 | N = sentence.length; 134 | 135 | while(x < N) { 136 | var y = route[x][1]+1, 137 | l_word = sentence.substring(x, y); 138 | if (y - x == 1) { 139 | buf += l_word; 140 | } 141 | else { 142 | if (buf.length > 0) { 143 | if (buf.length == 1) { 144 | yieldValues.push(buf); 145 | } 146 | else { 147 | if (!(buf in FREQ)) { 148 | var recognized = finalseg.cut(buf); 149 | for (t in recognized) { 150 | yieldValues.push(recognized[t]); 151 | } 152 | } 153 | else { 154 | for (elem in buf) { 155 | yieldValues.push(buf[elem]); 156 | } 157 | } 158 | buf = ""; 159 | } 160 | } 161 | yieldValues.push(l_word); 162 | } 163 | x = y; 164 | } 165 | 166 | 167 | if (buf.length > 0) { 168 | if (buf.length == 1) { 169 | yieldValues.push(buf); 170 | } 171 | else { 172 | if (!(buf in FREQ)) { 173 | var recognized = finalseg.cut(buf); 174 | for (t in recognized) { 175 | yieldValues.push(recognized[t]); 176 | } 177 | } 178 | else { 179 | for (elem in buf) { 180 | yieldValues.push(buf[elem]); 181 | } 182 | } 183 | } 184 | } 185 | return yieldValues; 186 | } 187 | 188 | var __cut_DAG_NO_HMM = function (sentence) { 189 | var re_eng = /[a-zA-Z0-9]/, 190 | route = {}, 191 | yieldValues = []; 192 | 193 | var DAG = get_DAG(sentence); 194 | console.log("DAG", DAG); 195 | calc(sentence, DAG, 0, route); 196 | 197 | console.log(route); 198 | 199 | var x = 0, 200 | buf = '', 201 | N = sentence.length; 202 | 203 | while (x < N) { 204 | y = route[x][1] + 1; 205 | l_word = sentence.substring(x, y); 206 | console.log(l_word, l_word.match(re_eng)) 207 | if (l_word.match(re_eng) && l_word.length == 1) { 208 | buf += l_word; 209 | x = y; 210 | } 211 | else { 212 | if (buf.length > 0) { 213 | yieldValues.push(buf); 214 | buf = ''; 215 | } 216 | yieldValues.push(l_word); 217 | x = y; 218 | } 219 | } 220 | if (buf.length > 0) { 221 | yieldValues.push(buf); 222 | buf = ''; 223 | } 224 | return yieldValues; 225 | } 226 | 227 | var cut = function(sentence){ 228 | var cut_all = false, 229 | HMM = false, 230 | yieldValues = []; 231 | 232 | var re_han = /([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)/, 233 | re_skip = /(\r\n|\s)/; 234 | 235 | var blocks = sentence.split(re_han); 236 | var cut_block = HMM ? __cut_DAG : __cut_DAG_NO_HMM; 237 | 238 | for (b in blocks) { 239 | var blk = blocks[b]; 240 | console.log(b, blk); 241 | if (blk.length == 0) { 242 | continue; 243 | } 244 | 245 | if (blk.match(re_han)) { 246 | var cutted = cut_block(blk); 247 | console.log("matches", cutted); 248 | for (w in cutted) { 249 | var word = cutted[w]; 250 | yieldValues.push(word); 251 | } 252 | } 253 | else { 254 | var tmp = blk.split(re_skip); 255 | for (var i = 0; i < tmp.length; i++) { 256 | var x = tmp[i]; 257 | if (x.match(re_skip)) { 258 | yieldValues.push(x); 259 | } 260 | else if (!cut_all) { 261 | for (xi in x) { 262 | yieldValues.push(x[xi]); 263 | } 264 | } 265 | else { 266 | yieldValues.push(x); 267 | } 268 | } 269 | } 270 | } 271 | return yieldValues; 272 | } 273 | 274 | // initialize when the file loads (no lazy-loading yet): 275 | initialize(); 276 | 277 | console.log(cut("我爸新学会了一项解决日常烦闷的活动,就是把以前的照片抱回办公室扫描保存,弄成电子版的。更无法接受的是,还居然放到网上来,时不时给我两张。这些积尘的化石居然突然重现,简直是招架不住。这个怀旧的阀门一旦打开,那就直到意识模糊都没停下来。")); 278 | }); -------------------------------------------------------------------------------- /scripts/require.js: -------------------------------------------------------------------------------- 1 | /* 2 | RequireJS 2.1.9 Copyright (c) 2010-2012, The Dojo Foundation All Rights Reserved. 3 | Available via the MIT or new BSD license. 4 | see: http://github.com/jrburke/requirejs for details 5 | */ 6 | var requirejs,require,define; 7 | (function(Z){function H(b){return"[object Function]"===L.call(b)}function I(b){return"[object Array]"===L.call(b)}function y(b,c){if(b){var e;for(e=0;ethis.depCount&&!this.defined){if(H(m)){if(this.events.error&&this.map.isDefine||j.onError!==aa)try{d=i.execCb(c,m,b,d)}catch(e){a=e}else d=i.execCb(c,m,b,d);this.map.isDefine&&((b=this.module)&&void 0!==b.exports&&b.exports!== 19 | this.exports?d=b.exports:void 0===d&&this.usingExports&&(d=this.exports));if(a)return a.requireMap=this.map,a.requireModules=this.map.isDefine?[this.map.id]:null,a.requireType=this.map.isDefine?"define":"require",v(this.error=a)}else d=m;this.exports=d;if(this.map.isDefine&&!this.ignore&&(r[c]=d,j.onResourceLoad))j.onResourceLoad(i,this.map,this.depMaps);x(c);this.defined=!0}this.defining=!1;this.defined&&!this.defineEmitted&&(this.defineEmitted=!0,this.emit("defined",this.exports),this.defineEmitComplete= 20 | !0)}}else this.fetch()}},callPlugin:function(){var a=this.map,b=a.id,e=n(a.prefix);this.depMaps.push(e);s(e,"defined",u(this,function(d){var m,e;e=this.map.name;var g=this.map.parentMap?this.map.parentMap.name:null,h=i.makeRequire(a.parentMap,{enableBuildCallback:!0});if(this.map.unnormalized){if(d.normalize&&(e=d.normalize(e,function(a){return c(a,g,!0)})||""),d=n(a.prefix+"!"+e,this.map.parentMap),s(d,"defined",u(this,function(a){this.init([],function(){return a},null,{enabled:!0,ignore:!0})})), 21 | e=l(p,d.id)){this.depMaps.push(d);if(this.events.error)e.on("error",u(this,function(a){this.emit("error",a)}));e.enable()}}else m=u(this,function(a){this.init([],function(){return a},null,{enabled:!0})}),m.error=u(this,function(a){this.inited=!0;this.error=a;a.requireModules=[b];F(p,function(a){0===a.map.id.indexOf(b+"_unnormalized")&&x(a.map.id)});v(a)}),m.fromText=u(this,function(d,c){var e=a.name,g=n(e),B=O;c&&(d=c);B&&(O=!1);q(g);t(k.config,b)&&(k.config[e]=k.config[b]);try{j.exec(d)}catch(ca){return v(A("fromtexteval", 22 | "fromText eval for "+b+" failed: "+ca,ca,[b]))}B&&(O=!0);this.depMaps.push(g);i.completeLoad(e);h([e],m)}),d.load(a.name,h,m,k)}));i.enable(e,this);this.pluginMaps[e.id]=e},enable:function(){T[this.map.id]=this;this.enabling=this.enabled=!0;y(this.depMaps,u(this,function(a,b){var c,d;if("string"===typeof a){a=n(a,this.map.isDefine?this.map:this.map.parentMap,!1,!this.skipMap);this.depMaps[b]=a;if(c=l(N,a.id)){this.depExports[b]=c(this);return}this.depCount+=1;s(a,"defined",u(this,function(a){this.defineDep(b, 23 | a);this.check()}));this.errback&&s(a,"error",u(this,this.errback))}c=a.id;d=p[c];!t(N,c)&&(d&&!d.enabled)&&i.enable(a,this)}));F(this.pluginMaps,u(this,function(a){var b=l(p,a.id);b&&!b.enabled&&i.enable(a,this)}));this.enabling=!1;this.check()},on:function(a,b){var c=this.events[a];c||(c=this.events[a]=[]);c.push(b)},emit:function(a,b){y(this.events[a],function(a){a(b)});"error"===a&&delete this.events[a]}};i={config:k,contextName:b,registry:p,defined:r,urlFetched:S,defQueue:G,Module:X,makeModuleMap:n, 24 | nextTick:j.nextTick,onError:v,configure:function(a){a.baseUrl&&"/"!==a.baseUrl.charAt(a.baseUrl.length-1)&&(a.baseUrl+="/");var b=k.pkgs,c=k.shim,d={paths:!0,config:!0,map:!0};F(a,function(a,b){d[b]?"map"===b?(k.map||(k.map={}),Q(k[b],a,!0,!0)):Q(k[b],a,!0):k[b]=a});a.shim&&(F(a.shim,function(a,b){I(a)&&(a={deps:a});if((a.exports||a.init)&&!a.exportsFn)a.exportsFn=i.makeShimExports(a);c[b]=a}),k.shim=c);a.packages&&(y(a.packages,function(a){a="string"===typeof a?{name:a}:a;b[a.name]={name:a.name, 25 | location:a.location||a.name,main:(a.main||"main").replace(ja,"").replace(ea,"")}}),k.pkgs=b);F(p,function(a,b){!a.inited&&!a.map.unnormalized&&(a.map=n(b))});if(a.deps||a.callback)i.require(a.deps||[],a.callback)},makeShimExports:function(a){return function(){var b;a.init&&(b=a.init.apply(Z,arguments));return b||a.exports&&ba(a.exports)}},makeRequire:function(a,f){function h(d,c,e){var g,k;f.enableBuildCallback&&(c&&H(c))&&(c.__requireJsBuild=!0);if("string"===typeof d){if(H(c))return v(A("requireargs", 26 | "Invalid require call"),e);if(a&&t(N,d))return N[d](p[a.id]);if(j.get)return j.get(i,d,a,h);g=n(d,a,!1,!0);g=g.id;return!t(r,g)?v(A("notloaded",'Module name "'+g+'" has not been loaded yet for context: '+b+(a?"":". Use require([])"))):r[g]}K();i.nextTick(function(){K();k=q(n(null,a));k.skipMap=f.skipMap;k.init(d,c,e,{enabled:!0});C()});return h}f=f||{};Q(h,{isBrowser:z,toUrl:function(b){var f,e=b.lastIndexOf("."),g=b.split("/")[0];if(-1!==e&&(!("."===g||".."===g)||1h.attachEvent.toString().indexOf("[native code"))&&!W?(O=!0,h.attachEvent("onreadystatechange",b.onScriptLoad)):(h.addEventListener("load",b.onScriptLoad,!1),h.addEventListener("error", 34 | b.onScriptError,!1)),h.src=e,K=h,C?x.insertBefore(h,C):x.appendChild(h),K=null,h;if(da)try{importScripts(e),b.completeLoad(c)}catch(l){b.onError(A("importscripts","importScripts failed for "+c+" at "+e,l,[c]))}};z&&!s.skipDataMain&&M(document.getElementsByTagName("script"),function(b){x||(x=b.parentNode);if(J=b.getAttribute("data-main"))return q=J,s.baseUrl||(D=q.split("/"),q=D.pop(),fa=D.length?D.join("/")+"/":"./",s.baseUrl=fa),q=q.replace(ea,""),j.jsExtRegExp.test(q)&&(q=J),s.deps=s.deps?s.deps.concat(q): 35 | [q],!0});define=function(b,c,e){var h,j;"string"!==typeof b&&(e=c,c=b,b=null);I(c)||(e=c,c=null);!c&&H(e)&&(c=[],e.length&&(e.toString().replace(la,"").replace(ma,function(b,e){c.push(e)}),c=(1===e.length?["require"]:["require","exports","module"]).concat(c)));if(O){if(!(h=K))P&&"interactive"===P.readyState||M(document.getElementsByTagName("script"),function(b){if("interactive"===b.readyState)return P=b}),h=P;h&&(b||(b=h.getAttribute("data-requiremodule")),j=E[h.getAttribute("data-requirecontext")])}(j? 36 | j.defQueue:R).push([b,c,e])};define.amd={jQuery:!0};j.exec=function(b){return eval(b)};j(s)}})(this); 37 | -------------------------------------------------------------------------------- /test.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | --------------------------------------------------------------------------------