├── index.js
├── .travis.yml
├── .gitignore
├── .npmignore
├── binding.gyp
├── test
    ├── segment.js
    ├── pos_tagger.js
    └── query_segment.js
├── src
    ├── CppJieba
    │   ├── ISegment.hpp
    │   ├── Limonp
    │   │   ├── InitOnOff.hpp
    │   │   ├── NonCopyable.hpp
    │   │   ├── HandyMacro.hpp
    │   │   ├── Condition.hpp
    │   │   ├── Thread.hpp
    │   │   ├── MutexLock.hpp
    │   │   ├── BoundedQueue.hpp
    │   │   ├── ArgvContext.hpp
    │   │   ├── Logger.hpp
    │   │   ├── CastFloat.hpp
    │   │   ├── ThreadPool.hpp
    │   │   ├── Config.hpp
    │   │   ├── StdExtension.hpp
    │   │   ├── BlockingQueue.hpp
    │   │   ├── MysqlClient.hpp
    │   │   ├── LocalVector.hpp
    │   │   ├── StringUtil.hpp
    │   │   └── Md5.hpp
    │   ├── TransCode.hpp
    │   ├── SegmentBase.hpp
    │   ├── PosTagger.hpp
    │   ├── QuerySegment.hpp
    │   ├── MixSegment.hpp
    │   ├── MPSegment.hpp
    │   ├── FullSegment.hpp
    │   ├── KeywordExtractor.hpp
    │   ├── DictTrie.hpp
    │   ├── Trie.hpp
    │   └── HMMSegment.hpp
    ├── utils.h
    ├── mix_segment.h
    ├── pos_tagger.h
    ├── query_segment.h
    ├── mix_segment.cpp
    ├── pos_tagger.cpp
    ├── segment.cpp
    └── query_segment.cpp
├── package.json
├── ChangeLog.md
├── LICENSE
└── README.md


/index.js:
--------------------------------------------------------------------------------
// Package entry point: re-export the compiled native addon (the "segment"
// target described in binding.gyp) without adding any JavaScript wrapper.
module.exports = require("./build/Release/segment");
3 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | language: node_js
 2 | node_js:
 3 |   - "0.10"
 4 | notifications:
 5 |   recipients:
 6 |     - wuyanyi09@foxmail.com
 7 |   email:
 8 |     on_success: change
 9 |     on_failure: always
10 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | tags
 2 | build
 3 | *.demo
 4 | *swp
 5 | *.out
 6 | *.o
 7 | *.d
 8 | *.ut
 9 | log
10 | main
11 | lib*.a
12 | *_demo
13 | segdict*
14 | tmp
15 | t.*
16 | *.pid
17 | node_modules
18 | npm-debug.log
19 | 


--------------------------------------------------------------------------------
/.npmignore:
--------------------------------------------------------------------------------
 1 | *.swp
 2 | .*.swp
 3 | npm-debug.log
 4 | node_modules
 5 | 
 6 | # don't need these in the npm package.
 7 | html/*.png
 8 | 
 9 | # don't ignore .npmignore files
10 | # these are used in some tests.
11 | !.npmignore
12 | 
13 | *.pyc
14 | 


--------------------------------------------------------------------------------
/binding.gyp:
--------------------------------------------------------------------------------
# node-gyp build description for the native addon target "segment".
{
  "targets": [
    {
      "target_name": "segment",
      # Only the addon glue sources are listed here.
      "sources": [ "./src/segment.cpp", "./src/mix_segment.cpp", "./src/query_segment.cpp", "./src/pos_tagger.cpp" ],
      # Extra compiler flag: defines LOGGER_LEVEL as LL_WARN.
      "cflags": [
        "-DLOGGER_LEVEL=LL_WARN"
      ],
      # `<!(...)` is gyp command expansion: whatever the node command prints
      # when loading 'nan' is used as the include directory.
      "include_dirs" : [
        "<!(node -e \"require('nan')\")"
      ],
    }
  ]
}
15 | 


--------------------------------------------------------------------------------
/test/segment.js:
--------------------------------------------------------------------------------
 1 | var segment = require("../index.js");
 2 | segment.loadDict("./dict/jieba.dict.utf8", "./dict/hmm_model.utf8");
 3 | segment.cut("非阻塞的南京市长江大桥",  function(tl){
 4 | 	for(var i = 0; i < tl.length; i++) {
 5 | 		console.log(i + " => " + tl[i]);
 6 | 	}
 7 | });
 8 | var tl = segment.cutSync("阻塞的南京市长江大桥");
 9 | for(var i = 0; i < tl.length; i++) {
10 | 		console.log(i + " == " + tl[i]);
11 | }
12 | 
13 | 


--------------------------------------------------------------------------------
/test/pos_tagger.js:
--------------------------------------------------------------------------------
 1 | var segment = require("../index.js");
 2 | segment.taggerLoadDict("./dict/jieba.dict.utf8", "./dict/hmm_model.utf8");
 3 | segment.tag("非阻塞的南京市长江大桥",  function(tl){
 4 | 	for(var i = 0; i < tl.length; i++) {
 5 | 		console.log(i + " => " + tl[i]);
 6 | 	}
 7 | });
 8 | var tl = segment.tagSync("阻塞的南京市长江大桥");
 9 | for(var i = 0; i < tl.length; i++) {
10 | 		console.log(i + " == " + tl[i]);
11 | }
12 | 
13 | 


--------------------------------------------------------------------------------
/src/CppJieba/ISegment.hpp:
--------------------------------------------------------------------------------
 1 | #ifndef CPPJIEBA_SEGMENTINTERFACE_H
 2 | #define CPPJIEBA_SEGMENTINTERFACE_H
 3 | 
 4 | 
 5 | namespace CppJieba
 6 | {
 7 |     class ISegment
 8 |     {
 9 |         public:
10 |             virtual ~ISegment(){};
11 |         public:
12 |             virtual bool cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector<string>& res) const = 0;
13 |             virtual bool cut(const string& str, vector<string>& res) const = 0;
14 |     };
15 | }
16 | 
17 | #endif
18 | 


--------------------------------------------------------------------------------
/test/query_segment.js:
--------------------------------------------------------------------------------
 1 | var segment = require("../index.js");
 2 | // 第三个参数是分词的粒度阈值，当词长大于3时，会进行细粒度的再切割，不填时默认阈值是4。
 3 | segment.queryLoadDict("./dict/jieba.dict.utf8", "./dict/hmm_model.utf8", 3);
 4 | console.log("非阻塞的:");
 5 | segment.queryCut("小明硕士毕业于中国科学院计算所，后在日本京都大学深造",  function(tl){
 6 | 	for(var i = 0; i < tl.length; i++) {
 7 | 		console.log(i + " => " + tl[i]);
 8 | 	}
 9 | });
10 | console.log("阻塞的:");
11 | var tl = segment.queryCutSync("小明硕士毕业于中国科学院计算所，后在日本京都大学深造");
12 | for(var i = 0; i < tl.length; i++) {
13 | 		console.log(i + " == " + tl[i]);
14 | }
15 | 
16 | 


--------------------------------------------------------------------------------
/src/CppJieba/Limonp/InitOnOff.hpp:
--------------------------------------------------------------------------------
 1 | #ifndef LIMONP_INITONOFF_H
 2 | #define LIMONP_INITONOFF_H
 3 | 
 4 | namespace Limonp
 5 | {
 6 |     class InitOnOff
 7 |     {
 8 |         public:
 9 |             InitOnOff():isInited_(false){};
10 |             ~InitOnOff(){};
11 |         protected:
12 |             bool isInited_;
13 |             bool getInitFlag_()const{return isInited_;};
14 |             bool setInitFlag_(bool flag){return isInited_ = flag;};
15 |         public:
16 |             operator bool() const {return getInitFlag_();};
17 | 
18 |     };
19 | }
20 | 
21 | #endif
22 | 


--------------------------------------------------------------------------------
/src/CppJieba/Limonp/NonCopyable.hpp:
--------------------------------------------------------------------------------
 1 | /************************************
 2 |  ************************************/
 3 | #ifndef LIMONP_NONCOPYABLE_H
 4 | #define LIMONP_NONCOPYABLE_H
 5 | 
 6 | #include <iostream>
 7 | #include <string>
 8 | 
 9 | namespace Limonp
10 | {
11 |     class NonCopyable
12 |     {
13 |         protected:
14 |             NonCopyable(){};
15 |             ~NonCopyable(){};
16 |         private:
17 |             NonCopyable(const NonCopyable& );
18 |             const NonCopyable& operator=(const NonCopyable& );
19 |     };
20 | }
21 | 
22 | #endif
23 | 


--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "nodejieba",
 3 |   "description": "chinese segment for node",
 4 |   "version": "0.2.5",
 5 |   "author": "Yanyi Wu <wuyanyi09@gmail.com>",
 6 |   "maintainers": [
 7 |     "aszxqw <wuyanyi09@gmail.com>"
 8 |   ],
 9 |   "main": "./index.js",
10 |   "engines": {
11 |     "node": "0.10.x"
12 |   },
13 |   "repository": {
14 |     "type": "git",
15 |     "url": "http://github.com/aszxqw/nodejieba.git"
16 |   },
17 |   "keywords": [
18 |     "chinese",
19 |     "segment",
20 |     "cppjieba",
21 |     "jieba"
22 |   ],
23 |   "dependencies": {
24 |     "nan": "~1.2.0"
25 |   },
26 |   "devDependencies": {},
27 |   "scripts": {
28 |       "test": "node test/segment.js && node test/query_segment.js && node test/pos_tagger.js"
29 |   },
30 |   "license": "MIT"
31 | }
32 | 


--------------------------------------------------------------------------------
/ChangeLog.md:
--------------------------------------------------------------------------------
 1 | ## v0.2.5
 2 | 
 3 | * 增加词性标注功能
 4 | 
 5 | ## v0.2.4
 6 | 
 7 | * 更新 package 兼容更低版本的 npm
 8 | 
 9 | ## v0.2.3
10 | 
11 | * 更新 cppjieba ，减少内存使用。
12 | 
13 | ## v0.2.2
14 | 
15 | * 在queryLoadDict 函数中增加query模式的粒度阈值作为可选参数。 
16 | 
17 | ## v0.2.1
18 | 
19 | * 增加搜索引擎分词模式，分别对应的调用函数是 `queryLoadDict, queryCutSync, queryCut`。 
20 | 
21 | ## v0.2.0
22 | 
23 | * 将原来的 cut 阻塞分词模式改为非阻塞模式
24 | * 阻塞分词模型的函数名为 cutSync
25 | 
26 | ## v0.1.4
27 | 
28 | * 修复关于较低版本编译器需要使用`tr1/unordered_map`导致和`node-gyp`编译选项`-fno-rtti`冲突的编译错误问题。
29 | 
30 | ## v0.1.3
31 | 
32 | * 更新[CppJieba]，支持更低版本的g++。
33 | 
34 | ## v0.1.2
35 | 
36 | * 更新[CppJieba]，使用`less_memory`这个branch来减少Trie树内存的开销。
37 | 
38 | ## v0.1.1
39 | 
40 | * 依照node的c++扩展的常规写法，对CppJieba进行简单的包装，并已`npm publish`
41 | 
42 | [CppJieba]:http://github.com/aszxqw/cppjieba.git
43 | 


--------------------------------------------------------------------------------
/src/utils.h:
--------------------------------------------------------------------------------
 1 | #ifndef NODEJIEBA_SRC_UTLS_H
 2 | #define NODEJIEBA_SRC_UTLS_H
 3 | 
 4 | #include <node.h>
 5 | #include <v8.h>
 6 | #include <nan.h>
 7 | #include <string.h>
 8 | #include <iostream>
 9 | #include <string>
10 | #include <vector>
11 | 
12 | using namespace std;
13 | using namespace v8;
14 | 
15 | inline void WrapVector(vector<string> &ov, Local<Array> &array) {
16 |     array = Array::New(ov.size());
17 |     for(size_t i = 0; i < ov.size(); i++) {
18 |         array->Set(i, String::New(ov[i].c_str()));
19 |     }
20 | }
21 | 
22 | inline void WrapPairVector(vector<pair<string,string> > &ov, Local<Array> &array) {
23 |     array = Array::New(ov.size());
24 |     for(size_t i = 0; i < ov.size(); i++) {
25 |         array->Set(i, String::New((ov[i].first + ":" + ov[i].second).c_str()));
26 |     }
27 | }
28 | 
29 | inline string ValueToString(Local<Value> val) {
30 |     String::Utf8Value su(val);
31 |     return string(*su);
32 | }
33 | 
34 | #endif
35 | 


--------------------------------------------------------------------------------
/src/mix_segment.h:
--------------------------------------------------------------------------------
 1 | #ifndef NODEJIEAB_SRC_MIX_SEGMENT_H
 2 | #define NODEJIEAB_SRC_MIX_SEGMENT_H
 3 | #include "utils.h"
 4 | #include "CppJieba/MixSegment.hpp"
 5 | 
 6 | extern CppJieba::MixSegment segment;
 7 | 
 8 | extern NAN_METHOD(loadDict);
 9 | extern NAN_METHOD(cutSync);
10 | extern NAN_METHOD(cut);
11 | 
12 | class CutWorker : public NanAsyncWorker {
13 |     public:
14 |         CutWorker(NanCallback *callback, string inputStr)
15 |             : NanAsyncWorker(callback), inputStr(inputStr) {}
16 | 
17 |         ~CutWorker() {}
18 | 
19 | 
20 |         void Execute () {
21 |             segment.cut(inputStr, outputWords);
22 |         }
23 | 
24 |         void HandleOKCallback () {
25 |             NanScope();
26 |             Local<Value> args[1];
27 |             Local<Array> wordList;
28 |             WrapVector(outputWords, wordList);
29 |             args[0] = wordList;
30 |             callback->Call(1, args);
31 |         }
32 | 
33 |     private:
34 |         string inputStr;
35 |         vector<string> outputWords;
36 | };
37 | 
38 | #endif
39 | 


--------------------------------------------------------------------------------
/src/pos_tagger.h:
--------------------------------------------------------------------------------
 1 | #ifndef NODEJIEAB_SRC_POSTAGGER_H
 2 | #define NODEJIEAB_SRC_POSTAGGER_H
 3 | #include "utils.h"
 4 | #include "CppJieba/PosTagger.hpp"
 5 | 
 6 | extern CppJieba::PosTagger tagger;
 7 | 
 8 | extern NAN_METHOD(taggerLoadDict);
 9 | extern NAN_METHOD(tagSync);
10 | extern NAN_METHOD(tag);
11 | 
12 | class TaggerWorker : public NanAsyncWorker {
13 |     public:
14 |         TaggerWorker(NanCallback *callback, string inputStr)
15 |             : NanAsyncWorker(callback), inputStr(inputStr) {}
16 | 
17 |         ~TaggerWorker() {}
18 | 
19 | 
20 |         void Execute () {
21 |             tagger.tag(inputStr, outputWords);
22 |         }
23 | 
24 |         void HandleOKCallback () {
25 |             NanScope();
26 |             Local<Value> args[1];
27 |             Local<Array> wordList;
28 |             WrapPairVector(outputWords, wordList);
29 |             args[0] = wordList;
30 |             callback->Call(1, args);
31 |         }
32 | 
33 |     private:
34 |         string inputStr;
35 |         vector<pair<string, string> > outputWords;
36 | };
37 | 
38 | #endif
39 | 


--------------------------------------------------------------------------------
/src/query_segment.h:
--------------------------------------------------------------------------------
 1 | #ifndef NODEJIEAB_SRC_QUERY_SEGMENT_H
 2 | #define NODEJIEAB_SRC_QUERY_SEGMENT_H
 3 | 
 4 | #include "utils.h"
 5 | #include "CppJieba/QuerySegment.hpp"
 6 | 
 7 | extern CppJieba::QuerySegment querySegment;
 8 | 
 9 | extern NAN_METHOD(queryLoadDict);
10 | extern NAN_METHOD(queryCutSync);
11 | extern NAN_METHOD(queryCut);
12 | 
13 | class QueryCutWorker : public NanAsyncWorker {
14 |     public:
15 |         QueryCutWorker(NanCallback *callback, string inputStr)
16 |             : NanAsyncWorker(callback), inputStr(inputStr) {}
17 | 
18 |         ~QueryCutWorker() {}
19 | 
20 | 
21 |         void Execute () {
22 |             querySegment.cut(inputStr, outputWords);
23 |         }
24 | 
25 |         void HandleOKCallback () {
26 |             NanScope();
27 |             Local<Value> args[1];
28 |             Local<Array> wordList;
29 |             WrapVector(outputWords, wordList);
30 |             args[0] = wordList;
31 |             callback->Call(1, args);
32 |         }
33 | 
34 |     private:
35 |         string inputStr;
36 |         vector<string> outputWords;
37 | };
38 | 
39 | #endif
40 | 


--------------------------------------------------------------------------------
/src/mix_segment.cpp:
--------------------------------------------------------------------------------
 1 | #include "mix_segment.h"
 2 | 
 3 | CppJieba::MixSegment segment;
 4 | 
 5 | NAN_METHOD (cutSync) {
 6 |     NanScope();
 7 | 
 8 |     String::Utf8Value param1(args[0]->ToString());
 9 |     vector<string> words;
10 | 
11 |     segment.cut(*param1, words); 
12 | 
13 |     Local<Array> outArray;
14 |     WrapVector(words, outArray);
15 | 
16 |     NanReturnValue(outArray);
17 | }
18 | NAN_METHOD (loadDict) {
19 |     NanScope();
20 |     String::Utf8Value param0(args[0]->ToString());
21 |     String::Utf8Value param1(args[1]->ToString());
22 |     NanReturnValue (Boolean::New(segment.init(*param0, *param1)));
23 | }
24 | 
25 | NAN_METHOD (cut) { 
26 |     NanScope();
27 |     if (args.Length() == 2){
28 |         string inputStr = ValueToString(args[0]);
29 |         Local<Function> callback = args[1].As<Function>();
30 | 
31 |         NanCallback* nanCallback = new NanCallback(callback);
32 |         CutWorker* worker = new CutWorker(nanCallback, inputStr);
33 |         NanAsyncQueueWorker(worker);
34 |     }
35 |     else {
36 |         NanThrowTypeError("argc must equals to 2");
37 |     }
38 |     NanReturnUndefined();
39 | }
40 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2013 Yanyi Wu
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of
 6 | this software and associated documentation files (the "Software"), to deal in
 7 | the Software without restriction, including without limitation the rights to
 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
 9 | the Software, and to permit persons to whom the Software is furnished to do so,
10 | subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
17 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
18 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
19 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
20 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21 | 


--------------------------------------------------------------------------------
/src/CppJieba/Limonp/HandyMacro.hpp:
--------------------------------------------------------------------------------
 1 | #ifndef LIMONP_HANDY_MACRO_HPP
 2 | #define LIMONP_HANDY_MACRO_HPP
 3 | 
 4 | #include <cstdio>
 5 | #include <cstdlib>
 6 | 
 7 | #define LIMONP_CHECK(exp) \
 8 |     if(!(exp)){fprintf(stderr, "File:%s, Line:%d Exp:[" #exp "] is true, abort.\n", __FILE__, __LINE__); abort();}
 9 | 
10 | #define print(x) cout<< #x": " << x <<endl
11 | /*
12 | #define XX_GET_SET(varType, varName, funName)\
13 | private: varType varName;\
14 | public: inline varType get##funName(void) const {return varName;}\
15 | public: inline void set##funName(varType var) {varName = var;}
16 | 
17 | #define XX_GET(varType, varName, funName)\
18 | private: varType varName;\
19 | public: inline varType get##funName(void) const {return varName;}
20 | 
21 | #define XX_SET(varType, varName, funName)\
22 | private: varType varName;\
23 | public: inline void set##funName(varType var) {varName = var;}
24 | 
25 | #define XX_GET_SET_BY_REF(varType, varName, funName)\
26 | private: varType varName;\
27 | public: inline const varType& get##funName(void) const {return varName;}\
28 | public: inline void set##funName(const varType& var){varName = var;}
29 | */
30 | 
31 | #endif
32 | 


--------------------------------------------------------------------------------
/src/pos_tagger.cpp:
--------------------------------------------------------------------------------
 1 | #include "pos_tagger.h"
 2 | 
 3 | CppJieba::PosTagger tagger;
 4 | 
 5 | NAN_METHOD (tagSync) {
 6 |     NanScope();
 7 | 
 8 |     String::Utf8Value param1(args[0]->ToString());
 9 |     vector<pair<string, string> > words;
10 | 
11 |     tagger.tag(*param1, words); 
12 | 
13 |     Local<Array> outArray;
14 |     WrapPairVector(words, outArray);
15 | 
16 |     NanReturnValue(outArray);
17 | }
18 | NAN_METHOD (taggerLoadDict) {
19 |     NanScope();
20 |     String::Utf8Value param0(args[0]->ToString());
21 |     String::Utf8Value param1(args[1]->ToString());
22 |     tagger.init(*param0, *param1);
23 |     NanReturnValue (Boolean::New(true));
24 | }
25 | 
26 | NAN_METHOD (tag) { 
27 |     NanScope();
28 |     if (args.Length() == 2){
29 |         string inputStr = ValueToString(args[0]);
30 |         Local<Function> callback = args[1].As<Function>();
31 | 
32 |         NanCallback* nanCallback = new NanCallback(callback);
33 |         TaggerWorker* worker = new TaggerWorker(nanCallback, inputStr);
34 |         NanAsyncQueueWorker(worker);
35 |     }
36 |     else {
37 |         NanThrowTypeError("argc must equals to 2");
38 |     }
39 |     NanReturnUndefined();
40 | }
41 | 


--------------------------------------------------------------------------------
/src/segment.cpp:
--------------------------------------------------------------------------------
 1 | #include "mix_segment.h"
 2 | #include "query_segment.h"
 3 | #include "pos_tagger.h"
 4 | 
 5 | void init(Handle<Object> exports) {
 6 |     exports->Set(NanNew("loadDict"),
 7 |             NanNew<FunctionTemplate>(loadDict)->GetFunction());
 8 |     exports->Set(NanNew("cutSync"),
 9 |             NanNew<FunctionTemplate>(cutSync)->GetFunction());
10 |     exports->Set(NanNew("cut"), 
11 |             NanNew<FunctionTemplate>(cut)->GetFunction());
12 |     
13 |     exports->Set(NanNew("queryLoadDict"),
14 |             NanNew<FunctionTemplate>(queryLoadDict)->GetFunction());
15 |     exports->Set(NanNew("queryCutSync"),
16 |             NanNew<FunctionTemplate>(queryCutSync)->GetFunction());
17 |     exports->Set(NanNew("queryCut"), 
18 |             NanNew<FunctionTemplate>(queryCut)->GetFunction());
19 | 
20 |     exports->Set(NanNew("taggerLoadDict"),
21 |             NanNew<FunctionTemplate>(taggerLoadDict)->GetFunction());
22 |     exports->Set(NanNew("tagSync"),
23 |             NanNew<FunctionTemplate>(tagSync)->GetFunction());
24 |     exports->Set(NanNew("tag"),
25 |             NanNew<FunctionTemplate>(tag)->GetFunction());
26 | }
27 | 
28 | NODE_MODULE(segment, init)
29 | 


--------------------------------------------------------------------------------
/src/CppJieba/Limonp/Condition.hpp:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * https://github.com/chenshuo/muduo/blob/master/muduo/base/Condition.h
 3 |  */
 4 | 
 5 | #ifndef LIMONP_CONDITION_HPP
 6 | #define LIMONP_CONDITION_HPP
 7 | 
 8 | #include "MutexLock.hpp"
 9 | 
10 | namespace Limonp
11 | {
    /**
     * Thin wrapper around a pthread condition variable bound to one
     * MutexLock. Every pthread call is wrapped in LIMONP_CHECK, so a
     * non-zero return code aborts the process.
     */
    class Condition : NonCopyable
    {
        public:
            // Keeps a reference to `mutex` (it is not copied) and initialises
            // the condition variable with default attributes.
            explicit Condition(MutexLock& mutex)
                : mutex_(mutex)
            {
                LIMONP_CHECK(!pthread_cond_init(&pcond_, NULL));
            }

            ~Condition()
            {
                LIMONP_CHECK(!pthread_cond_destroy(&pcond_));
            }

            // Waits on the condition using the bound mutex.
            void wait()
            {
                LIMONP_CHECK(!pthread_cond_wait(&pcond_, mutex_.getPthreadMutex()));
            }

            // Signals the condition (pthread_cond_signal).
            void notify()
            {
                LIMONP_CHECK(!pthread_cond_signal(&pcond_));
            }

            // Broadcasts the condition (pthread_cond_broadcast).
            void notifyAll()
            {
                LIMONP_CHECK(!pthread_cond_broadcast(&pcond_));
            }

        private:
            MutexLock& mutex_;
            pthread_cond_t pcond_;
    };
45 | 
46 | }
47 | 
48 | #endif
49 | 


--------------------------------------------------------------------------------
/src/query_segment.cpp:
--------------------------------------------------------------------------------
 1 | #include "query_segment.h"
 2 | 
 3 | CppJieba::QuerySegment querySegment;
 4 | 
 5 | NAN_METHOD (queryLoadDict) {
 6 |     NanScope();
 7 |     String::Utf8Value param0(args[0]->ToString());
 8 |     String::Utf8Value param1(args[1]->ToString());
 9 |     int param2 = args.Length() >= 3 ? args[2]->Int32Value() : 4;
10 |     NanReturnValue (Boolean::New(querySegment.init(*param0, *param1, param2)));
11 | }
12 | 
13 | NAN_METHOD (queryCutSync) {
14 |     NanScope();
15 | 
16 |     String::Utf8Value param1(args[0]->ToString());
17 |     vector<string> words;
18 | 
19 |     querySegment.cut(*param1, words); 
20 | 
21 |     Local<Array> outArray;
22 |     WrapVector(words, outArray);
23 | 
24 |     NanReturnValue(outArray);
25 | }
26 | 
27 | NAN_METHOD (queryCut) { 
28 |     NanScope();
29 |     if (args.Length() == 2){
30 |         string inputStr = ValueToString(args[0]);
31 |         Local<Function> callback = args[1].As<Function>();
32 | 
33 |         NanCallback* nanCallback = new NanCallback(callback);
34 |         QueryCutWorker* worker = new QueryCutWorker(nanCallback, inputStr);
35 |         NanAsyncQueueWorker(worker);
36 |     }
37 |     else {
38 |         NanThrowTypeError("argc must equals to 2");
39 |     }
40 |     NanReturnUndefined();
41 | }
42 | 


--------------------------------------------------------------------------------
/src/CppJieba/Limonp/Thread.hpp:
--------------------------------------------------------------------------------
 1 | #ifndef LIMONP_THREAD_HPP
 2 | #define LIMONP_THREAD_HPP
 3 | 
 4 | #include "HandyMacro.hpp"
 5 | #include "NonCopyable.hpp"
 6 | 
 7 | namespace Limonp
 8 | {
 9 |     class IThread: NonCopyable
10 |     {
11 |         private:
12 |             pthread_t thread_;
13 |             bool isStarted;
14 |             bool isJoined;
15 |         public:
16 |             IThread(): isStarted(false), isJoined(false)
17 |             {
18 |             }
19 |             virtual ~IThread()
20 |             {
21 |                 if(isStarted && !isJoined)
22 |                 {
23 |                     LIMONP_CHECK(!pthread_detach(thread_));
24 |                 }
25 |             };
26 |         public:
27 |             virtual void run() = 0;
28 |             void start()
29 |             {
30 |                 assert(!isStarted);
31 |                 LIMONP_CHECK(!pthread_create(&thread_, NULL, worker_, this));
32 |                 isStarted = true;
33 |             }
34 |             void join()
35 |             {
36 |                 assert(!isJoined);
37 |                 LIMONP_CHECK(!pthread_join(thread_, NULL));
38 |                 isJoined = true;
39 |             }
40 |         private:
41 |             static void * worker_(void * data)
42 |             {
43 |                 IThread * ptr = (IThread* ) data;
44 |                 ptr->run();
45 |                 return NULL;
46 |             }
47 |     };
48 | }
49 | 
50 | #endif
51 | 


--------------------------------------------------------------------------------
/src/CppJieba/Limonp/MutexLock.hpp:
--------------------------------------------------------------------------------
 1 | #ifndef LIMONP_MUTEX_LOCK_HPP
 2 | #define LIMONP_MUTEX_LOCK_HPP
 3 | 
 4 | #include <pthread.h>
 5 | #include "NonCopyable.hpp"
 6 | #include "HandyMacro.hpp"
 7 | 
 8 | namespace Limonp
 9 | {
    /**
     * Wrapper around a pthread mutex. lock()/unlock() are private and
     * reachable only through the friend class MutexLockGuard. Every pthread
     * call is wrapped in LIMONP_CHECK, so a non-zero return code aborts the
     * process.
     */
    class MutexLock: NonCopyable
    {
        private:
            pthread_mutex_t mutex_;
        public:
            // Exposes the underlying mutex (Condition::wait passes it to
            // pthread_cond_wait).
            pthread_mutex_t* getPthreadMutex()
            {
                return &mutex_;
            }
        public:
            // Initialises the mutex with default attributes.
            MutexLock()
            {
                LIMONP_CHECK(!pthread_mutex_init(&mutex_, NULL));
            }
            ~MutexLock()
            {
                LIMONP_CHECK(!pthread_mutex_destroy(&mutex_));
            }
        private:
            void lock()
            {
                LIMONP_CHECK(!pthread_mutex_lock(&mutex_));
            }
            void unlock()
            {
                LIMONP_CHECK(!pthread_mutex_unlock(&mutex_));
            }
            friend class MutexLockGuard;
    };
39 |     class MutexLockGuard: NonCopyable
40 |     {
41 |         public:
42 |             explicit MutexLockGuard(MutexLock & mutex)
43 |                 : mutex_(mutex)
44 |             {
45 |                 mutex_.lock();
46 |             }
47 |             ~MutexLockGuard()
48 |             {
49 |                 mutex_.unlock();
50 |             }
51 |         private:
52 |             MutexLock & mutex_;
53 |     };
54 | #define MutexLockGuard(x) assert(false);
55 | }
56 | 
57 | #endif
58 | 


--------------------------------------------------------------------------------
/src/CppJieba/TransCode.hpp:
--------------------------------------------------------------------------------
 1 | /************************************
 2 |  * file enc : utf-8
 3 |  * author   : wuyanyi09@gmail.com
 4 |  ************************************/
 5 | #ifndef CPPJIEBA_TRANSCODE_H
 6 | #define CPPJIEBA_TRANSCODE_H
 7 | 
 8 | 
 9 | #include "Limonp/StringUtil.hpp"
10 | #include "Limonp/LocalVector.hpp"
11 | 
12 | namespace CppJieba
13 | {
14 | 
15 |     using namespace Limonp;
16 |     typedef uint16_t UnicodeValueType;
17 |     typedef Limonp::LocalVector<UnicodeValueType> Unicode;
18 |     namespace TransCode
19 |     {
20 |         inline bool decode(const string& str, Unicode& res)
21 |         {
22 | #ifdef CPPJIEBA_GBK
23 |             return gbkTrans(str, res);
24 | #else
25 |             return utf8ToUnicode(str, res);
26 | #endif
27 |         }
28 | 
29 |         inline bool encode(Unicode::const_iterator begin, Unicode::const_iterator end, string& res)
30 |         {
31 | #ifdef CPPJIEBA_GBK
32 |             return gbkTrans(begin, end, res);
33 | #else
34 |             return unicodeToUtf8(begin, end, res);
35 | #endif
36 |         }
37 |         
38 |         inline bool encode(const Unicode& uni, string& res)
39 |         {
40 |             return encode(uni.begin(), uni.end(), res);
41 |         }
42 | 
43 |         // compiler is expected to optimized this function to avoid return value copy
44 |         inline string encode(Unicode::const_iterator begin, Unicode::const_iterator end) 
45 |         {
46 |             string res;
47 |             res.reserve(end - begin);
48 |             encode(begin, end, res);
49 |             return res;
50 |         }
51 | 
52 |         // compiler is expected to optimized this function to avoid return value copy
53 |         inline Unicode decode(const string& str)
54 |         {
55 |             Unicode unicode;
56 |             unicode.reserve(str.size());
57 |             decode(str, unicode);
58 |             return unicode;
59 |         }
60 |     }
61 | }
62 | 
63 | #endif
64 | 


--------------------------------------------------------------------------------
/src/CppJieba/Limonp/BoundedQueue.hpp:
--------------------------------------------------------------------------------
 1 | #ifndef LIMONP_BOUNDED_QUEUE_HPP
 2 | #define LIMONP_BOUNDED_QUEUE_HPP
 3 | 
 4 | #include <vector>
 5 | #include <fstream>
 6 | #include <cassert>
 7 | 
 8 | namespace Limonp
 9 | {
10 |     using namespace std;
11 |     template<class T> // fixed-capacity FIFO queue kept in a circular buffer; T must be default-constructible and assignable
12 |         class BoundedQueue
13 |         {
14 |             private:
15 |                 size_t head_; // index of the element the next pop() returns
16 |                 size_t tail_; // index of the slot the next push() writes
17 |                 size_t size_; // number of elements currently stored
18 |                 const size_t capacity_; // maximum number of elements, fixed at construction
19 |                 vector<T> circular__buffer; // backing storage, created with capacity_ slots and never resized here
20 |             public:
21 |                 explicit BoundedQueue(size_t capacity): capacity_(capacity), circular__buffer(capacity)
22 |                 {
23 |                     head_ = 0;
24 |                     tail_ = 0;
25 |                     size_ = 0;
26 |                     assert(capacity_); // a zero capacity would make the modulo in push()/pop() divide by zero
27 |                 }
28 |                 ~BoundedQueue(){}
29 |             public:
30 |                 void clear() // resets the indices and the count only; stored elements are left in the buffer
31 |                 {
32 |                     head_ = 0;
33 |                     tail_ = 0;
34 |                     size_ = 0;
35 |                 }
36 |                 bool empty() const // true when no element is stored
37 |                 {
38 |                     return !size_;
39 |                 }
40 |                 bool full() const // true when size() has reached capacity()
41 |                 {
42 |                     return capacity_ == size_;
43 |                 }
44 |                 size_t size() const
45 |                 {
46 |                     return size_;
47 |                 }
48 |                 size_t capacity() const
49 |                 {
50 |                     return capacity_;
51 |                 }
52 |                 // Appends a copy of t at the tail. Precondition (checked only by assert): the queue is not full.
53 |                 void push(const T& t)
54 |                 {
55 |                     assert(!full());
56 |                     circular__buffer[tail_] = t;
57 |                     tail_ = (tail_ + 1) % capacity_; // wrap around to slot 0 after the last slot
58 |                     size_ ++;
59 |                 }
60 |                 // Removes the oldest element and returns a copy of it. Precondition (checked only by assert): the queue is not empty.
61 |                 T pop()
62 |                 {
63 |                     assert(!empty());
64 |                     size_t oldPos = head_; // remember the slot before head_ moves on
65 |                     head_ = (head_ + 1) % capacity_;
66 |                     size_ --;
67 |                     return circular__buffer[oldPos]; // the slot keeps its value until a later push() overwrites it
68 |                 }
69 | 
70 |         };
71 | }
72 | 
73 | #endif
74 | 


--------------------------------------------------------------------------------
/src/CppJieba/Limonp/ArgvContext.hpp:
--------------------------------------------------------------------------------
 1 | /************************************
 2 |  * file enc : ascii
 3 |  * author   : wuyanyi09@gmail.com
 4 |  ************************************/
 5 | 
 6 | #ifndef LIMONP_ARGV_FUNCTS_H
 7 | #define LIMONP_ARGV_FUNCTS_H
 8 | 
 9 | #include <set>
10 | #include <sstream>
11 | #include "StringUtil.hpp"
12 | 
13 | namespace Limonp
14 | {
15 |     using namespace std;
16 |     class ArgvContext // splits argc/argv into positional arguments, "-key value" pairs and bare "-flag" switches
17 |     {
18 |         public :
19 |             ArgvContext(int argc, const char* const * argv)
20 |             {
21 |                 // The scan starts at index 0, so argv[0] is classified like every other token.
22 |                 for(int i = 0; i < argc; i++)
23 |                 {
24 |                     if(startsWith(argv[i], "-"))
25 |                     {
26 |                         if(i + 1 < argc && !startsWith(argv[i + 1], "-"))
27 |                         {
28 |                             mpss_[argv[i]] = argv[i+1]; // "-key value": the key keeps its leading dash; a repeated key overwrites the earlier value
29 |                             i++; // skip the value token that was just consumed
30 |                         }
31 |                         else
32 |                         {
33 |                             sset_.insert(argv[i]); // dash token with no value after it: recorded as a switch
34 |                         }
35 |                     }
36 |                     else
37 |                     {
38 |                         args_.push_back(argv[i]); // everything else is a positional argument
39 |                     }
40 |                 }
41 |             }
42 |             ~ArgvContext(){};
43 |         public:
44 |             friend ostream& operator << (ostream& os, const ArgvContext& args); 
45 |             string operator [](size_t i) const // i-th positional argument, or "" when i is out of range
46 |             {
47 |                 if(i < args_.size())
48 |                 {
49 |                     return args_[i];
50 |                 }
51 |                 return "";
52 |             }
53 |             string operator [](const string& key) const // value stored for key (spelled with its dash), or "" when the key has no value
54 |             {
55 |                 map<string, string>::const_iterator it = mpss_.find(key);
56 |                 if(it != mpss_.end())
57 |                 {
58 |                     return it->second;
59 |                 }
60 |                 return "";
61 |             }
62 |         public:
63 |             bool hasKey(const string& key) const // true when key was seen either with a value or as a bare switch
64 |             {
65 |                 if(mpss_.find(key) != mpss_.end() || sset_.find(key) != sset_.end())
66 |                 {
67 |                     return true;
68 |                 }
69 |                 return false;
70 |             }
71 |         private:
72 |             vector<string> args_; // positional arguments in command-line order
73 |             map<string, string> mpss_; // "-key" -> value
74 |             set<string> sset_; // switches that had no value
75 | 
76 |     };
77 | 
78 |     inline ostream& operator << (ostream& os, const ArgvContext& args) // writes the positional arguments, the key/value pairs and the switches, in that order
79 |     {
80 |         return os<<args.args_<<args.mpss_<<args.sset_; // relies on stream operators for vector/map/set that this header does not define itself
81 |     }
82 | }
83 | 
84 | #endif
85 | 


--------------------------------------------------------------------------------
/src/CppJieba/SegmentBase.hpp:
--------------------------------------------------------------------------------
 1 | #ifndef CPPJIEBA_SEGMENTBASE_H
 2 | #define CPPJIEBA_SEGMENTBASE_H
 3 | 
 4 | #include "TransCode.hpp"
 5 | #include "Limonp/Logger.hpp"
 6 | #include "Limonp/NonCopyable.hpp"
 7 | #include "Limonp/HandyMacro.hpp"
 8 | #include "ISegment.hpp"
 9 | #include <cassert>
10 | 
11 | 
12 | namespace CppJieba
13 | {
14 |     using namespace Limonp;
15 | 
16 |     //const char* const SPECIAL_CHARS = " \t\n";
17 | #ifndef CPPJIEBA_GBK
18 |     const UnicodeValueType SPECIAL_SYMBOL[] = {32u, 9u, 10u, 12290u, 65292u};  // space, tab, newline, U+3002 (ideographic full stop), U+FF0C (fullwidth comma)
19 | #else
20 |     const UnicodeValueType SPECIAL_SYMBOL[] = {32u, 9u, 10u};  // space, tab, newline only when CPPJIEBA_GBK is defined
21 | #endif
22 | 
23 |     class SegmentBase: public ISegment, public NonCopyable // common front end for segmenters: splits the input at SPECIAL_SYMBOL values before delegating
24 |     {
25 |         public:
26 |             SegmentBase(){_loadSpecialSymbols();};
27 |             virtual ~SegmentBase(){};
28 |         private:
29 |             unordered_set<UnicodeValueType> _specialSymbols; // lookup copy of SPECIAL_SYMBOL, filled by the constructor
30 |         private:
31 |             void _loadSpecialSymbols()
32 |             {
33 |                 size_t size = sizeof(SPECIAL_SYMBOL)/sizeof(*SPECIAL_SYMBOL); // element count of the array
34 |                 for(size_t i = 0; i < size; i ++)
35 |                 {
36 |                     _specialSymbols.insert(SPECIAL_SYMBOL[i]);
37 |                 }
38 |                 assert(_specialSymbols.size());
39 |             }
40 |             // Subclasses implement the iterator-range overload; the string overload below drives it.
41 |         public:
42 |             virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res) const = 0;
43 |             virtual bool cut(const string& str, vector<string>& res) const
44 |             {
45 |                 res.clear();
46 |                 // NOTE(review): the result of TransCode::decode() is not checked below - confirm how a decode failure should surface.
47 |                 Unicode unicode;
48 |                 unicode.reserve(str.size());
49 | 
50 |                 TransCode::decode(str, unicode);
51 |                 
52 |                 Unicode::const_iterator left = unicode.begin();
53 |                 Unicode::const_iterator right;
54 |                 // left marks the start of the pending run; right scans forward looking for special symbols.
55 |                 for(right = unicode.begin(); right != unicode.end(); right++)
56 |                 {
57 |                     if(isIn(_specialSymbols, *right))
58 |                     {
59 |                         if(left != right) // segment the non-empty run that precedes the symbol
60 |                         {
61 |                             cut(left, right, res);
62 |                         }
63 |                         res.resize(res.size() + 1); // append an empty slot ...
64 |                         TransCode::encode(right, right + 1, res.back()); // ... and encode the special symbol into it as its own token
65 |                         left = right + 1;
66 |                     }
67 |                 }
68 |                 if(left != right) // trailing run after the last special symbol
69 |                 {
70 |                     cut(left, right, res);
71 |                 }
72 |                 // Always reports success; the results of the nested cut() calls are ignored.
73 |                 return true;
74 |             }
75 |     };
76 | }
77 | 
78 | #endif
79 | 


--------------------------------------------------------------------------------
/src/CppJieba/Limonp/Logger.hpp:
--------------------------------------------------------------------------------
 1 | /************************************
 2 |  * file enc : utf8
 3 |  * author   : wuyanyi09@gmail.com
 4 |  ************************************/
 5 | #ifndef LIMONP_LOGGER_H
 6 | #define LIMONP_LOGGER_H
 7 | 
 8 | #include <vector>
 9 | #include <iostream>
10 | #include <fstream>
11 | #include <string>
12 | #include <cstring>
13 | #include <stdio.h>
14 | #include <cstdlib>
15 | #include <stdarg.h>
16 | #include <time.h>
17 | #include <cassert>
18 | 
19 | #define FILE_BASENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)
20 | // FILE_BASENAME is __FILE__ without its directory part (parenthesized so it expands safely inside any expression); the Log* macros pass printf-style arguments to Logger::LoggingF, and ", ## __VA_ARGS__" is the GNU comma-swallowing extension.
21 | #define LogDebug(fmt, ...) Limonp::Logger::LoggingF(Limonp::LL_DEBUG, FILE_BASENAME, __LINE__, fmt, ## __VA_ARGS__)
22 | #define LogInfo(fmt, ...) Limonp::Logger::LoggingF(Limonp::LL_INFO, FILE_BASENAME, __LINE__, fmt, ## __VA_ARGS__)
23 | #define LogWarn(fmt, ...) Limonp::Logger::LoggingF(Limonp::LL_WARN, FILE_BASENAME, __LINE__, fmt, ## __VA_ARGS__)
24 | #define LogError(fmt, ...) Limonp::Logger::LoggingF(Limonp::LL_ERROR, FILE_BASENAME, __LINE__, fmt, ## __VA_ARGS__)
25 | #define LogFatal(fmt, ...) Limonp::Logger::LoggingF(Limonp::LL_FATAL, FILE_BASENAME, __LINE__, fmt, ## __VA_ARGS__)
26 | 
27 | namespace Limonp
28 | {
29 |     using namespace std;
30 |     enum {LL_DEBUG = 0, LL_INFO = 1, LL_WARN = 2, LL_ERROR = 3, LL_FATAL = 4, LEVEL_ARRAY_SIZE = 5, CSTR_BUFFER_SIZE = 32}; // five log levels, then the size of the name table and of the timestamp buffer
31 |     static const char * LOG_LEVEL_ARRAY[LEVEL_ARRAY_SIZE]= {"DEBUG","INFO","WARN","ERROR","FATAL"}; // printable names, indexed by the LL_* value
32 |     static const char * LOG_FORMAT = "%s %s:%d %s %s\n"; // timestamp, file name, line number, level name, message
33 |     static const char * LOG_TIME_FORMAT = "%Y-%m-%d %H:%M:%S"; // strftime pattern used for the timestamp
34 | 
35 |     class Logger // stateless helper that writes timestamped log lines to stderr
36 |     {
37 |         public:
38 |             static void Logging(size_t level, const string& msg, const char* fileName, int lineno) // prints one already-formatted message
39 |             {
40 |                 assert(level <= LL_FATAL); // level is used as an index into LOG_LEVEL_ARRAY below
41 |                 char buf[CSTR_BUFFER_SIZE];
42 |                 time_t timeNow;
43 |                 time(&timeNow);
44 |                 strftime(buf, sizeof(buf), LOG_TIME_FORMAT, localtime(&timeNow)); // NOTE(review): localtime() returns shared static storage - confirm logging is never done concurrently
45 |                 fprintf(stderr, LOG_FORMAT, buf, fileName, lineno,LOG_LEVEL_ARRAY[level], msg.c_str());
46 |             }
47 |             static void LoggingF(size_t level, const char* fileName, int lineno, const char* const fmt, ...) // printf-style front end: formats the message, then calls Logging()
48 |             {
49 | #ifdef LOGGER_LEVEL
50 |                 if(level < LOGGER_LEVEL) return; // compile-time threshold: messages below it are dropped
51 | #endif
52 |                 int size = 256; // first guess for the formatted length; grown below when it is too small
53 |                 string msg;
54 |                 va_list ap;
55 |                 while (1) {
56 |                     msg.resize(size);
57 |                     va_start(ap, fmt); // restarted on every pass because vsnprintf consumes the list
58 |                     int n = vsnprintf(&msg[0], size, fmt, ap); // write through the string's own mutable buffer; c_str() is a read-only view
59 |                     va_end(ap);
60 |                     if (n > -1 && n < size) {
61 |                         msg.resize(n); // everything fitted: trim to the real length
62 |                         break;
63 |                     }
64 |                     if (n > -1)
65 |                       size = n + 1; // the needed length was reported: retry with exactly that much room
66 |                     else
67 |                       size *= 2; // no length reported (negative return): retry with a bigger buffer
68 |                 }
69 |                 Logging(level, msg, fileName, lineno);
70 |             }
71 |     };
72 | }
73 | 
74 | #endif
75 | 


--------------------------------------------------------------------------------
/src/CppJieba/Limonp/CastFloat.hpp:
--------------------------------------------------------------------------------
 1 | #ifndef LIMONP_CAST_FUNCTS_H
 2 | #define LIMONP_CAST_FUNCTS_H
 3 | 
 4 | namespace Limonp
 5 | {
 6 |     namespace CastFloat
 7 |     {
 8 |         // bit masks combined with bitwise AND / OR by the conversions below
 9 |         static const int sign_32 = 0xC0000000; // top two bits of the 32-bit pattern; NOTE(review): the literal exceeds INT_MAX for a 32-bit int - confirm the converted value on the target compilers
10 |         static const int exponent_32 = 0x07800000; // exponent bits kept by floatToShortBits
11 |         static const int mantissa_32 = 0x007FE000; // mantissa bits kept by floatToShortBits
12 |         static const int sign_exponent_32 = 0x40000000; // second bit from the top, tested in the range checks of floatToShortBits
13 |         static const int loss_32 = 0x38000000; // bits dropped by the encoding, tested in the range checks of floatToShortBits
14 | 
15 |         static const short sign_16 = (short)0xC000; // top two bits of the 16-bit code
16 |         static const short exponent_16 = (short)0x3C00; // exponent bits of the 16-bit code
17 |         static const short mantissa_16 = (short)0x03FF; // mantissa bits of the 16-bit code
18 |         static const short sign_exponent_16 = (short)0x4000; // second bit from the top of the 16-bit code
19 |         static const int exponent_fill_32 = 0x38000000; // bits OR-ed back into the exponent by shortBitsToFloat
20 | 
21 |         // codes returned by floatToShortBits when the value is outside the encodable range
22 |         static const short infinite_16 = (short) 0x7FFF;
23 |         static const short infinitesmall_16 = (short) 0x0000;
24 | 
25 |         inline float intBitsToFloat(unsigned int x) // returns the float whose storage holds the bits of x (assumes int and float have the same size - TODO confirm)
26 |         {
27 |             union
28 |             {
29 |                 float f;
30 |                 int i;
31 |             }u;
32 |             u.i = x; // store through the int member ...
33 |             return u.f; // ... and read the same storage back as a float
34 |         }
35 | 
36 |         inline int floatToIntBits(float f) // returns the storage bits of f as an int (assumes int and float have the same size - TODO confirm)
37 |         {
38 |             union
39 |             {
40 |                 float f;
41 |                 int i ;
42 |             }u;
43 |             u.f = f; // store through the float member ...
44 |             return u.i; // ... and read the same storage back as an int
45 |         }
46 | 
47 |         inline short floatToShortBits(float f) // packs f into a 16-bit code: 2 top bits, 4 exponent bits and 10 mantissa bits of the 32-bit pattern
48 |         {
49 |             int fi = floatToIntBits(f);
50 | 
51 |             // extract the key fields
52 |             short sign = (short) ((unsigned int)(fi & sign_32) >> 16);
53 |             short exponent = (short) ((unsigned int)(fi & exponent_32) >> 13);
54 |             short mantissa = (short) ((unsigned int)(fi & mantissa_32) >> 13);
55 |             // build the encoded result
56 |             short code = (short) (sign | exponent | mantissa);
57 |             // handle values that are too large or too small to encode
58 |             if ((fi & loss_32) > 0 && (fi & sign_exponent_32) > 0) {
59 |                 // exponent sign bit is 1 (positive power) and the left bits 2, 3, 4 are set (the test is true when any loss_32 bit is set): return the "infinitely large" code
60 |                 return (short) (code | infinite_16);
61 |             }
62 |             if (((fi & loss_32) ^ loss_32) > 0 && (fi & sign_exponent_32) == 0) {
63 |                 // exponent sign bit is 0 (negative power) and the left bits 2, 3, 4 are 0 (XOR with 111 > 0): return the "infinitely small" code
64 |                 return infinitesmall_16;
65 |             }
66 | 
67 |             return code;
68 |         }
69 | 
70 |         inline float shortBitsToFloat(short s) // expands a 16-bit code produced by floatToShortBits back into a float
71 |         {
72 |             /*
73 |              * The exponent has 3 spare bits: fill them with 0 if the sign bit is 1, with 1 if the sign bit is 0. The mantissa is padded with 13 trailing zero bits.
74 |              */
75 |             int sign = ((int) (s & sign_16)) << 16;
76 |             int exponent = ((int) (s & exponent_16)) << 13;
77 |             // exponent sign bit is 0: fill bits 2, 3, 4 with 1 (skipped when s is exactly 0)
78 |             if ((s & sign_exponent_16) == 0 && s != 0) {
79 |                 exponent |= exponent_fill_32;
80 |             }
81 |             int mantissa = ((int) (s & mantissa_16)) << 13;
82 |             // build the decoded result
83 |             int code = sign | exponent | mantissa;
84 |             return intBitsToFloat(code);
85 | 
86 |         }
87 |     }
88 | }
89 | 
90 | #endif
91 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | [![Build Status](https://travis-ci.org/aszxqw/nodejieba.png?branch=master)](https://travis-ci.org/aszxqw/nodejieba)
  2 | [![Dependency Status](https://david-dm.org/aszxqw/nodejieba.png?theme=shields.io)](https://david-dm.org/aszxqw/nodejieba)
  3 | [![devDependency Status](https://david-dm.org/aszxqw/nodejieba/dev-status.png?theme=shields.io)](https://david-dm.org/aszxqw/nodejieba#info=devDependencies)
  4 | [![NpmDownload Status](http://img.shields.io/npm/dm/nodejieba.svg)](https://www.npmjs.org/package/nodejieba)
  5 | - - -
  6 | 
  7 | # NodeJieba "结巴"分词的Node.js版本
  8 | 
  9 | ## Introduction
 10 | 
 11 | `NodeJieba`只是[CppJieba]简单包装而成的`node`扩展，用来进行中文分词。
 12 | 
 13 | 详见[NodeJiebaBlog]
 14 | 
 15 | ## Install
 16 | 
 17 | ```sh
 18 | npm install nodejieba
 19 | ```
 20 | 
 21 | 因为`npm`速度很慢而且经常因为墙的原因出现莫名其妙的问题，在此强烈建议使用[cnpm]，命令如下：
 22 | 
 23 | ```sh
 24 | npm --registry=http://r.cnpmjs.org install nodejieba
 25 | ```
 26 | 
 27 | ## Usage
 28 | 
 29 | ### 默认分词算法
 30 | 
 31 | #### 初始化
 32 | 
 33 | ```js
 34 | var segment = require("nodejieba");
 35 | segment.loadDict("./node_modules/nodejieba/dict/jieba.dict.utf8", "./node_modules/nodejieba/dict/hmm_model.utf8");
 36 | ```
 37 | 
 38 | #### 阻塞式调用
 39 | 
 40 | ```js
 41 | var wordList = segment.cutSync("阻塞模式分词");
 42 | if (wordList.constructor == Array) // just for the tutorial; this is always true
 43 | {
 44 |     wordList.forEach(function(word) {
 45 |         console.log(word);     
 46 |     });
 47 | }
 48 | ```
 49 | 
 50 | #### 非阻塞式调用
 51 | 
 52 | ```js
 53 | segment.cut("非阻塞模式分词", function(wordList) {
 54 |     wordList.forEach(function(word) {
 55 |         console.log(word);     
 56 |     });
 57 | });
 58 | ```
 59 | 
 60 | ### 搜索引擎分词算法
 61 | 
 62 | #### 初始化
 63 | 
 64 | ```js
 65 | var segment = require("nodejieba");
 66 | segment.queryLoadDict("./node_modules/nodejieba/dict/jieba.dict.utf8", "./node_modules/nodejieba/dict/hmm_model.utf8");
 67 | ```
 68 | 
 69 | #### 阻塞式调用
 70 | 
 71 | ```js
 72 | var wordList = segment.queryCutSync("阻塞模式分词");
 73 | if (wordList.constructor == Array) // just for the tutorial; this is always true
 74 | {
 75 |     wordList.forEach(function(word) {
 76 |         console.log(word);     
 77 |     });
 78 | }
 79 | ```
 80 | 
 81 | #### 非阻塞式调用
 82 | 
 83 | ```js
 84 | segment.queryCut("非阻塞模式分词", function(wordList) {
 85 |     wordList.forEach(function(word) {
 86 |         console.log(word);     
 87 |     });
 88 | });
 89 | ```
 90 | 
 91 | 具体用法可以参考 `test/segment.js test/query_segment.js`
 92 | 
 93 | ### 词性标注
 94 | 
 95 | 具体用法可以参考 `test/pos_tagger.js`
 96 | 
 97 | ## Testing
 98 | 
 99 | 在`node v0.10.2`下测试通过
100 | 
101 | ## Demo
102 | 
103 | http://cppjieba-webdemo.herokuapp.com/
104 | (Chrome is recommended)
105 | 
106 | ## Thanks
107 | 
108 | [Jieba中文分词]
109 | 
110 | ## Author
111 | 
112 | - aszxqw   https://github.com/aszxqw   wuyanyi09@gmail.com
113 | - myl2821  https://github.com/myl2821  myl2821@gmail.com
114 | 
115 | [NodeJiebaBlog]:http://www.aszxqw.com/work/2014/02/22/nodejs-cpp-addon-nodejieba.html
116 | [CppJieba]:https://github.com/aszxqw/cppjieba.git
117 | [cnpm]:http://cnpmjs.org
118 | [Jieba中文分词]:https://github.com/fxsjy/jieba
119 | 


--------------------------------------------------------------------------------
/src/CppJieba/Limonp/ThreadPool.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef LIMONP_THREAD_POOL_HPP
  2 | #define LIMONP_THREAD_POOL_HPP
  3 | 
  4 | #include "Thread.hpp"
  5 | #include "BlockingQueue.hpp"
  6 | 
  7 | namespace Limonp
  8 | {
  9 |     class ITask // unit of work executed by a ThreadPool worker
 10 |     {
 11 |         public:
 12 |             virtual void run() = 0; // the work itself; a ThreadPool worker calls it and then deletes the task
 13 |             virtual ~ITask() {} // virtual so that tasks can be deleted through an ITask pointer
 14 |     };
 15 | 
 16 |     template <class TaskType, class ArgType> // allocates a TaskType with new from one constructor argument and returns it as ITask*
 17 |         ITask* CreateTask(ArgType arg) 
 18 |         {
 19 |             return new TaskType(arg); // TaskType must be convertible to ITask*; ThreadPool workers delete tasks after running them
 20 |         }
 21 |     template <class TaskType, class ArgType0, class ArgType1> // same as above for a two-argument constructor
 22 |         ITask* CreateTask(ArgType0 arg0, ArgType1 arg1) 
 23 |         {
 24 |             return new TaskType(arg0, arg1);
 25 |         }
 26 | 
 27 |     //class ThreadPool;
 28 |     class ThreadPool: NonCopyable // fixed set of worker threads that run ITask objects taken from a bounded queue
 29 |     {
 30 |         private:
 31 |             class Worker: public IThread // one pool thread; keeps popping tasks until it receives NULL
 32 |             {
 33 |                 private:
 34 |                     ThreadPool * ptThreadPool_; // owning pool, never NULL (asserted in the constructor)
 35 |                 public:
 36 |                     Worker(ThreadPool* pool): ptThreadPool_(pool)
 37 |                     {
 38 |                         assert(ptThreadPool_);
 39 |                     }
 40 |                     virtual ~Worker()
 41 |                     {
 42 |                     }
 43 |                 public:
 44 |                     virtual void run()
 45 |                     {
 46 |                         while(true)
 47 |                         {
 48 |                             ITask * task = ptThreadPool_->queue_.pop(); // presumably blocks while the queue is empty - verify in BlockingQueue.hpp
 49 |                             if(task == NULL) // NULL is the stop marker pushed by ~ThreadPool
 50 |                             {
 51 |                                 break;
 52 |                             }
 53 |                             task->run();
 54 |                             delete task; // the pool owns every task it was given
 55 |                         }
 56 |                     }
 57 |             };
 58 |         private:
 59 |             friend class Worker;
 60 |         private:
 61 |             vector<IThread*> threads_; // workers, created in the constructor and deleted in the destructor
 62 |             BoundedBlockingQueue<ITask*> queue_; // pending tasks; NULL entries tell workers to stop
 63 |             //mutable MutexLock mutex_;
 64 |             //Condition isEmpty__;
 65 |         public:
 66 |             ThreadPool(size_t threadNum, size_t queueMaxSize): threads_(threadNum), queue_(queueMaxSize)//, mutex_(), isEmpty__(mutex_)
 67 |             {
 68 |                 assert(threadNum);
 69 |                 assert(queueMaxSize);
 70 |                 for(size_t i = 0; i < threads_.size(); i ++)
 71 |                 {
 72 |                     threads_[i] = new Worker(this); // created here but not started; see start()
 73 |                 }
 74 |             }
 75 |             ~ThreadPool() // NOTE(review): if start() was never called nothing drains queue_ - confirm push() and join() behave in that case
 76 |             {
 77 |                 for(size_t i = 0; i < threads_.size(); i ++)
 78 |                 {
 79 |                     queue_.push(NULL); // one stop marker per worker, queued behind any pending tasks
 80 |                 }
 81 |                 for(size_t i = 0; i < threads_.size(); i ++)
 82 |                 {
 83 |                     threads_[i]->join();
 84 |                     delete threads_[i];
 85 |                 }
 86 |             }
 87 |             
 88 |         public:
 89 |             void start() // starts every worker thread
 90 |             {
 91 |                 for(size_t i = 0; i < threads_.size(); i++)
 92 |                 {
 93 |                     threads_[i]->start();
 94 |                 }
 95 |             }
 96 | 
 97 |             void add(ITask* task) // queues a task; a worker runs it and then deletes it
 98 |             {
 99 |                 assert(task); // NULL is reserved as the stop marker
100 |                 queue_.push(task);
101 |             }
102 |     };
103 | }
104 | 
105 | #endif
106 | 


--------------------------------------------------------------------------------
/src/CppJieba/PosTagger.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef CPPJIEBA_POS_TAGGING_H
  2 | #define CPPJIEBA_POS_TAGGING_H
  3 | 
  4 | #include "MixSegment.hpp"
  5 | #include "Limonp/StringUtil.hpp"
  6 | #include "DictTrie.hpp"
  7 | 
  8 | namespace CppJieba
  9 | {
 10 |     using namespace Limonp;
 11 | 
 12 |     static const char* const POS_M = "m"; // fallback tag when every ASCII character counted by _specialRule is a digit
 13 |     static const char* const POS_ENG = "eng"; // fallback tag when the counted ASCII characters include a non-digit
 14 |     static const char* const POS_X = "x"; // fallback tag when no ASCII character was counted
 15 | 
 16 |     class PosTagger // cuts text with MixSegment and attaches a part-of-speech tag to every resulting word
 17 |     {
 18 |         private:
 19 |             MixSegment _segment;
 20 |             const DictTrie * _dictTrie; // taken from _segment by init(); NULL until then
 21 | 
 22 |         public:
 23 |             PosTagger(): _dictTrie(NULL) // not usable until init() has been called
 24 |             {}
 25 |             PosTagger(
 26 |                 const string& dictPath, 
 27 |                 const string& hmmFilePath,
 28 |                 const string& userDictPath = ""
 29 |             ): _dictTrie(NULL)
 30 |             {
 31 |                 init(dictPath, hmmFilePath, userDictPath);
 32 |             };
 33 |             ~PosTagger(){};
 34 |         public:
 35 |             void init(
 36 |                 const string& dictPath, 
 37 |                 const string& hmmFilePath,
 38 |                 const string& userDictPath = ""
 39 |             )
 40 |             {
 41 |                 LIMONP_CHECK(_segment.init(dictPath, hmmFilePath, userDictPath));
 42 |                 _dictTrie = _segment.getDictTrie(); // the trie stays owned by _segment
 43 |                 LIMONP_CHECK(_dictTrie);
 44 |             };
 45 |             
 46 |             // Cuts src and appends one (word, tag) pair per word to res (res is not cleared first); returns false on failure or when res ends up empty.
 47 |             bool tag(const string& src, vector<pair<string, string> >& res) const
 48 |             {
 49 |                 vector<string> cutRes;
 50 |                 if (!_segment.cut(src, cutRes))
 51 |                 {
 52 |                     LogError("_mixSegment cut failed");
 53 |                     return false;
 54 |                 }
 55 | 
 56 |                 const DictUnit *tmp = NULL;
 57 |                 Unicode unico;
 58 |                 for (vector<string>::iterator itr = cutRes.begin(); itr != cutRes.end(); ++itr)
 59 |                 {
 60 |                     if (!TransCode::decode(*itr, unico))
 61 |                     {
 62 |                         LogError("decode failed.");
 63 |                         return false; // pairs appended so far stay in res
 64 |                     }
 65 |                     tmp = _dictTrie->find(unico.begin(), unico.end());
 66 |                     if(tmp == NULL || tmp->tag.empty()) // word unknown to the dictionary, or known without a tag
 67 |                     {
 68 |                         res.push_back(make_pair(*itr, _specialRule(unico)));
 69 |                     }
 70 |                     else
 71 |                     {
 72 |                         res.push_back(make_pair(*itr, tmp->tag));
 73 |                     }
 74 |                 }
 75 |                 return !res.empty();
 76 |             }
 77 |         private:
 78 |             const char* _specialRule(const Unicode& unicode) const // fallback tag chosen from the ASCII content of the word
 79 |             {
 80 |                 size_t m = 0; // ASCII digits counted
 81 |                 size_t eng = 0; // ASCII characters counted
 82 |                 for(size_t i = 0; i < unicode.size() && eng < unicode.size() / 2; i++) // NOTE(review): counting stops at size()/2 ASCII characters, so a one-element word is never inspected - confirm this is intended
 83 |                 {
 84 |                     if(unicode[i] < 0x80)
 85 |                     {
 86 |                         eng ++;
 87 |                         if('0' <= unicode[i] && unicode[i] <= '9')
 88 |                         {
 89 |                             m++;
 90 |                         }
 91 |                     }
 92 |                 }
 93 |                 // ascii char is not found
 94 |                 if(eng == 0)
 95 |                 {
 96 |                     return POS_X;
 97 |                 }
 98 |                 // all the ascii is number char
 99 |                 if(m == eng)
100 |                 {
101 |                     return POS_M;
102 |                 }
103 |                 // the ascii chars contain english letter
104 |                 return POS_ENG;
105 |             }
106 |     };
107 | }
108 | 
109 | #endif
110 | 


--------------------------------------------------------------------------------
/src/CppJieba/Limonp/Config.hpp:
--------------------------------------------------------------------------------
  1 | /************************************
  2 |  * file enc : utf8
  3 |  * author   : wuyanyi09@gmail.com
  4 |  ************************************/
  5 | #ifndef LIMONP_CONFIG_H
  6 | #define LIMONP_CONFIG_H
  7 | 
  8 | 
  9 | #include <map>
 10 | #include <fstream>
 11 | #include <iostream>
 12 | #include <assert.h>
 13 | #include "StringUtil.hpp"
 14 | 
 15 | namespace Limonp
 16 | {
 17 |     using namespace std;
 18 |     class Config // read-only key/value settings loaded once from a "key = value" text file
 19 |     {
 20 |         public:
 21 |             explicit Config(const string& filePath)
 22 |             {
 23 |                 loadFile_(filePath);
 24 |             }
 25 |         public:
 26 |             operator bool () // true when at least one entry was loaded
 27 |             {
 28 |                 return !map_.empty();
 29 |             }
 30 |         private:
 31 |             void loadFile_(const string& filePath)
 32 |             {
 33 |                 ifstream ifs(filePath.c_str());
 34 |                 assert(ifs); // a file that cannot be opened is caught only by this assert
 35 |                 string line;
 36 |                 vector<string> vecBuf;
 37 |                 size_t lineno = 0; // incremented per line but not otherwise used in this function
 38 |                 while(getline(ifs, line))
 39 |                 {
 40 |                     lineno ++;
 41 |                     trim(line);
 42 |                     if(line.empty() || startsWith(line, "#")) // skip blank lines and '#' comment lines
 43 |                     {
 44 |                         continue;
 45 |                     }
 46 |                     vecBuf.clear();
 47 |                     if(!split(line, vecBuf, "=") || 2 != vecBuf.size()) // exactly one key and one value are required
 48 |                     {
 49 |                         fprintf(stderr, "line[%s] illegal.\n", line.c_str());
 50 |                         assert(false);
 51 |                         continue; // reached only when asserts are compiled out: the bad line is skipped
 52 |                     }
 53 |                     string& key = vecBuf[0];
 54 |                     string& value = vecBuf[1];
 55 |                     trim(key);
 56 |                     trim(value);
 57 |                     if(!map_.insert(make_pair(key, value)).second) // insert() keeps the existing entry when the key repeats
 58 |                     {
 59 |                         fprintf(stderr, "key[%s] already exits.\n", key.c_str());
 60 |                         assert(false);
 61 |                         continue;
 62 |                     }
 63 |                 }
 64 |                 ifs.close();
 65 |             }
 66 |         public:
 67 |             bool get(const string& key, string& value) const // copies the stored text into value; returns false (value untouched) when key is missing
 68 |             {
 69 |                 map<string, string>::const_iterator it = map_.find(key);
 70 |                 if(map_.end() != it)
 71 |                 {
 72 |                     value = it->second;
 73 |                     return true;
 74 |                 }
 75 |                 return false;
 76 |             }
 77 |             bool get(const string& key, int & value) const // integer variant; returns false only when key is missing
 78 |             {
 79 |                 string str;
 80 |                 if(!get(key, str)) {
 81 |                     return false;
 82 |                 }
 83 |                 value = atoi(str.c_str()); // atoi reports no errors, so malformed text is not detected here
 84 |                 return true;
 85 |             }
 86 |             const char* operator [] (const char* key) const // C-string lookup; returns NULL for a NULL or unknown key
 87 |             {
 88 |                 if(NULL == key)
 89 |                 {
 90 |                     return NULL;
 91 |                 }
 92 |                 map<string, string>::const_iterator it = map_.find(key);
 93 |                 if(map_.end() != it)
 94 |                 {
 95 |                     return it->second.c_str(); // points into the stored string owned by map_
 96 |                 }
 97 |                 return NULL;
 98 |             }
 99 |         public:
100 |             string getConfigInfo() const // returns the textual dump of all entries
101 |             {
102 |                 string res;
103 |                 res << *this; // relies on an operator<< taking a string on the left, which this header does not define itself
104 |                 return res;
105 |             }
106 |         private:
107 |             map<string, string> map_; // trimmed key -> trimmed value
108 |         private:
109 |             friend ostream& operator << (ostream& os, const Config& config);
110 |     };
111 |     
112 |     inline ostream& operator << (ostream& os, const Config& config) // writes all key/value entries of config to os
113 |     {
114 |         return os << config.map_; // relies on a stream operator for map that this header does not define itself
115 |     }
116 | }
117 | 
118 | #endif
119 | 


--------------------------------------------------------------------------------
/src/CppJieba/QuerySegment.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef CPPJIEBA_QUERYSEGMENT_H
  2 | #define CPPJIEBA_QUERYSEGMENT_H
  3 | 
  4 | #include <algorithm>
  5 | #include <set>
  6 | #include <cassert>
  7 | #include "Limonp/Logger.hpp"
  8 | #include "DictTrie.hpp"
  9 | #include "ISegment.hpp"
 10 | #include "SegmentBase.hpp"
 11 | #include "FullSegment.hpp"
 12 | #include "MixSegment.hpp"
 13 | #include "TransCode.hpp"
 14 | #include "DictTrie.hpp"
 15 | 
 16 | namespace CppJieba
 17 | {
 18 |     class QuerySegment: public SegmentBase // search-oriented segmenter: MixSegment first, then FullSegment on words longer than _maxWordLen
 19 |     {
 20 |     private:
 21 |         MixSegment _mixSeg; // first-pass segmenter; also supplies the dictionary used by _fullSeg
 22 |         FullSegment _fullSeg; // second pass applied to long words
 23 |         size_t _maxWordLen; // words with more Unicode elements than this are cut again
 24 | 
 25 |     public:
 26 |         QuerySegment(){}; // leaves _maxWordLen unset; init() must be called before use
 27 |         QuerySegment(const string& dict, const string& model, size_t maxWordLen)
 28 |         {
 29 |             init(dict, model, maxWordLen);
 30 |         };
 31 |         virtual ~QuerySegment(){};
 32 |     public:
 33 |         bool init(const string& dict, const string& model, size_t maxWordLen) // returns true whenever it returns; the checks go through LIMONP_CHECK and assert
 34 |         {
 35 |             LIMONP_CHECK(_mixSeg.init(dict, model));
 36 |             LIMONP_CHECK(_fullSeg.init(_mixSeg.getDictTrie())); // both segmenters work on the same dictionary trie
 37 |             assert(maxWordLen);
 38 |             _maxWordLen = maxWordLen;
 39 |             return true;
 40 |         }
 41 | 
 42 |     public:
 43 |         using SegmentBase::cut; // keeps the inherited string overload visible next to the overloads declared below
 44 | 
 45 |     public:
 46 |         bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const // appends the words of [begin, end) to res; res is not cleared
 47 |         {
 48 |             if (begin >= end)
 49 |             {
 50 |                 LogError("begin >= end");
 51 |                 return false;
 52 |             }
 53 | 
 54 |             //use mix cut first
 55 |             vector<Unicode> mixRes;
 56 |             if (!_mixSeg.cut(begin, end, mixRes))
 57 |             {
 58 |                 LogError("_mixSeg cut failed.");
 59 |                 return false;
 60 |             }
 61 | 
 62 |             vector<Unicode> fullRes;
 63 |             for (vector<Unicode>::const_iterator mixResItr = mixRes.begin(); mixResItr != mixRes.end(); mixResItr++)
 64 |             {
 65 |                 
 66 |                 // if it's too long, cut with _fullSeg, put fullRes in res
 67 |                 if (mixResItr->size() > _maxWordLen)
 68 |                 {
 69 |                     if (_fullSeg.cut(mixResItr->begin(), mixResItr->end(), fullRes)) // when this fails the long word is dropped without any log message
 70 |                     {
 71 |                        for (vector<Unicode>::const_iterator fullResItr = fullRes.begin(); fullResItr != fullRes.end(); fullResItr++)
 72 |                        {
 73 |                            res.push_back(*fullResItr);
 74 |                        }
 75 | 
 76 |                        //clear tmp res
 77 |                        fullRes.clear();
 78 |                     }
 79 |                 }
 80 |                 else // just use the mix result
 81 |                 {
 82 |                     res.push_back(*mixResItr);
 83 |                 }
 84 |             }
 85 | 
 86 |             return true;
 87 |         }
 88 | 
 89 | 
 90 |         bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res) const // same as above, with every word encoded to a string
 91 |         {
 92 |             if (begin >= end)
 93 |             {
 94 |                 LogError("begin >= end");
 95 |                 return false;
 96 |             }
 97 | 
 98 |             vector<Unicode> uRes;
 99 |             if (!cut(begin, end, uRes))
100 |             {
101 |                 LogError("get unicode cut result error.");
102 |                 return false;
103 |             }
104 | 
105 |             string tmp;
106 |             for (vector<Unicode>::const_iterator uItr = uRes.begin(); uItr != uRes.end(); uItr++)
107 |             {
108 |                 if (TransCode::encode(*uItr, tmp))
109 |                 {
110 |                     res.push_back(tmp);
111 |                 }
112 |                 else
113 |                 {
114 |                     LogError("encode failed."); // the word is skipped; the call still returns true
115 |                 }
116 |             }
117 | 
118 |             return true;
119 |         }
120 |     };
121 | }
122 | 
123 | #endif
124 | 


--------------------------------------------------------------------------------
/src/CppJieba/Limonp/StdExtension.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef LIMONP_STD_EXTEMSION_HPP
  2 | #define LIMONP_STD_EXTEMSION_HPP
  3 | 
  4 | #include <map>
  5 | 
  6 | #if(__cplusplus == 201103L)
  7 | #include <unordered_map>
  8 | #include <unordered_set>
  9 | #else
 10 | #include <tr1/unordered_map>
 11 | #include <tr1/unordered_set>
 12 | namespace std
 13 | {
 14 |     using std::tr1::unordered_map;
 15 |     using std::tr1::unordered_set;
 16 | }
 17 | 
 18 | #endif
 19 | 
 20 | #include <set>
 21 | #include <vector>
 22 | #include <fstream>
 23 | #include <sstream>
 24 | 
 25 | 
 26 | namespace std
 27 | {
 28 |     template<typename T>
 29 |         ostream& operator << (ostream& os, const vector<T>& vec)
 30 |         {
 31 |             if(vec.empty())
 32 |             {
 33 |                 return os << "[]";
 34 |             }
 35 |             os<<"[\""<<vec[0];
 36 |             for(size_t i = 1; i < vec.size(); i++)
 37 |             {
 38 |                 os<<"\", \""<<vec[i];
 39 |             }
 40 |             os<<"\"]";
 41 |             return os;
 42 |         }
 43 |     template<class T1, class T2>
 44 |         ostream& operator << (ostream& os, const pair<T1, T2>& pr)
 45 |         {
 46 |             os << pr.first << ":" << pr.second ;
 47 |             return os;
 48 |         }
 49 | 
 50 | 
 51 |     template<class T>
 52 |         string& operator << (string& str, const T& obj)
 53 |         {
 54 |             stringstream ss;
 55 |             ss << obj; // call ostream& operator << (ostream& os,
 56 |             return str = ss.str();
 57 |         }
 58 | 
 59 |     template<class T1, class T2>
 60 |         ostream& operator << (ostream& os, const map<T1, T2>& mp)
 61 |         {
 62 |             if(mp.empty())
 63 |             {
 64 |                 os<<"{}";
 65 |                 return os;
 66 |             }
 67 |             os<<'{';
 68 |             typename map<T1, T2>::const_iterator it = mp.begin();
 69 |             os<<*it;
 70 |             it++;
 71 |             while(it != mp.end())
 72 |             {
 73 |                 os<<", "<<*it;
 74 |                 it++;
 75 |             }
 76 |             os<<'}';
 77 |             return os;
 78 |         }
 79 |     template<class T1, class T2>
 80 |         ostream& operator << (ostream& os, const std::unordered_map<T1, T2>& mp)
 81 |         {
 82 |             if(mp.empty())
 83 |             {
 84 |                 return os << "{}";
 85 |             }
 86 |             os<<'{';
 87 |             typename std::unordered_map<T1, T2>::const_iterator it = mp.begin();
 88 |             os<<*it;
 89 |             it++;
 90 |             while(it != mp.end())
 91 |             {
 92 |                 os<<", "<<*it++;
 93 |             }
 94 |             return os<<'}';
 95 |         }
 96 | 
 97 |     template<class T>
 98 |         ostream& operator << (ostream& os, const set<T>& st)
 99 |         {
100 |             if(st.empty())
101 |             {
102 |                 os << "{}";
103 |                 return os;
104 |             }
105 |             os<<'{';
106 |             typename set<T>::const_iterator it = st.begin();
107 |             os<<*it;
108 |             it++;
109 |             while(it != st.end())
110 |             {
111 |                 os<<", "<<*it;
112 |                 it++;
113 |             }
114 |             os<<'}';
115 |             return os;
116 |         }
117 | 
118 |     template<class KeyType, class ContainType>
119 |         bool isIn(const ContainType& contain, const KeyType& key)
120 |         {
121 |             return contain.end() != contain.find(key);
122 |         }
123 | 
124 |     template<class T>
125 |         basic_string<T> & operator << (basic_string<T> & s, ifstream & ifs)
126 |         {
127 |             return s.assign((istreambuf_iterator<T>(ifs)), istreambuf_iterator<T>());
128 |         }
129 | 
130 |     template<class T>
131 |         ofstream & operator << (ofstream & ofs, const basic_string<T>& s)
132 |         {
133 |             ostreambuf_iterator<T> itr (ofs);
134 |             copy(s.begin(), s.end(), itr);
135 |             return ofs;
136 |         }
137 | }
138 | 
139 | #endif
140 | 


--------------------------------------------------------------------------------
/src/CppJieba/Limonp/BlockingQueue.hpp:
--------------------------------------------------------------------------------
  1 | /*
  2 | https://github.com/chenshuo/muduo/blob/master/muduo/base/BlockingQueue.h
  3 | */
  4 | 
  5 | #ifndef LIMONP_BLOCKINGQUEUE_HPP
  6 | #define LIMONP_BLOCKINGQUEUE_HPP
  7 | 
  8 | #include <queue>
  9 | #include "BoundedQueue.hpp"
 10 | #include "Condition.hpp"
 11 | 
 12 | namespace Limonp
 13 | {
    /**
     * FIFO queue backed by std::queue whose operations all take mutex_;
     * pop() waits on the notEmpty_ condition until an element is available.
     * No size limit is enforced by this class.
     */
    template<class T>
        class BlockingQueue: NonCopyable
        {
            public:
                // notEmpty_ is constructed on top of mutex_, so mutex_ must be initialised first.
                BlockingQueue()
                    : mutex_(), notEmpty_(mutex_), queue_()
                {
                }

                // Enqueues a copy of x while holding mutex_, then calls notEmpty_.notify().
                void push(const T& x)
                {
                    MutexLockGuard lock(mutex_);
                    queue_.push(x);
                    notEmpty_.notify(); // wait morphing saves us
                }

                // Waits until the queue is non-empty, then removes the front
                // element and returns a copy of it.
                T pop()
                {
                    MutexLockGuard lock(mutex_);
                    // always use a while-loop, due to spurious wakeup
                    while (queue_.empty())
                    {
                        notEmpty_.wait();
                    }
                    assert(!queue_.empty());
                    // Copy the element out before pop() destroys it.
                    T front(queue_.front());
                    queue_.pop();
                    return front;
                }

                // Number of queued elements, read while holding mutex_.
                size_t size() const
                {
                    MutexLockGuard lock(mutex_);
                    return queue_.size();
                }
                // True when size() is 0; the lock is taken inside size().
                bool empty() const
                {
                    return size() == 0;
                }

            private:
                // mutable so the const accessors above can lock it.
                mutable MutexLock mutex_;
                Condition         notEmpty_;
                std::queue<T>     queue_;
        };
 59 | 
    /**
     * FIFO queue backed by BoundedQueue<T>, constructed with maxSize.
     * push() waits on notFull_ while the queue reports full(); pop() waits on
     * notEmpty_ while it reports empty(). Both conditions share mutex_.
     */
    template<typename T>
        class BoundedBlockingQueue : NonCopyable
        {
            public:
                // maxSize is forwarded to the BoundedQueue constructor.
                explicit BoundedBlockingQueue(size_t maxSize)
                    : mutex_(),
                    notEmpty_(mutex_),
                    notFull_(mutex_),
                    queue_(maxSize)
                {}

                // Waits until the queue is not full, enqueues x, then calls notEmpty_.notify().
                void push(const T& x)
                {
                    MutexLockGuard lock(mutex_);
                    // Loop rather than a single check: re-test the predicate after every wake-up.
                    while (queue_.full())
                    {
                        notFull_.wait();
                    }
                    assert(!queue_.full());
                    queue_.push(x);
                    notEmpty_.notify();
                }

                // Waits until the queue is not empty, takes the element returned
                // by queue_.pop(), calls notFull_.notify() and returns the element.
                T pop()
                {
                    MutexLockGuard lock(mutex_);
                    while (queue_.empty())
                    {
                        notEmpty_.wait();
                    }
                    assert(!queue_.empty());
                    T res = queue_.pop();
                    notFull_.notify();
                    return res;
                }

                // The three state queries below each read queue_ while holding mutex_.
                bool empty() const
                {
                    MutexLockGuard lock(mutex_);
                    return queue_.empty();
                }

                bool full() const
                {
                    MutexLockGuard lock(mutex_);
                    return queue_.full();
                }

                size_t size() const
                {
                    MutexLockGuard lock(mutex_);
                    return queue_.size();
                }

                // Read without taking mutex_.
                // NOTE(review): presumably safe because the capacity is fixed at construction - confirm in BoundedQueue.
                size_t capacity() const
                {
                    return queue_.capacity();
                }

            private:
                // mutable so the const accessors above can lock it.
                mutable MutexLock          mutex_;
                Condition                  notEmpty_;
                Condition                  notFull_;
                BoundedQueue<T>  queue_;
        };
125 | 
126 | }
127 | 
128 | #endif
129 | 


--------------------------------------------------------------------------------
/src/CppJieba/MixSegment.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef CPPJIEBA_MIXSEGMENT_H
  2 | #define CPPJIEBA_MIXSEGMENT_H
  3 | 
  4 | #include <cassert>
  5 | #include "MPSegment.hpp"
  6 | #include "HMMSegment.hpp"
  7 | #include "Limonp/StringUtil.hpp"
  8 | 
  9 | namespace CppJieba
 10 | {
 11 |     class MixSegment: public SegmentBase
 12 |     {
 13 |         private:
 14 |             MPSegment _mpSeg;
 15 |             HMMSegment _hmmSeg;
 16 |         public:
 17 |             MixSegment(){};
 18 |             MixSegment(const string& mpSegDict, const string& hmmSegDict, const string& userDict = "")
 19 |             {
 20 |                 LIMONP_CHECK(init(mpSegDict, hmmSegDict, userDict));
 21 |             }
 22 |             virtual ~MixSegment(){}
 23 |         public:
 24 |             bool init(const string& mpSegDict, const string& hmmSegDict, const string& userDict = "")
 25 |             {
 26 |                 LIMONP_CHECK(_mpSeg.init(mpSegDict, userDict));
 27 |                 LIMONP_CHECK(_hmmSeg.init(hmmSegDict));
 28 |                 LogInfo("MixSegment init(%s, %s)", mpSegDict.c_str(), hmmSegDict.c_str());
 29 |                 return true;
 30 |             }
 31 |         public:
 32 |             using SegmentBase::cut;
 33 |         public:
 34 |             virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const
 35 |             {
 36 |                 vector<Unicode> words;
 37 |                 words.reserve(end - begin);
 38 |                 if(!_mpSeg.cut(begin, end, words))
 39 |                 {
 40 |                     LogError("mpSeg cutDAG failed.");
 41 |                     return false;
 42 |                 }
 43 | 
 44 |                 vector<Unicode> hmmRes;
 45 |                 hmmRes.reserve(end - begin);
 46 |                 Unicode piece;
 47 |                 piece.reserve(end - begin);
 48 |                 for (size_t i = 0, j = 0; i < words.size(); i++)
 49 |                 {
 50 |                     //if mp get a word, it's ok, put it into result
 51 |                     if (1 != words[i].size() || (words[i].size() == 1 && _mpSeg.isUserDictSingleChineseWord(words[i][0])))
 52 |                     {
 53 |                         res.push_back(words[i]);
 54 |                         continue;
 55 |                     }
 56 | 
 57 |                     // if mp get a single one and it is not in userdict, collect it in sequence
 58 |                     j = i;
 59 |                     while (j < words.size() && 1 == words[j].size() && !_mpSeg.isUserDictSingleChineseWord(words[j][0]))
 60 |                     {
 61 |                         piece.push_back(words[j][0]);
 62 |                         j++;
 63 |                     }
 64 | 
 65 |                     // cut the sequence with hmm
 66 |                     if (!_hmmSeg.cut(piece.begin(), piece.end(), hmmRes))
 67 |                     {
 68 |                         LogError("_hmmSeg cut failed.");
 69 |                         return false;
 70 |                     }
 71 | 
 72 |                     //put hmm result to result
 73 |                     for (size_t k = 0; k < hmmRes.size(); k++)
 74 |                     {
 75 |                         res.push_back(hmmRes[k]);
 76 |                     }
 77 | 
 78 |                     //clear tmp vars
 79 |                     piece.clear();
 80 |                     hmmRes.clear();
 81 | 
 82 |                     //let i jump over this piece
 83 |                     i = j - 1;
 84 |                 }
 85 |                 return true;
 86 |             }
 87 | 
 88 |             virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const
 89 |             {
 90 |                 if(begin == end)
 91 |                 {
 92 |                     return false;
 93 |                 }
 94 | 
 95 |                 vector<Unicode> uRes;
 96 |                 uRes.reserve(end - begin);
 97 |                 if (!cut(begin, end, uRes))
 98 |                 {
 99 |                     return false;
100 |                 }
101 | 
102 |                 size_t offset = res.size();
103 |                 res.resize(res.size() + uRes.size());
104 |                 for(size_t i = 0; i < uRes.size(); i ++, offset++)
105 |                 {
106 |                     if(!TransCode::encode(uRes[i], res[offset]))
107 |                     {
108 |                         LogError("encode failed.");
109 |                     }
110 |                 }
111 |                 return true;
112 |             }
113 | 
114 |             const DictTrie* getDictTrie() const 
115 |             {
116 |                 return _mpSeg.getDictTrie();
117 |             }
118 |     };
119 | }
120 | 
121 | #endif
122 | 


--------------------------------------------------------------------------------
/src/CppJieba/Limonp/MysqlClient.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef LIMONP_MYSQLCLIENT_H
  2 | #define LIMONP_MYSQLCLIENT_H
  3 | 
  4 | #include <mysql.h>
  5 | #include <iostream>
  6 | #include <vector>
  7 | #include <string>
  8 | #include "Logger.hpp"
  9 | #include "InitOnOff.hpp"
 10 | 
 11 | namespace Limonp
 12 | {
 13 |     using namespace std;
 14 |     class MysqlClient: public InitOnOff
 15 |     {
 16 |         public:
 17 |             typedef vector< vector<string> > RowsType;
 18 |         private:
 19 |             const string host_;
 20 |             const size_t port_;
 21 |             const string user_;
 22 |             const string passwd_;
 23 |             const string db_;
 24 |             const string charset_;
 25 |         public:
 26 |             MysqlClient(const string& host, size_t port, const string& user, const string& passwd, const string& db, const string& charset = "utf8"): host_(host), port_(port), user_(user), passwd_(passwd), db_(db), charset_(charset), conn_(NULL)
 27 |             {
 28 |                 setInitFlag_(init_());
 29 |             }
 30 |             ~MysqlClient()
 31 |             {
 32 |                 if(conn_)
 33 |                 {
 34 |                     mysql_close(conn_);
 35 |                 }
 36 |             };
 37 |         private:
 38 |             bool init_()
 39 |             {
 40 |                 //cout<<mysql_get_client_info()<<endl;
 41 |                 if(NULL == (conn_ = mysql_init(NULL)))
 42 |                 {
 43 |                     LogError("mysql_init faield. %s", mysql_error(conn_));
 44 |                     return false;
 45 |                 }
 46 | 
 47 |                 if (mysql_real_connect(conn_, host_.c_str(), user_.c_str(), passwd_.c_str(), db_.c_str(), port_, NULL, 0) == NULL)
 48 |                 {
 49 |                     LogError("mysql_real_connect failed. %s", mysql_error(conn_));
 50 |                     mysql_close(conn_);
 51 |                     conn_ = NULL;
 52 |                     return false;
 53 |                 }  
 54 | 
 55 |                 if(mysql_set_character_set(conn_, charset_.c_str()))
 56 |                 {
 57 |                     LogError("mysql_set_character_set [%s] failed.", charset_.c_str());
 58 |                     return false;
 59 |                 }
 60 | 
 61 |                 //set reconenct
 62 |                 char value = 1;
 63 |                 mysql_options(conn_, MYSQL_OPT_RECONNECT, &value);
 64 | 
 65 |                 LogInfo("MysqlClient {host: %s, database:%s, charset:%s}", host_.c_str(), db_.c_str(), charset_.c_str());
 66 |                 return true;
 67 |             }
 68 |         public:
 69 |             bool executeSql(const string& sql)
 70 |             {
 71 |                 assert(getInitFlag_());
 72 |                 if(mysql_query(conn_, sql.c_str())) 
 73 |                 {
 74 |                     LogError("mysql_query failed.  %s", mysql_error(conn_));
 75 |                     return false;
 76 |                 }
 77 |                 return true;
 78 |             }
 79 |             size_t insert(const string& tableName, const string& keys, const vector<string>& vals)
 80 |             {
 81 |                 size_t retn = 0;
 82 |                 string sql;
 83 |                 for(size_t i = 0; i < vals.size(); i ++)
 84 |                 {
 85 |                     sql.clear();
 86 |                     string_format(sql, "insert into %s (%s) values %s", tableName.c_str(), keys.c_str(), vals[i].c_str());
 87 |                     retn += executeSql(sql.c_str());
 88 |                 }
 89 |                 return retn;
 90 |             }
 91 |             bool select(const string& sql, RowsType& rows)
 92 |             {
 93 |                 if(!executeSql(sql))
 94 |                 {
 95 |                     LogError("executeSql failed. [%s]", sql.c_str());
 96 |                     return false;
 97 |                 }
 98 |                 MYSQL_RES * result = mysql_store_result(conn_);
 99 |                 if(!result)
100 |                 {
101 |                     LogError("mysql_store_result failed.[%d]", mysql_error(conn_));
102 |                     return false;
103 |                 }
104 |                 size_t num_fields = mysql_num_fields(result);
105 |                 MYSQL_ROW row;
106 |                 while((row = mysql_fetch_row(result)))
107 |                 {
108 |                     vector<string> vec;
109 |                     for(size_t i = 0; i < num_fields; i ++)
110 |                     {
111 |                         row[i] ? vec.push_back(row[i]) : vec.push_back("NULL");
112 |                     }
113 |                     rows.push_back(vec);
114 |                 }
115 |                 mysql_free_result(result);
116 |                 return true;
117 |             }
118 | 
119 |         private:
120 |             MYSQL * conn_;
121 | 
122 |     };
123 | }
124 | 
125 | #endif
126 | 


--------------------------------------------------------------------------------
/src/CppJieba/MPSegment.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef CPPJIEBA_MPSEGMENT_H
  2 | #define CPPJIEBA_MPSEGMENT_H
  3 | 
  4 | #include <algorithm>
  5 | #include <set>
  6 | #include <cassert>
  7 | #include "Limonp/Logger.hpp"
  8 | #include "DictTrie.hpp"
  9 | #include "ISegment.hpp"
 10 | #include "SegmentBase.hpp"
 11 | 
 12 | namespace CppJieba
 13 | {
 14 | 
 15 |     class MPSegment: public SegmentBase
 16 |     {
 17 |         private:
 18 |             DictTrie _dictTrie;
 19 | 
 20 |         public:
 21 |             MPSegment(){};
 22 |             MPSegment(const string& dictPath, const string& userDictPath = "")
 23 |             {
 24 |                 LIMONP_CHECK(init(dictPath, userDictPath));
 25 |             };
 26 |             virtual ~MPSegment(){};
 27 |         public:
 28 |             bool init(const string& dictPath, const string& userDictPath = "")
 29 |             {
 30 |                 LIMONP_CHECK(_dictTrie.init(dictPath, userDictPath));
 31 |                 LogInfo("MPSegment init(%s) ok", dictPath.c_str());
 32 |                 return true;
 33 |             }
 34 |             bool isUserDictSingleChineseWord(const Unicode::value_type & value) const
 35 |             {
 36 |                 return _dictTrie.isUserDictSingleChineseWord(value);
 37 |             }
 38 |         public:
 39 |             using SegmentBase::cut;
 40 |             virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const
 41 |             {
 42 |                 if(begin == end)
 43 |                 {
 44 |                     return false;
 45 |                 }
 46 | 
 47 |                 vector<Unicode> words;
 48 |                 words.reserve(end - begin);
 49 |                 if(!cut(begin, end, words))
 50 |                 {
 51 |                     return false;
 52 |                 }
 53 |                 size_t offset = res.size();
 54 |                 res.resize(res.size() + words.size());
 55 |                 for(size_t i = 0; i < words.size(); i++)
 56 |                 {
 57 |                     if(!TransCode::encode(words[i], res[i + offset]))
 58 |                     {
 59 |                         LogError("encode failed.");
 60 |                         res[i + offset].clear();
 61 |                     }
 62 |                 }
 63 |                 return true;
 64 |             }
 65 | 
 66 |             bool cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector<Unicode>& res) const
 67 |             {
 68 |                 if(end == begin)
 69 |                 {
 70 |                     return false;
 71 |                 }
 72 |                 vector<SegmentChar> segmentChars;
 73 | 
 74 |                 _dictTrie.find(begin, end, segmentChars);
 75 | 
 76 |                 _calcDP(segmentChars);
 77 | 
 78 |                 _cut(segmentChars, res);
 79 | 
 80 |                 return true;
 81 |             }
 82 |             const DictTrie* getDictTrie() const 
 83 |             {
 84 |                 return &_dictTrie;
 85 |             }
 86 | 
 87 |         private:
 88 |             void _calcDP(vector<SegmentChar>& segmentChars) const
 89 |             {
 90 |                 size_t nextPos;
 91 |                 const DictUnit* p;
 92 |                 double val;
 93 | 
 94 |                 for(ssize_t i = segmentChars.size() - 1; i >= 0; i--)
 95 |                 {
 96 |                     segmentChars[i].pInfo = NULL;
 97 |                     segmentChars[i].weight = MIN_DOUBLE;
 98 |                     assert(!segmentChars[i].dag.empty());
 99 |                     for(DagType::const_iterator it = segmentChars[i].dag.begin(); it != segmentChars[i].dag.end(); it++)
100 |                     {
101 |                         nextPos = it->first;
102 |                         p = it->second;
103 |                         val = 0.0;
104 |                         if(nextPos + 1 < segmentChars.size())
105 |                         {
106 |                             val += segmentChars[nextPos + 1].weight;
107 |                         }
108 | 
109 |                         if(p)
110 |                         {
111 |                             val += p->weight; 
112 |                         }
113 |                         else
114 |                         {
115 |                             val += _dictTrie.getMinWeight();
116 |                         }
117 |                         if(val > segmentChars[i].weight)
118 |                         {
119 |                             segmentChars[i].pInfo = p;
120 |                             segmentChars[i].weight = val;
121 |                         }
122 |                     }
123 |                 }
124 |             }
125 |             void _cut(const vector<SegmentChar>& segmentChars, vector<Unicode>& res) const
126 |             {
127 |                 size_t i = 0;
128 |                 while(i < segmentChars.size())
129 |                 {
130 |                     const DictUnit* p = segmentChars[i].pInfo;
131 |                     if(p)
132 |                     {
133 |                         res.push_back(p->word);
134 |                         i += p->word.size();
135 |                     }
136 |                     else//single chinese word
137 |                     {
138 |                         res.push_back(Unicode(1, segmentChars[i].uniCh));
139 |                         i++;
140 |                     }
141 |                 }
142 |             }
143 | 
144 | 
145 |     };
146 | }
147 | 
148 | #endif
149 | 


--------------------------------------------------------------------------------
/src/CppJieba/FullSegment.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef CPPJIEBA_FULLSEGMENT_H
  2 | #define CPPJIEBA_FULLSEGMENT_H
  3 | 
  4 | #include <algorithm>
  5 | #include <set>
  6 | #include <cassert>
  7 | #include "Limonp/Logger.hpp"
  8 | #include "DictTrie.hpp"
  9 | #include "ISegment.hpp"
 10 | #include "SegmentBase.hpp"
 11 | #include "TransCode.hpp"
 12 | 
 13 | namespace CppJieba
 14 | {
 15 |     class FullSegment: public SegmentBase
 16 |     {
 17 |         private:
 18 |             const DictTrie* _dictTrie;
 19 |             bool _isBorrowed;
 20 |         public:
 21 |             FullSegment()
 22 |             {
 23 |                 _dictTrie = NULL;
 24 |                 _isBorrowed = false;
 25 |             }
 26 |             explicit FullSegment(const string& dictPath)
 27 |             {
 28 |                 _dictTrie = NULL;
 29 |                 init(dictPath);
 30 |             }
 31 |             explicit FullSegment(const DictTrie* dictTrie) 
 32 |             {
 33 |                 _dictTrie = NULL;
 34 |                 init(dictTrie);
 35 |             }
 36 |             virtual ~FullSegment()
 37 |             {
 38 |                 if(_dictTrie && ! _isBorrowed) 
 39 |                 {
 40 |                     delete _dictTrie;
 41 |                 }
 42 | 
 43 |             };
 44 |         public:
 45 |             bool init(const string& dictPath)
 46 |             {
 47 |                 assert(_dictTrie == NULL);
 48 |                 _dictTrie = new DictTrie(dictPath);
 49 |                 _isBorrowed = false;
 50 |                 return true;
 51 |             }
 52 |             bool init(const DictTrie* dictTrie) 
 53 |             {
 54 |                 assert(_dictTrie == NULL);
 55 |                 assert(dictTrie);
 56 |                 _dictTrie = dictTrie;
 57 |                 _isBorrowed = true;
 58 |                 return true;
 59 |             }
 60 | 
 61 |         public:
 62 |             using SegmentBase::cut;
 63 | 
 64 |         public:
 65 |             bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const
 66 |             {
 67 |                 assert(_dictTrie);
 68 |                 if (begin >= end)
 69 |                 {
 70 |                     LogError("begin >= end");
 71 |                     return false;
 72 |                 }
 73 | 
 74 |                 //resut of searching in trie tree
 75 |                 DagType tRes;
 76 | 
 77 |                 //max index of res's words
 78 |                 int maxIdx = 0;
 79 | 
 80 |                 // always equals to (uItr - begin)
 81 |                 int uIdx = 0;
 82 | 
 83 |                 //tmp variables
 84 |                 int wordLen = 0;
 85 |                 for (Unicode::const_iterator uItr = begin; uItr != end; uItr++)
 86 |                 {
 87 |                     //find word start from uItr
 88 |                     if (_dictTrie->find(uItr, end, tRes, 0))
 89 |                     {
 90 |                         for(DagType::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++)
 91 |                         //for (vector<pair<size_t, const DictUnit*> >::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++)
 92 |                         {
 93 |                             wordLen = itr->second->word.size();
 94 |                             if (wordLen >= 2 || (tRes.size() == 1 && maxIdx <= uIdx))
 95 |                             {
 96 |                                 res.push_back(itr->second->word);
 97 |                             }
 98 |                             maxIdx = uIdx+wordLen > maxIdx ? uIdx+wordLen : maxIdx;
 99 |                         }
100 |                         tRes.clear();
101 |                     }
102 |                     else // not found word start from uItr
103 |                     {
104 |                         if (maxIdx <= uIdx) // never exist in prev results
105 |                         {
106 |                             //put itr itself in res
107 |                             res.push_back(Unicode(1, *uItr));
108 | 
109 |                             //mark it exits
110 |                             ++maxIdx;
111 |                         }
112 |                     }
113 |                     ++uIdx;
114 |                 }
115 | 
116 |                 return true;
117 |             }
118 | 
119 |             bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res) const
120 |             {
121 |                 assert(_dictTrie);
122 |                 if (begin >= end)
123 |                 {
124 |                     LogError("begin >= end");
125 |                     return false;
126 |                 }
127 | 
128 |                 vector<Unicode> uRes;
129 |                 if (!cut(begin, end, uRes))
130 |                 {
131 |                     LogError("get unicode cut result error.");
132 |                     return false;
133 |                 }
134 | 
135 |                 string tmp;
136 |                 for (vector<Unicode>::const_iterator uItr = uRes.begin(); uItr != uRes.end(); uItr++)
137 |                 {
138 |                     if (TransCode::encode(*uItr, tmp))
139 |                     {
140 |                         res.push_back(tmp);
141 |                     }
142 |                     else
143 |                     {
144 |                         LogError("encode failed.");
145 |                     }
146 |                 }
147 | 
148 |                 return true;
149 |             }
150 |     };
151 | }
152 | 
153 | #endif
154 | 


--------------------------------------------------------------------------------
/src/CppJieba/Limonp/LocalVector.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef LIMONP_LOCAL_VECTOR_HPP
  2 | #define LIMONP_LOCAL_VECTOR_HPP
  3 | 
  4 | #include <iostream>
  5 | #include <stdlib.h>
  6 | #include <assert.h>
  7 | #include <string.h>
  8 | 
  9 | namespace Limonp
 10 | {
 11 |     using namespace std;
 12 |     /*
 13 |      * LocalVector<T> : T must be primitive type (char , int, size_t), if T is struct or class, LocalVector<T> may be dangerous..
 14 |      * LocalVector<T> is simple and not well-tested. 
 15 |      */
 16 |     const size_t LOCAL_VECTOR_BUFFER_SIZE = 16;
 17 |     template <class T>
 18 |         class LocalVector
 19 |         {
 20 |             public:
 21 |                 typedef const T* const_iterator ;
 22 |                 typedef T value_type;
 23 |                 typedef size_t size_type;
 24 |             private:
 25 |                 T buffer_[LOCAL_VECTOR_BUFFER_SIZE];
 26 |                 T * ptr_;
 27 |                 size_t size_;
 28 |                 size_t capacity_;
 29 |             public:
 30 |                 LocalVector()
 31 |                 {
 32 |                     init_();
 33 |                 };
 34 |                 LocalVector(const LocalVector<T>& vec)
 35 |                 {
 36 |                     init_();
 37 |                     *this = vec;
 38 |                 }
 39 |                 LocalVector(const_iterator  begin, const_iterator end) // TODO: make it faster
 40 |                 {
 41 |                     init_();
 42 |                     while(begin != end)
 43 |                     {
 44 |                         push_back(*begin++);
 45 |                     }
 46 |                 }
 47 |                 LocalVector(size_t size, const T& t) // TODO: make it faster
 48 |                 {
 49 |                     init_();
 50 |                     while(size--)
 51 |                     {
 52 |                         push_back(t);
 53 |                     }
 54 |                 }
 55 |                 ~LocalVector()
 56 |                 {
 57 |                     if(ptr_ != buffer_)
 58 |                     {
 59 |                         free(ptr_);
 60 |                     }
 61 |                 };
 62 |             public:
 63 |                 LocalVector<T>& operator = (const LocalVector<T>& vec)
 64 |                 {
 65 |                     clear();
 66 |                     size_ = vec.size();
 67 |                     capacity_ = vec.capacity();
 68 |                     if(vec.buffer_ == vec.ptr_)
 69 |                     {
 70 |                         memcpy(buffer_, vec.buffer_, sizeof(T) * size_);
 71 |                         ptr_ = buffer_;
 72 |                     }
 73 |                     else
 74 |                     {
 75 |                         ptr_ = (T*) malloc(vec.capacity() * sizeof(T));
 76 |                         assert(ptr_);
 77 |                         memcpy(ptr_, vec.ptr_, vec.size() * sizeof(T));
 78 |                     }
 79 |                     return *this;
 80 |                 }
 81 |             private:
 82 |                 void init_()
 83 |                 {
 84 |                     ptr_ = buffer_;
 85 |                     size_ = 0;
 86 |                     capacity_ = LOCAL_VECTOR_BUFFER_SIZE;
 87 |                 }
 88 |             public:
 89 |                 T& operator [] (size_t i) 
 90 |                 {
 91 |                     return ptr_[i];
 92 |                 }
 93 |                 const T& operator [] (size_t i) const
 94 |                 {
 95 |                     return ptr_[i];
 96 |                 }
 97 |                 void push_back(const T& t)
 98 |                 {
 99 |                     if(size_ == capacity_)
100 |                     {
101 |                         assert(capacity_);
102 |                         reserve(capacity_ * 2);
103 |                     }
104 |                     ptr_[size_ ++ ] = t;
105 |                 }
106 |                 void reserve(size_t size) 
107 |                 {
108 |                     if(size <= capacity_)
109 |                     {
110 |                         return;
111 |                     }
112 |                     T * next =  (T*)malloc(sizeof(T) * size);
113 |                     assert(next);
114 |                     T * old = ptr_;
115 |                     ptr_ = next;
116 |                     memcpy(ptr_, old, sizeof(T) * capacity_);
117 |                     capacity_ = size;
118 |                     if(old != buffer_)
119 |                     {
120 |                         free(old);
121 |                     }
122 |                 }
123 |                 bool empty() const
124 |                 {
125 |                     return 0 == size();
126 |                 }
127 |                 size_t size() const
128 |                 {
129 |                     return size_;
130 |                 }
131 |                 size_t capacity() const
132 |                 {
133 |                     return capacity_;
134 |                 }
135 |                 const_iterator begin() const
136 |                 {
137 |                     return ptr_;
138 |                 }
139 |                 const_iterator end() const
140 |                 {
141 |                     return ptr_ + size_;
142 |                 }
143 |                 void clear()
144 |                 {
145 |                     if(ptr_ != buffer_)
146 |                     {
147 |                         free(ptr_);
148 |                     }
149 |                     init_();
150 |                 }
151 |         };
152 | 
153 |     template <class T>
154 |         ostream & operator << (ostream& os, const LocalVector<T>& vec)
155 |         {
156 |             if(vec.empty())
157 |             {
158 |                 return os << "[]";
159 |             }
160 |             os<<"[\""<<vec[0];
161 |             for(size_t i = 1; i < vec.size(); i++)
162 |             {
163 |                 os<<"\", \""<<vec[i];
164 |             }
165 |             os<<"\"]";
166 |             return os;
167 |         }
168 | 
169 | }
170 | 
171 | #endif
172 | 


--------------------------------------------------------------------------------
/src/CppJieba/KeywordExtractor.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef CPPJIEBA_KEYWORD_EXTRACTOR_H
  2 | #define CPPJIEBA_KEYWORD_EXTRACTOR_H
  3 | 
  4 | #include "MixSegment.hpp"
  5 | #include <cmath>
  6 | #include <set>
  7 | 
  8 | namespace CppJieba
  9 | {
 10 |     using namespace Limonp;
 11 | 
 12 |     /*utf8*/
 13 |     class KeywordExtractor
 14 |     {
 15 |         private:
 16 |             MixSegment _segment;
 17 |         private:
 18 |             unordered_map<string, double> _idfMap;
 19 |             double _idfAverage;
 20 | 
 21 |             unordered_set<string> _stopWords;
 22 |         public:
 23 |             KeywordExtractor(){};
 24 |             KeywordExtractor(const string& dictPath, const string& hmmFilePath, const string& idfPath, const string& stopWordPath)
 25 |             {
 26 |                 LIMONP_CHECK(init(dictPath, hmmFilePath, idfPath, stopWordPath));
 27 |             };
 28 |             ~KeywordExtractor(){};
 29 | 
 30 |         public:
 31 |             bool init(const string& dictPath, const string& hmmFilePath, const string& idfPath, const string& stopWordPath)
 32 |             {
 33 |                 _loadIdfDict(idfPath);
 34 |                 _loadStopWordDict(stopWordPath);
 35 |                 LIMONP_CHECK(_segment.init(dictPath, hmmFilePath));
 36 |                 return true;
 37 |             };
 38 |         public:
 39 | 
 40 |             bool extract(const string& str, vector<string>& keywords, size_t topN) const
 41 |             {
 42 |                 vector<pair<string, double> > topWords;
 43 |                 if(!extract(str, topWords, topN))
 44 |                 {
 45 |                     return false;
 46 |                 }
 47 |                 for(size_t i = 0; i < topWords.size(); i++)
 48 |                 {
 49 |                     keywords.push_back(topWords[i].first);
 50 |                 }
 51 |                 return true;
 52 |             }
 53 | 
 54 |             bool extract(const string& str, vector<pair<string, double> >& keywords, size_t topN) const
 55 |             {
 56 |                 vector<string> words;
 57 |                 if(!_segment.cut(str, words))
 58 |                 {
 59 |                     LogError("segment cut(%s) failed.", str.c_str());
 60 |                     return false;
 61 |                 }
 62 | 
 63 |                 map<string, double> wordmap;
 64 |                 for(vector<string>::iterator iter = words.begin(); iter != words.end(); iter++)
 65 |                 {
 66 |                     if(_isSingleWord(*iter))
 67 |                     {
 68 |                         continue;
 69 |                     }
 70 |                     wordmap[*iter] += 1.0;
 71 |                 }
 72 | 
 73 |                 for(map<string, double>::iterator itr = wordmap.begin(); itr != wordmap.end(); )
 74 |                 {
 75 |                     if(_stopWords.end() != _stopWords.find(itr->first))
 76 |                     {
 77 |                         wordmap.erase(itr++);
 78 |                         continue;
 79 |                     }
 80 | 
 81 |                     unordered_map<string, double>::const_iterator cit = _idfMap.find(itr->first);
 82 |                     if(cit != _idfMap.end())
 83 |                     {
 84 |                         itr->second *= cit->second;
 85 |                     }
 86 |                     else
 87 |                     {
 88 |                         itr->second *= _idfAverage;
 89 |                     }
 90 |                     itr ++;
 91 |                 }
 92 | 
 93 |                 keywords.clear();
 94 |                 std::copy(wordmap.begin(), wordmap.end(), std::inserter(keywords, keywords.begin()));
 95 |                 topN = min(topN, keywords.size());
 96 |                 partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), _cmp);
 97 |                 keywords.resize(topN);
 98 |                 return true;
 99 |             }
100 |         private:
101 |             void _loadIdfDict(const string& idfPath)
102 |             {
103 |                 ifstream ifs(idfPath.c_str());
104 |                 if(!ifs)
105 |                 {
106 |                     LogError("open %s failed.", idfPath.c_str());
107 |                     assert(false);
108 |                 }
109 |                 string line ;
110 |                 vector<string> buf;
111 |                 double idf = 0.0;
112 |                 double idfSum = 0.0;
113 |                 size_t lineno = 0;
114 |                 for(;getline(ifs, line); lineno++)
115 |                 {
116 |                     buf.clear();
117 |                     if(line.empty())
118 |                     {
119 |                         LogError("line[%d] empty. skipped.", lineno);
120 |                         continue;
121 |                     }
122 |                     if(!split(line, buf, " ") || buf.size() != 2)
123 |                     {
124 |                         LogError("line %d [%s] illegal. skipped.", lineno, line.c_str());
125 |                         continue;
126 |                     }
127 |                     idf = atof(buf[1].c_str());
128 |                     _idfMap[buf[0]] = idf;
129 |                     idfSum += idf;
130 | 
131 |                 } 
132 | 
133 |                 assert(lineno);
134 |                 _idfAverage = idfSum / lineno;
135 |                 assert(_idfAverage > 0.0);
136 |             }
137 |             void _loadStopWordDict(const string& filePath)
138 |             {
139 |                 ifstream ifs(filePath.c_str());
140 |                 if(!ifs)
141 |                 {
142 |                     LogError("open %s failed.", filePath.c_str());
143 |                     assert(false);
144 |                 }
145 |                 string line ;
146 |                 while(getline(ifs, line))
147 |                 {
148 |                     _stopWords.insert(line);
149 |                 }
150 |                 assert(_stopWords.size());
151 |             }
152 |         private:
153 |             bool _isSingleWord(const string& str) const
154 |             {
155 |                 Unicode unicode;
156 |                 TransCode::decode(str, unicode);
157 |                 if(unicode.size() == 1)
158 |                   return true;
159 |                 return false;
160 |             }
161 | 
162 |         private:
163 |             static bool _cmp(const pair<string, double>& lhs, const pair<string, double>& rhs)
164 |             {
165 |                 return lhs.second > rhs.second;
166 |             }
167 |             
168 |     };
169 | }
170 | 
171 | #endif
172 | 
173 | 
174 | 


--------------------------------------------------------------------------------
/src/CppJieba/DictTrie.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef CPPJIEBA_DICT_TRIE_HPP
  2 | #define CPPJIEBA_DICT_TRIE_HPP
  3 | 
  4 | #include <iostream>
  5 | #include <fstream>
  6 | #include <map>
  7 | #include <cstring>
  8 | #include <stdint.h>
  9 | #include <cmath>
 10 | #include <limits>
 11 | #include "Limonp/StringUtil.hpp"
 12 | #include "Limonp/Logger.hpp"
 13 | #include "TransCode.hpp"
 14 | #include "Trie.hpp"
 15 | 
 16 | 
 17 | 
 18 | namespace CppJieba
 19 | {
 20 |     using namespace Limonp;
 21 |     const double MIN_DOUBLE = -3.14e+100;
 22 |     const double MAX_DOUBLE = 3.14e+100;
 23 |     const size_t DICT_COLUMN_NUM = 3;
 24 |     const char* const UNKNOWN_TAG = "";
 25 | 
 26 |     class DictTrie
 27 |     {
 28 |         private:
 29 |             vector<DictUnit> _nodeInfos;
 30 |             Trie * _trie;
 31 | 
 32 |             double _minWeight;
 33 |         private:
 34 |             unordered_set<Unicode::value_type> _userDictSingleChineseWord;
 35 |         public:
 36 |             bool isUserDictSingleChineseWord(const Unicode::value_type& word) const
 37 |             {
 38 |                 return isIn(_userDictSingleChineseWord, word);
 39 |             }
 40 |         public:
 41 |             double getMinWeight() const {return _minWeight;};
 42 | 
 43 |         public:
 44 |             DictTrie()
 45 |             {
 46 |                 _trie = NULL;
 47 |                 _minWeight = MAX_DOUBLE;
 48 |             }
 49 |             DictTrie(const string& dictPath, const string& userDictPath = "")
 50 |             {
 51 |                 new (this) DictTrie();
 52 |                 init(dictPath, userDictPath);
 53 |             }
 54 |             ~DictTrie()
 55 |             {
 56 |                 if(_trie)
 57 |                 {
 58 |                     delete _trie;
 59 |                 }
 60 |             }
 61 |             
 62 |         public:
 63 |             bool init(const string& dictPath, const string& userDictPath = "")
 64 |             {
 65 |                 assert(!_trie);
 66 |                 _loadDict(dictPath);
 67 |                 _calculateWeight(_nodeInfos);
 68 |                 _minWeight = _findMinWeight(_nodeInfos);
 69 |                 
 70 |                 if(userDictPath.size())
 71 |                 {
 72 |                     double maxWeight = _findMaxWeight(_nodeInfos);
 73 |                     _loadUserDict(userDictPath, maxWeight, UNKNOWN_TAG);
 74 |                 }
 75 |                 _shrink(_nodeInfos);
 76 |                 _trie = _createTrie(_nodeInfos);
 77 |                 assert(_trie);
 78 |                 return true;
 79 |             }
 80 | 
 81 |         public:
 82 |             const DictUnit* find(Unicode::const_iterator begin, Unicode::const_iterator end) const
 83 |             {
 84 |                 return _trie->find(begin, end);
 85 |             }
 86 |             bool find(Unicode::const_iterator begin, Unicode::const_iterator end, DagType& dag, size_t offset = 0) const
 87 |             {
 88 |                 return _trie->find(begin, end, dag, offset);
 89 |             }
 90 |             void find(
 91 |                         Unicode::const_iterator begin, 
 92 |                         Unicode::const_iterator end, 
 93 |                         vector<SegmentChar>& res
 94 |                         ) const
 95 |             {
 96 |                 _trie->find(begin, end, res);
 97 |             }
 98 | 
 99 | 
100 |         private:
101 |             Trie * _createTrie(const vector<DictUnit>& dictUnits)
102 |             {
103 |                 assert(dictUnits.size());
104 |                 vector<Unicode> words;
105 |                 vector<const DictUnit*> valuePointers;
106 |                 for(size_t i = 0 ; i < dictUnits.size(); i ++)
107 |                 {
108 |                     words.push_back(dictUnits[i].word);
109 |                     valuePointers.push_back(&dictUnits[i]);
110 |                 }
111 | 
112 |                 Trie * trie = new Trie(words, valuePointers);
113 |                 return trie;
114 |             }
115 |             void _loadUserDict(const string& filePath, double defaultWeight, const string& defaultTag)
116 |             {
117 |                 ifstream ifs(filePath.c_str());
118 |                 assert(ifs);
119 |                 string line;
120 |                 DictUnit nodeInfo;
121 |                 vector<string> buf;
122 |                 size_t lineno;
123 |                 for(lineno = 0; getline(ifs, line); lineno++)
124 |                 {
125 |                     buf.clear();
126 |                     split(line, buf, " ");
127 |                     assert(buf.size() >= 1);
128 |                     if(!TransCode::decode(buf[0], nodeInfo.word))
129 |                     {
130 |                         LogError("line[%u:%s] illegal.", lineno, line.c_str());
131 |                         continue;
132 |                     }
133 |                     if(nodeInfo.word.size() == 1)
134 |                     {
135 |                         _userDictSingleChineseWord.insert(nodeInfo.word[0]);
136 |                     }
137 |                     nodeInfo.weight = defaultWeight;
138 |                     nodeInfo.tag = (buf.size() == 2 ? buf[1] : defaultTag);
139 |                     _nodeInfos.push_back(nodeInfo);
140 |                 }
141 |                 LogInfo("load userdict[%s] ok. lines[%u]", filePath.c_str(), lineno);
142 |             }
143 |             void _loadDict(const string& filePath) 
144 |             {
145 |                 ifstream ifs(filePath.c_str());
146 |                 assert(ifs);
147 |                 string line;
148 |                 vector<string> buf;
149 | 
150 |                 DictUnit nodeInfo;
151 |                 for(size_t lineno = 0 ; getline(ifs, line); lineno++)
152 |                 {
153 |                     split(line, buf, " ");
154 |                     assert(buf.size() == DICT_COLUMN_NUM);
155 |                     
156 |                     if(!TransCode::decode(buf[0], nodeInfo.word))
157 |                     {
158 |                         LogError("line[%u:%s] illegal.", lineno, line.c_str());
159 |                         continue;
160 |                     }
161 |                     nodeInfo.weight = atof(buf[1].c_str());
162 |                     nodeInfo.tag = buf[2];
163 |                     
164 |                     _nodeInfos.push_back(nodeInfo);
165 |                 }
166 |             }
167 |             double _findMinWeight(const vector<DictUnit>& nodeInfos) const
168 |             {
169 |                 double ret = MAX_DOUBLE;
170 |                 for(size_t i = 0; i < nodeInfos.size(); i++)
171 |                 {
172 |                     ret = min(nodeInfos[i].weight, ret);
173 |                 }
174 |                 return ret;
175 |             }
176 |             double _findMaxWeight(const vector<DictUnit>& nodeInfos) const
177 |             {
178 |                 double ret = MIN_DOUBLE;
179 |                 for(size_t i = 0; i < nodeInfos.size(); i++)
180 |                 {
181 |                     ret = max(nodeInfos[i].weight, ret);
182 |                 }
183 |                 return ret;
184 |             }
185 | 
186 |             void _calculateWeight(vector<DictUnit>& nodeInfos) const
187 |             {
188 |                 double sum = 0.0;
189 |                 for(size_t i = 0; i < nodeInfos.size(); i++)
190 |                 {
191 |                     sum += nodeInfos[i].weight;
192 |                 }
193 |                 assert(sum);
194 |                 for(size_t i = 0; i < nodeInfos.size(); i++)
195 |                 {
196 |                     DictUnit& nodeInfo = nodeInfos[i];
197 |                     assert(nodeInfo.weight);
198 |                     nodeInfo.weight = log(double(nodeInfo.weight)/double(sum));
199 |                 }
200 |             }
201 | 
202 |             void _shrink(vector<DictUnit>& units) const
203 |             {
204 |                 vector<DictUnit>(units.begin(), units.end()).swap(units);
205 |             }
206 | 
207 | 
208 |     };
209 | }
210 | 
211 | #endif
212 | 


--------------------------------------------------------------------------------
/src/CppJieba/Limonp/StringUtil.hpp:
--------------------------------------------------------------------------------
  1 | /************************************
  2 |  * file enc : ascii
  3 |  * author   : wuyanyi09@gmail.com
  4 |  ************************************/
  5 | #ifndef LIMONP_STR_FUNCTS_H
  6 | #define LIMONP_STR_FUNCTS_H
  7 | #include <fstream>
  8 | #include <iostream>
  9 | #include <string>
 10 | #include <vector>
 11 | #include <algorithm>
 12 | #include <cctype>
 13 | #include <map>
 14 | #include <stdint.h>
 15 | #include <stdio.h>
 16 | #include <stdarg.h>
 17 | #include <memory.h>
 18 | #include <functional> 
 19 | #include <locale>
 20 | #include <sstream>
 21 | #include <sys/types.h>
 22 | #include <iterator>
 23 | #include <algorithm>
 24 | #include "StdExtension.hpp"
 25 | 
 26 | namespace Limonp
 27 | {
 28 |     using namespace std;
 29 |     inline string string_format(const char* fmt, ...) 
 30 |     {
 31 |         int size = 256;
 32 |         std::string str;
 33 |         va_list ap;
 34 |         while (1) {
 35 |             str.resize(size);
 36 |             va_start(ap, fmt);
 37 |             int n = vsnprintf((char *)str.c_str(), size, fmt, ap);
 38 |             va_end(ap);
 39 |             if (n > -1 && n < size) {
 40 |                 str.resize(n);
 41 |                 return str;
 42 |             }
 43 |             if (n > -1)
 44 |               size = n + 1;
 45 |             else
 46 |               size *= 2;
 47 |         }
 48 |         return str;
 49 |     }
 50 | 
 51 |     template<class T>
 52 |         void join(T begin, T end, string& res, const string& connector)
 53 |         {
 54 |             if(begin == end)
 55 |             {
 56 |                 return;
 57 |             }
 58 |             stringstream ss;
 59 |             ss<<*begin;
 60 |             begin++;
 61 |             while(begin != end)
 62 |             {
 63 |                 ss << connector << *begin;
 64 |                 begin ++;
 65 |             }
 66 |             res = ss.str();
 67 |         }
 68 | 
 69 |     template<class T>
 70 |         string join(T begin, T end, const string& connector)
 71 |         {
 72 |             string res;
 73 |             join(begin ,end, res, connector);
 74 |             return res;
 75 |         }
 76 | 
 77 | 
 78 | 
 79 |     inline bool split(const string& src, vector<string>& res, const string& pattern, size_t offset = 0, size_t len = string::npos)
 80 |     {
 81 |         if(src.empty())
 82 |         {
 83 |             return false;
 84 |         }
 85 |         res.clear();
 86 | 
 87 |         size_t start = 0;
 88 |         size_t end = 0;
 89 |         size_t cnt = 0;
 90 |         while(start < src.size() && res.size() < len)
 91 |         {
 92 |             end = src.find_first_of(pattern, start);
 93 |             if(string::npos == end)
 94 |             {
 95 |                 if(cnt >= offset)
 96 |                 {
 97 |                     res.push_back(src.substr(start));
 98 |                 }
 99 |                 return true;
100 |             }
101 |             //if(end == src.size() - 1)
102 |             //{
103 |             //    res.push_back("");
104 |             //    return true;
105 |             //}
106 |             if(cnt >= offset)
107 |             {
108 |                 res.push_back(src.substr(start, end - start));
109 |             }
110 |             cnt ++;
111 |             start = end + 1;
112 |         }
113 |         return true;
114 |     }
115 | 
116 |     inline string& upper(string& str)
117 |     {
118 |         transform(str.begin(), str.end(), str.begin(), (int (*)(int))toupper);
119 |         return str;
120 |     }
121 | 
122 |     inline string& lower(string& str)
123 |     {
124 |         transform(str.begin(), str.end(), str.begin(), (int (*)(int))tolower);
125 |         return str;
126 |     }
127 | 
128 |     inline std::string &ltrim(std::string &s) 
129 |     {
130 |         s.erase(s.begin(), std::find_if(s.begin(), s.end(), std::not1(std::ptr_fun<int, int>(std::isspace))));
131 |         return s;
132 |     }
133 | 
134 |     inline std::string &rtrim(std::string &s) 
135 |     {
136 |         s.erase(std::find_if(s.rbegin(), s.rend(), std::not1(std::ptr_fun<int, int>(std::isspace))).base(), s.end());
137 |         return s;
138 |     }
139 | 
140 |     inline std::string &trim(std::string &s) 
141 |     {
142 |         return ltrim(rtrim(s));
143 |     }
144 | 
145 |     inline std::string & ltrim(std::string & s, char x)
146 |     {
147 |         s.erase(s.begin(), std::find_if(s.begin(), s.end(), std::not1(std::bind2nd(std::equal_to<char>(), x))));
148 |         return s;
149 |     }
150 | 
151 |     inline std::string & rtrim(std::string & s, char x)
152 |     {
153 |         s.erase(std::find_if(s.rbegin(), s.rend(), std::not1(std::bind2nd(std::equal_to<char>(), x))).base(), s.end());
154 |         return s;
155 |     }
156 | 
157 |     inline std::string &trim(std::string &s, char x)
158 |     {
159 |         return ltrim(rtrim(s, x), x);
160 |     }
161 | 
162 |     inline bool startsWith(const string& str, const string& prefix)
163 |     {
164 |         if(prefix.length() > str.length())
165 |         {
166 |             return false;
167 |         }
168 |         return 0 == str.compare(0, prefix.length(), prefix);
169 |     }
170 | 
171 |     inline bool endsWith(const string& str, const string& suffix)
172 |     {
173 |         if(suffix.length() > str.length())
174 |         {
175 |             return false;
176 |         }
177 |         return 0 == str.compare(str.length() -  suffix.length(), suffix.length(), suffix);
178 |     }
179 | 
180 |     inline bool isInStr(const string& str, char ch)
181 |     {
182 |         return str.find(ch) != string::npos;
183 |     }
184 | 
185 |     inline uint16_t twocharToUint16(char high, char low)
186 |     {
187 |         return (((uint16_t(high) & 0x00ff ) << 8) | (uint16_t(low) & 0x00ff));
188 |     }
189 | 
190 |     template <class Uint16Container>
191 |     bool utf8ToUnicode(const char * const str, size_t len, Uint16Container& vec)
192 |     {
193 |         if(!str)
194 |         {
195 |             return false;
196 |         }
197 |         char ch1, ch2;
198 |         uint16_t tmp;
199 |         vec.clear();
200 |         for(size_t i = 0;i < len;)
201 |         {
202 |             if(!(str[i] & 0x80)) // 0xxxxxxx
203 |             {
204 |                 vec.push_back(str[i]);
205 |                 i++;
206 |             }
207 |             else if ((uint8_t)str[i] <= 0xdf && i + 1 < len) // 110xxxxxx
208 |             {
209 |                 ch1 = (str[i] >> 2) & 0x07;
210 |                 ch2 = (str[i+1] & 0x3f) | ((str[i] & 0x03) << 6 );
211 |                 tmp = (((uint16_t(ch1) & 0x00ff ) << 8) | (uint16_t(ch2) & 0x00ff));
212 |                 vec.push_back(tmp);
213 |                 i += 2;
214 |             }
215 |             else if((uint8_t)str[i] <= 0xef && i + 2 < len)
216 |             {
217 |                 ch1 = ((uint8_t)str[i] << 4) | ((str[i+1] >> 2) & 0x0f );
218 |                 ch2 = (((uint8_t)str[i+1]<<6) & 0xc0) | (str[i+2] & 0x3f); 
219 |                 tmp = (((uint16_t(ch1) & 0x00ff ) << 8) | (uint16_t(ch2) & 0x00ff));
220 |                 vec.push_back(tmp);
221 |                 i += 3;
222 |             }
223 |             else
224 |             {
225 |                 return false;
226 |             }
227 |         }
228 |         return true;
229 |     }
230 |     template <class Uint16Container>
231 |     bool utf8ToUnicode(const string& str, Uint16Container& vec)
232 |     {
233 |         return utf8ToUnicode(str.c_str(), str.size(), vec);
234 |     }
235 | 
236 |     template <class Uint16ContainerConIter>
237 |     bool unicodeToUtf8(Uint16ContainerConIter begin, Uint16ContainerConIter end, string& res)
238 |     {
239 |         if(begin >= end)
240 |         {
241 |             return false;
242 |         }
243 |         res.clear();
244 |         uint16_t ui;
245 |         while(begin != end)
246 |         {
247 |             ui = *begin;
248 |             if(ui <= 0x7f)
249 |             {
250 |                 res += char(ui);
251 |             }
252 |             else if(ui <= 0x7ff)
253 |             {
254 |                 res += char(((ui>>6) & 0x1f) | 0xc0);
255 |                 res += char((ui & 0x3f) | 0x80);
256 |             }
257 |             else
258 |             {
259 |                 res += char(((ui >> 12) & 0x0f )| 0xe0);
260 |                 res += char(((ui>>6) & 0x3f )| 0x80 );
261 |                 res += char((ui & 0x3f) | 0x80);
262 |             }
263 |             begin ++;
264 |         }
265 |         return true;
266 |     }
267 | 
268 |     
269 |     template <class Uint16Container>
270 |     bool gbkTrans(const char* const str, size_t len, Uint16Container& vec)
271 |     {
272 |         vec.clear();
273 |         if(!str)
274 |         {
275 |             return false;
276 |         }
277 |         size_t i = 0;
278 |         while(i < len)
279 |         {
280 |             if(0 == (str[i] & 0x80))
281 |             {
282 |                 vec.push_back(uint16_t(str[i]));
283 |                 i++;
284 |             }
285 |             else
286 |             {
287 |                 if(i + 1 < len) //&& (str[i+1] & 0x80))
288 |                 {
289 |                     uint16_t tmp = (((uint16_t(str[i]) & 0x00ff ) << 8) | (uint16_t(str[i+1]) & 0x00ff));
290 |                     vec.push_back(tmp);
291 |                     i += 2;
292 |                 }
293 |                 else
294 |                 {
295 |                     return false;
296 |                 }
297 |             }
298 |         }
299 |         return true;
300 |     }
301 | 
302 |     template <class Uint16Container>
303 |     bool gbkTrans(const string& str, Uint16Container& vec)
304 |     {
305 |         return gbkTrans(str.c_str(), str.size(), vec);
306 |     }
307 | 
308 |     template <class Uint16ContainerConIter>
309 |     bool gbkTrans(Uint16ContainerConIter begin, Uint16ContainerConIter end, string& res)
310 |     {
311 |         if(begin >= end)
312 |         {
313 |             return false;
314 |         }
315 |         res.clear();
316 |         //pair<char, char> pa;
317 |         char first, second;
318 |         while(begin != end)
319 |         {
320 |             //pa = uint16ToChar2(*begin);
321 |             first = ((*begin)>>8) & 0x00ff;
322 |             second = (*begin) & 0x00ff;
323 |             if(first & 0x80)
324 |             {
325 |                 res += first;
326 |                 res += second;
327 |             }
328 |             else
329 |             {
330 |                 res += second;
331 |             }
332 |             begin++;
333 |         }
334 |         return true;
335 |     }
336 | 
337 |     /*
338 |      * format example: "%Y-%m-%d %H:%M:%S"
339 |      */
340 |     inline void getTime(const string& format, string&  timeStr)
341 |     {
342 |         time_t timeNow;
343 |         time(&timeNow);
344 |         timeStr.resize(64);
345 |         size_t len = strftime((char*)timeStr.c_str(), timeStr.size(), format.c_str(), localtime(&timeNow));
346 |         timeStr.resize(len);
347 |     }
348 | }
349 | #endif
350 | 


--------------------------------------------------------------------------------
/src/CppJieba/Trie.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef CPPJIEBA_TRIE_HPP
  2 | #define CPPJIEBA_TRIE_HPP
  3 | 
  4 | #include "Limonp/StdExtension.hpp"
  5 | #include <vector>
  6 | #include <queue>
  7 | 
  8 | namespace CppJieba
  9 | {
 10 |     using namespace std;
 11 | 
    // One dictionary entry: a word together with its weight and tag.
    struct DictUnit
    {
        // The entry's characters; Trie::find uses word.size() as the length
        // of a match when computing where the match starts.
        Unicode word;
        // Numeric weight of the entry.
        // NOTE(review): presumably a (log-)frequency set by the dictionary loader; confirm there.
        double weight; 
        // Tag string attached to the entry.
        // NOTE(review): presumably a part-of-speech label; confirm against the loader.
        string tag;
    };
 18 | 
 19 |     // for debugging
 20 |     inline ostream & operator << (ostream& os, const DictUnit& unit)
 21 |     {
 22 |         string s;
 23 |         s << unit.word;
 24 |         return os << string_format("%s %s %.3lf", s.c_str(), unit.tag.c_str(), unit.weight);
 25 |     }
 26 | 
    // Candidate words starting at one position: each pair holds the index of
    // the candidate's last character and the matching dictionary entry
    // (NULL when no entry matched).
    typedef LocalVector<std::pair<size_t, const DictUnit*> > DagType;
 28 | 
    // Per-character record filled by Trie::find(begin, end, vector<SegmentChar>&).
    struct SegmentChar 
    {
        // The character at this position.
        uint16_t uniCh;
        // Candidate words that start at this position (see DagType).
        DagType dag;
        // The fields below are only default-initialised in this header.
        // NOTE(review): presumably filled in by the segmenter that consumes
        // the DAG (chosen entry, its score, and where the next word starts);
        // confirm against the caller.
        const DictUnit * pInfo;
        double weight;
        size_t nextPos;
        SegmentChar():uniCh(0), pInfo(NULL), weight(0.0), nextPos(0)
        {}
        ~SegmentChar() 
        {}
    };
 41 | 
    // Key type of a trie edge: one element of a Unicode sequence.
    typedef Unicode::value_type TrieKey;
 43 | 
 44 |     class TrieNode
 45 |     {
 46 |         public:
 47 |             typedef unordered_map<TrieKey,  TrieNode*> NextMap;
 48 |         public:
 49 |             TrieNode * fail;
 50 |             NextMap * next;
 51 |             const DictUnit * ptValue;
 52 |         public:
 53 |             TrieNode(): fail(NULL), next(NULL), ptValue(NULL) 
 54 |             {}
 55 |             const TrieNode * findNext(TrieKey key) const
 56 |             {
 57 |                 if(next == NULL)
 58 |                 {
 59 |                     return NULL;
 60 |                 }
 61 |                 NextMap::const_iterator iter = next->find(key);
 62 |                 if(iter == next->end()) 
 63 |                 {
 64 |                     return NULL;
 65 |                 }
 66 |                 return iter->second;
 67 |             }
 68 |     };
 69 | 
 70 |     class Trie
 71 |     {
 72 |         private:
 73 |             TrieNode* _root;
 74 |         public:
 75 |             Trie(const vector<Unicode>& keys, const vector<const DictUnit*> & valuePointers)
 76 |             {
 77 |                 _root = new TrieNode;
 78 |                 _createTrie(keys, valuePointers);
 79 |                 _build();// build automation
 80 |             }
 81 |             ~Trie()
 82 |             {
 83 |                 if(_root)
 84 |                 {
 85 |                     _deleteNode(_root);
 86 |                 }
 87 |             }
 88 |         public:
 89 |             const DictUnit* find(Unicode::const_iterator begin, Unicode::const_iterator end) const
 90 |             {
 91 |                 TrieNode::NextMap::const_iterator citer;
 92 |                 const TrieNode* ptNode = _root;
 93 |                 for(Unicode::const_iterator it = begin; it != end; it++)
 94 |                 {// build automation
 95 |                     assert(ptNode);
 96 |                     if(NULL == ptNode->next || ptNode->next->end() == (citer = ptNode->next->find(*it)))
 97 |                     {
 98 |                         return NULL;
 99 |                     }
100 |                     ptNode = citer->second;
101 |                 }
102 |                 return ptNode->ptValue;
103 |             }
104 |             // aho-corasick-automation 
105 |             void find(
106 |                         Unicode::const_iterator begin, 
107 |                         Unicode::const_iterator end, 
108 |                         vector<struct SegmentChar>& res
109 |                         ) const
110 |             {
111 |                 res.resize(end - begin);
112 |                 const TrieNode * now = _root;
113 |                 const TrieNode* node;
114 |                 // compiler will complain warnings if only "i < end - begin" .
115 |                 for (size_t i = 0; i < size_t(end - begin); i++) 
116 |                 {
117 |                     Unicode::value_type ch = *(begin + i);
118 |                     res[i].uniCh = ch;
119 |                     assert(res[i].dag.empty());
120 |                     res[i].dag.push_back(pair<vector<Unicode >::size_type, const DictUnit* >(i, NULL));
121 |                     bool flag = false;
122 | 
123 |                     // rollback
124 |                     while( now != _root )
125 |                     {
126 |                         node = now->findNext(ch);
127 |                         if (node != NULL) 
128 |                         {
129 |                             flag = true;
130 |                             break;
131 |                         }
132 |                         else 
133 |                         {
134 |                             now = now->fail;
135 |                         }
136 |                     }
137 | 
138 |                     if(!flag)
139 |                     {
140 |                         node = now->findNext(ch);
141 |                     }
142 |                     if(node == NULL) 
143 |                     {
144 |                         now = _root;
145 |                     } 
146 |                     else 
147 |                     {
148 |                         now = node;
149 |                         const TrieNode * temp = now;
150 |                         while(temp != _root) 
151 |                         {
152 |                             if (temp->ptValue) 
153 |                             {
154 |                                 size_t pos = i - temp->ptValue->word.size() + 1;
155 |                                 res[pos].dag.push_back(pair<vector<Unicode >::size_type, const DictUnit* >(i, temp->ptValue));
156 |                                 if(pos == i) 
157 |                                 {
158 |                                     res[pos].dag[0].second = temp->ptValue;
159 |                                 }
160 |                             }
161 |                             temp = temp->fail;
162 |                             assert(temp);
163 |                         }
164 |                     }
165 |                 }
166 |             }
167 |             bool find(
168 |                         Unicode::const_iterator begin, 
169 |                         Unicode::const_iterator end, 
170 |                         DagType & res,
171 |                         size_t offset = 0) const
172 |             {
173 |                 const TrieNode * ptNode = _root;
174 |                 TrieNode::NextMap::const_iterator citer;
175 |                 for(Unicode::const_iterator itr = begin; itr != end ; itr++)
176 |                 {
177 |                     assert(ptNode);
178 |                     if(NULL == ptNode->next || ptNode->next->end() == (citer = ptNode->next->find(*itr)))
179 |                     {
180 |                         break;
181 |                     }
182 |                     ptNode = citer->second;
183 |                     if(ptNode->ptValue)
184 |                     {
185 |                         if(itr == begin && res.size() == 1) // first singleword
186 |                         {
187 |                             res[0].second = ptNode->ptValue;
188 |                         }
189 |                         else
190 |                         {
191 |                             res.push_back(pair<vector<Unicode >::size_type, const DictUnit* >(itr - begin + offset, ptNode->ptValue));
192 |                         }
193 |                     }
194 |                 }
195 |                 return !res.empty();
196 |             }
197 |         private:
198 |             void _build()
199 |             {
200 |                 queue<TrieNode*> que;
201 |                 assert(_root->ptValue == NULL);
202 |                 assert(_root->next);
203 |                 _root->fail = NULL;
204 |                 for(TrieNode::NextMap::iterator iter = _root->next->begin(); iter != _root->next->end(); iter++) {
205 |                     iter->second->fail = _root;
206 |                     que.push(iter->second);
207 |                 }
208 |                 TrieNode* back = NULL;
209 |                 TrieNode::NextMap::iterator backiter;
210 |                 while(!que.empty()) {
211 |                     TrieNode * now = que.front();
212 |                     que.pop();
213 |                     if(now->next == NULL) {
214 |                         continue;
215 |                     }
216 |                     for(TrieNode::NextMap::iterator iter = now->next->begin(); iter != now->next->end(); iter++) {
217 |                         back = now->fail;
218 |                         while(back != NULL) {
219 |                             if(back->next && (backiter = back->next->find(iter->first)) != back->next->end()) 
220 |                             {
221 |                                 iter->second->fail = backiter->second;
222 |                                 break;
223 |                             }
224 |                             back = back->fail;
225 |                         }
226 |                         if(back == NULL) {
227 |                             iter->second->fail = _root;
228 |                         }
229 |                         que.push(iter->second);
230 |                     }
231 |                 }
232 |             }
233 |         private:
234 |             void _createTrie(const vector<Unicode>& keys, const vector<const DictUnit*> & valuePointers)
235 |             {
236 |                 if(valuePointers.empty() || keys.empty())
237 |                 {
238 |                     return;
239 |                 }
240 |                 assert(keys.size() == valuePointers.size());
241 | 
242 |                 for(size_t i = 0; i < keys.size(); i++)
243 |                 {
244 |                     _insertNode(keys[i], valuePointers[i]);
245 |                 }
246 |             }
247 |         private:
248 |             void _insertNode(const Unicode& key, const DictUnit* ptValue)
249 |             {
250 |                 TrieNode* ptNode  = _root;
251 | 
252 |                 TrieNode::NextMap::const_iterator kmIter;
253 | 
254 |                 for(Unicode::const_iterator citer = key.begin(); citer != key.end(); citer++)
255 |                 {
256 |                     if(NULL == ptNode->next)
257 |                     {
258 |                         ptNode->next = new TrieNode::NextMap;
259 |                     }
260 |                     kmIter = ptNode->next->find(*citer);
261 |                     if(ptNode->next->end() == kmIter)
262 |                     {
263 |                         TrieNode * nextNode = new TrieNode;
264 |                         nextNode->next = NULL;
265 |                         nextNode->ptValue = NULL;
266 | 
267 |                         (*ptNode->next)[*citer] = nextNode;
268 |                         ptNode = nextNode;
269 |                     }
270 |                     else
271 |                     {
272 |                         ptNode = kmIter->second;
273 |                     }
274 |                 }
275 |                 ptNode->ptValue = ptValue;
276 |             }
277 |             void _deleteNode(TrieNode* node)
278 |             {
279 |                 if(!node)
280 |                 {
281 |                     return;
282 |                 }
283 |                 if(node->next)
284 |                 {
285 |                      TrieNode::NextMap::iterator it;
286 |                     for(it = node->next->begin(); it != node->next->end(); it++)
287 |                     {
288 |                         _deleteNode(it->second);
289 |                     }
290 |                     delete node->next;
291 |                 }
292 |                 delete node;
293 |             }
294 |     };
295 | }
296 | 
297 | #endif
298 | 


--------------------------------------------------------------------------------
/src/CppJieba/HMMSegment.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef CPPJIBEA_HMMSEGMENT_H
  2 | #define CPPJIBEA_HMMSEGMENT_H
  3 | 
  4 | #include <iostream>
  5 | #include <fstream>
  6 | #include <memory.h>
  7 | #include <cassert>
  8 | #include "Limonp/StringUtil.hpp"
  9 | #include "Limonp/Logger.hpp"
 10 | #include "TransCode.hpp"
 11 | #include "ISegment.hpp"
 12 | #include "SegmentBase.hpp"
 13 | #include "DictTrie.hpp"
 14 | 
 15 | namespace CppJieba
 16 | {
 17 |     using namespace Limonp;
    // Emission table of one state: maps a 16-bit character code to the value
    // loaded for it from the model file.
    typedef unordered_map<uint16_t, double> EmitProbMap;
 19 |     class HMMSegment: public SegmentBase
 20 |     {
 21 |         public:
 22 |             /*
 23 |              * STATUS:
 24 |              * 0:B, 1:E, 2:M, 3:S
 25 |              * */
 26 |             enum {B = 0, E = 1, M = 2, S = 3, STATUS_SUM = 4};
 27 |         private:
 28 |             char _statMap[STATUS_SUM];
 29 |             double _startProb[STATUS_SUM];
 30 |             double _transProb[STATUS_SUM][STATUS_SUM];
 31 |             EmitProbMap _emitProbB;
 32 |             EmitProbMap _emitProbE;
 33 |             EmitProbMap _emitProbM;
 34 |             EmitProbMap _emitProbS;
 35 |             vector<EmitProbMap* > _emitProbVec;
 36 | 
 37 |         public:
 38 |             HMMSegment(){}
 39 |             explicit HMMSegment(const string& filePath)
 40 |             {
 41 |                 LIMONP_CHECK(init(filePath));
 42 |             }
 43 |             virtual ~HMMSegment(){}
 44 |         public:
 45 |             bool init(const string& filePath)
 46 |             {
 47 |                 memset(_startProb, 0, sizeof(_startProb));
 48 |                 memset(_transProb, 0, sizeof(_transProb));
 49 |                 _statMap[0] = 'B';
 50 |                 _statMap[1] = 'E';
 51 |                 _statMap[2] = 'M';
 52 |                 _statMap[3] = 'S';
 53 |                 _emitProbVec.push_back(&_emitProbB);
 54 |                 _emitProbVec.push_back(&_emitProbE);
 55 |                 _emitProbVec.push_back(&_emitProbM);
 56 |                 _emitProbVec.push_back(&_emitProbS);
 57 |                 LIMONP_CHECK(_loadModel(filePath.c_str()));
 58 |                 LogInfo("HMMSegment init(%s) ok.", filePath.c_str());
 59 |                 return true;
 60 |             }
 61 |         public:
 62 |             using SegmentBase::cut;
 63 |         public:
 64 |             bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res)const 
 65 |             {
 66 |                 Unicode::const_iterator left = begin;
 67 |                 Unicode::const_iterator right = begin;
 68 |                 while(right != end)
 69 |                 {
 70 |                     if(*right < 0x80) 
 71 |                     {
 72 |                         if(left != right && !_cut(left, right, res))
 73 |                         {
 74 |                             return false;
 75 |                         }
 76 |                         left = right;
 77 |                         do {
 78 |                             right = _sequentialLetterRule(left, end);
 79 |                             if(right != left)
 80 |                             {
 81 |                                 break;
 82 |                             }
 83 |                             right = _numbersRule(left, end);
 84 |                             if(right != left)
 85 |                             {
 86 |                                 break;
 87 |                             }
 88 |                             right ++;
 89 |                         } while(false);
 90 |                         res.push_back(Unicode(left, right));
 91 |                         left = right;
 92 |                     }
 93 |                     else
 94 |                     {
 95 |                         right++;
 96 |                     }
 97 |                 }
 98 |                 if(left != right && !_cut(left, right, res))
 99 |                 {
100 |                     return false;
101 |                 }
102 |                 return true;
103 |             }
104 |         private:
105 |             // sequential letters rule
106 |             Unicode::const_iterator _sequentialLetterRule(Unicode::const_iterator begin, Unicode::const_iterator end) const
107 |             {
108 |                 Unicode::value_type x;
109 |                 while(begin != end)
110 |                 {
111 |                     x = *begin;
112 |                     if(('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z'))
113 |                     {
114 |                         begin ++;
115 |                     }
116 |                     else
117 |                     {
118 |                         break;
119 |                     }
120 |                 }
121 |                 return begin;
122 |             }
123 |             // 
124 |             Unicode::const_iterator _numbersRule(Unicode::const_iterator begin, Unicode::const_iterator end) const
125 |             {
126 |                 Unicode::value_type x = *begin;
127 |                 if('0' <= x && x <= '9')
128 |                 {
129 |                     begin ++;
130 |                 }
131 |                 else
132 |                 {
133 |                     return begin;
134 |                 }
135 |                 while(begin != end)
136 |                 {
137 |                     x = *begin;
138 |                     if( ('0' <= x && x <= '9') || x == '.')
139 |                     {
140 |                         begin++;
141 |                     }
142 |                     else
143 |                     {
144 |                         break;
145 |                     }
146 |                 }
147 |                 return begin;
148 |             }
149 |             bool _cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const 
150 |             {
151 |                 vector<size_t> status; 
152 |                 if(!_viterbi(begin, end, status))
153 |                 {
154 |                     LogError("_viterbi failed.");
155 |                     return false;
156 |                 }
157 | 
158 |                 Unicode::const_iterator left = begin;
159 |                 Unicode::const_iterator right;
160 |                 for(size_t i = 0; i < status.size(); i++)
161 |                 {
162 |                     if(status[i] % 2) //if(E == status[i] || S == status[i])
163 |                     {
164 |                         right = begin + i + 1;
165 |                         res.push_back(Unicode(left, right));
166 |                         left = right;
167 |                     }
168 |                 }
169 |                 return true;
170 |             }
171 |         public:
172 |             virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const
173 |             {
174 |                 if(begin == end)
175 |                 {
176 |                     return false;
177 |                 }
178 |                 vector<Unicode> words;
179 |                 words.reserve(end - begin);
180 |                 if(!cut(begin, end, words))
181 |                 {
182 |                     return false;
183 |                 }
184 |                 size_t offset = res.size();
185 |                 res.resize(res.size() + words.size());
186 |                 for(size_t i = 0; i < words.size(); i++)
187 |                 {
188 |                     if(!TransCode::encode(words[i], res[offset + i]))
189 |                     {
190 |                         LogError("encode failed.");
191 |                     }
192 |                 }
193 |                 return true;
194 |             }
195 | 
196 |         private:
197 |             bool _viterbi(Unicode::const_iterator begin, Unicode::const_iterator end, vector<size_t>& status)const
198 |             {
199 |                 if(begin == end)
200 |                 {
201 |                     return false;
202 |                 }
203 | 
204 |                 size_t Y = STATUS_SUM;
205 |                 size_t X = end - begin;
206 | 
207 |                 size_t XYSize = X * Y;
208 |                 size_t now, old, stat;
209 |                 double tmp, endE, endS;
210 | 
211 |                 vector<int> path(XYSize);
212 |                 vector<double> weight(XYSize);
213 | 
214 |                 //start
215 |                 for(size_t y = 0; y < Y; y++)
216 |                 {
217 |                     weight[0 + y * X] = _startProb[y] + _getEmitProb(_emitProbVec[y], *begin, MIN_DOUBLE);
218 |                     path[0 + y * X] = -1;
219 |                 }
220 | 
221 | 
222 |                 double emitProb;
223 | 
224 |                 for(size_t x = 1; x < X; x++)
225 |                 {
226 |                     for(size_t y = 0; y < Y; y++)
227 |                     {
228 |                         now = x + y*X;
229 |                         weight[now] = MIN_DOUBLE;
230 |                         path[now] = E; // warning
231 |                         emitProb = _getEmitProb(_emitProbVec[y], *(begin+x), MIN_DOUBLE);
232 |                         for(size_t preY = 0; preY < Y; preY++)
233 |                         {
234 |                             old = x - 1 + preY * X;
235 |                             tmp = weight[old] + _transProb[preY][y] + emitProb;
236 |                             if(tmp > weight[now])
237 |                             {
238 |                                 weight[now] = tmp;
239 |                                 path[now] = preY;
240 |                             }
241 |                         }
242 |                     }
243 |                 }
244 | 
245 |                 endE = weight[X-1+E*X];
246 |                 endS = weight[X-1+S*X];
247 |                 stat = 0;
248 |                 if(endE >= endS)
249 |                 {
250 |                     stat = E;
251 |                 }
252 |                 else
253 |                 {
254 |                     stat = S;
255 |                 }
256 | 
257 |                 status.resize(X);
258 |                 for(int x = X -1 ; x >= 0; x--)
259 |                 {
260 |                     status[x] = stat;
261 |                     stat = path[x + stat*X];
262 |                 }
263 | 
264 |                 return true;
265 |             }
266 |             bool _loadModel(const char* const filePath)
267 |             {
268 |                 LogDebug("loadModel [%s] start ...", filePath);
269 |                 ifstream ifile(filePath);
270 |                 string line;
271 |                 vector<string> tmp;
272 |                 vector<string> tmp2;
273 |                 //load _startProb
274 |                 if(!_getLine(ifile, line))
275 |                 {
276 |                     return false;
277 |                 }
278 |                 split(line, tmp, " ");
279 |                 if(tmp.size() != STATUS_SUM)
280 |                 {
281 |                     LogError("start_p illegal");
282 |                     return false;
283 |                 }
284 |                 for(size_t j = 0; j< tmp.size(); j++)
285 |                 {
286 |                     _startProb[j] = atof(tmp[j].c_str());
287 |                 }
288 | 
289 |                 //load _transProb
290 |                 for(size_t i = 0; i < STATUS_SUM; i++)
291 |                 {
292 |                     if(!_getLine(ifile, line))
293 |                     {
294 |                         return false;
295 |                     }
296 |                     split(line, tmp, " ");
297 |                     if(tmp.size() != STATUS_SUM)
298 |                     {
299 |                         LogError("trans_p illegal");
300 |                         return false;
301 |                     }
302 |                     for(size_t j =0; j < STATUS_SUM; j++)
303 |                     {
304 |                         _transProb[i][j] = atof(tmp[j].c_str());
305 |                     }
306 |                 }
307 | 
308 |                 //load _emitProbB
309 |                 if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbB))
310 |                 {
311 |                     return false;
312 |                 }
313 | 
314 |                 //load _emitProbE
315 |                 if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbE))
316 |                 {
317 |                     return false;
318 |                 }
319 | 
320 |                 //load _emitProbM
321 |                 if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbM))
322 |                 {
323 |                     return false;
324 |                 }
325 | 
326 |                 //load _emitProbS
327 |                 if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbS))
328 |                 {
329 |                     return false;
330 |                 }
331 | 
332 |                 LogDebug("loadModel [%s] end.", filePath);
333 | 
334 |                 return true;
335 |             }
336 |             bool _getLine(ifstream& ifile, string& line)
337 |             {
338 |                 while(getline(ifile, line))
339 |                 {
340 |                     trim(line);
341 |                     if(line.empty())
342 |                     {
343 |                         continue;
344 |                     }
345 |                     if(startsWith(line, "#"))
346 |                     {
347 |                         continue;
348 |                     }
349 |                     return true;
350 |                 }
351 |                 return false;
352 |             }
353 |             bool _loadEmitProb(const string& line, EmitProbMap& mp)
354 |             {
355 |                 if(line.empty())
356 |                 {
357 |                     return false;
358 |                 }
359 |                 vector<string> tmp, tmp2;
360 |                 Unicode unicode;
361 |                 split(line, tmp, ",");
362 |                 for(size_t i = 0; i < tmp.size(); i++)
363 |                 {
364 |                     split(tmp[i], tmp2, ":");
365 |                     if(2 != tmp2.size())
366 |                     {
367 |                         LogError("_emitProb illegal.");
368 |                         return false;
369 |                     }
370 |                     if(!TransCode::decode(tmp2[0], unicode) || unicode.size() != 1)
371 |                     {
372 |                         LogError("TransCode failed.");
373 |                         return false;
374 |                     }
375 |                     mp[unicode[0]] = atof(tmp2[1].c_str());
376 |                 }
377 |                 return true;
378 |             }
379 |             double _getEmitProb(const EmitProbMap* ptMp, uint16_t key, double defVal)const 
380 |             {
381 |                 EmitProbMap::const_iterator cit = ptMp->find(key);
382 |                 if(cit == ptMp->end())
383 |                 {
384 |                     return defVal;
385 |                 }
386 |                 return cit->second;
387 | 
388 |             }
389 | 
390 | 
391 |     };
392 | }
393 | 
394 | #endif
395 | 


--------------------------------------------------------------------------------
/src/CppJieba/Limonp/Md5.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef __MD5_H__
  2 | #define __MD5_H__
  3 | 
  4 | // Copyright (C) 1991-2, RSA Data Security, Inc. Created 1991. All
  5 | // rights reserved.
  6 | 
  7 | // License to copy and use this software is granted provided that it
  8 | // is identified as the "RSA Data Security, Inc. MD5 Message-Digest
  9 | // Algorithm" in all material mentioning or referencing this software
 10 | // or this function.
 11 | //
 12 | // License is also granted to make and use derivative works provided
 13 | // that such works are identified as "derived from the RSA Data
 14 | // Security, Inc. MD5 Message-Digest Algorithm" in all material
 15 | // mentioning or referencing the derived work.
 16 | //
 17 | // RSA Data Security, Inc. makes no representations concerning either
 18 | // the merchantability of this software or the suitability of this
 19 | // software for any particular purpose. It is provided "as is"
 20 | // without express or implied warranty of any kind.
 21 | //
 22 | // These notices must be retained in any copies of any part of this
 23 | // documentation and/or software.
 24 | 
 25 | 
 26 | 
 27 | // The original md5 implementation avoids external libraries.
 28 | // This version has dependency on stdio.h for file input and
 29 | // string.h for memcpy.
 30 | #include <cstdio>
 31 | #include <cstring>
 32 | #include <iostream>
 33 | 
 34 | namespace Limonp 
 35 | {
 36 | 
//#pragma region MD5 defines
// NOTE: these are preprocessor macros, so although they are written inside
// namespace Limonp they are not scoped by it and are visible to any code that
// follows the inclusion of this header.
// Constants for MD5Transform routine.
// Each Sxy is passed as the rotation amount `s` of the FF/GG/HH/II step
// macros by MD5Transform.
#define S11 7
#define S12 12
#define S13 17
#define S14 22
#define S21 5
#define S22 9
#define S23 14
#define S24 20
#define S31 4
#define S32 11
#define S33 16
#define S34 23
#define S41 6
#define S42 10
#define S43 15
#define S44 21


// F, G, H and I are basic MD5 functions.
// F and G apply ~ to their argument without extra parentheses (~x, ~z); the
// FF/GG/HH/II macros below always pass parenthesized arguments.
#define F(x, y, z) (((x) & (y)) | ((~x) & (z)))
#define G(x, y, z) (((x) & (z)) | ((y) & (~z)))
#define H(x, y, z) ((x) ^ (y) ^ (z))
#define I(x, y, z) ((y) ^ ((x) | (~z)))

// ROTATE_LEFT rotates x left n bits.
// The right shift by (32-(n)) assumes x is a 32-bit unsigned value.
#define ROTATE_LEFT(x, n) (((x) << (n)) | ((x) >> (32-(n))))

// FF, GG, HH, and II transformations for rounds 1, 2, 3, and 4.
// Rotation is separate from addition to prevent recomputation.
// Each step updates `a` in place: it adds the round function of (b, c, d),
// the message word x and the constant ac, rotates the sum left by s bits and
// finally adds b.
#define FF(a, b, c, d, x, s, ac) { \
  (a) += F ((b), (c), (d)) + (x) + (UINT4)(ac); \
  (a) = ROTATE_LEFT ((a), (s)); \
  (a) += (b); \
  }
#define GG(a, b, c, d, x, s, ac) { \
  (a) += G ((b), (c), (d)) + (x) + (UINT4)(ac); \
  (a) = ROTATE_LEFT ((a), (s)); \
  (a) += (b); \
  }
#define HH(a, b, c, d, x, s, ac) { \
  (a) += H ((b), (c), (d)) + (x) + (UINT4)(ac); \
  (a) = ROTATE_LEFT ((a), (s)); \
  (a) += (b); \
  }
#define II(a, b, c, d, x, s, ac) { \
  (a) += I ((b), (c), (d)) + (x) + (UINT4)(ac); \
  (a) = ROTATE_LEFT ((a), (s)); \
  (a) += (b); \
  }
//#pragma endregion
 89 | 
 90 | 
// BYTE is a single unsigned byte
typedef unsigned char BYTE ;

// POINTER defines a generic pointer type
typedef unsigned char *POINTER;

// UINT2 defines a two byte word
// (assumes unsigned short is 16 bits wide on the target platform)
typedef unsigned short int UINT2;

// UINT4 defines a four byte word
// (assumes unsigned int is 32 bits wide on the target platform)
typedef unsigned int UINT4;

// 64-byte padding block: a single 0x80 byte followed by zero bytes.
// Declared static in a header, so every translation unit that includes this
// file gets its own copy.
static unsigned char PADDING[64] = {
  0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
107 | // convenient object that wraps
108 | // the C-functions for use in C++ only
109 | class MD5
110 | {
111 | private:
112 |   struct __context_t {
113 |     UINT4 state[4];                                   /* state (ABCD) */
114 |     UINT4 count[2];        /* number of bits, modulo 2^64 (lsb first) */
115 |     unsigned char buffer[64];                         /* input buffer */
116 |   } context ;
117 | 
118 |   //#pragma region static helper functions
119 |   // The core of the MD5 algorithm is here.
120 |   // MD5 basic transformation. Transforms state based on block.
121 |   static void MD5Transform( UINT4 state[4], unsigned char block[64] )
122 |   {
123 |     UINT4 a = state[0], b = state[1], c = state[2], d = state[3], x[16];
124 | 
125 |     Decode (x, block, 64);
126 | 
127 |     /* Round 1 */
128 |     FF (a, b, c, d, x[ 0], S11, 0xd76aa478); /* 1 */
129 |     FF (d, a, b, c, x[ 1], S12, 0xe8c7b756); /* 2 */
130 |     FF (c, d, a, b, x[ 2], S13, 0x242070db); /* 3 */
131 |     FF (b, c, d, a, x[ 3], S14, 0xc1bdceee); /* 4 */
132 |     FF (a, b, c, d, x[ 4], S11, 0xf57c0faf); /* 5 */
133 |     FF (d, a, b, c, x[ 5], S12, 0x4787c62a); /* 6 */
134 |     FF (c, d, a, b, x[ 6], S13, 0xa8304613); /* 7 */
135 |     FF (b, c, d, a, x[ 7], S14, 0xfd469501); /* 8 */
136 |     FF (a, b, c, d, x[ 8], S11, 0x698098d8); /* 9 */
137 |     FF (d, a, b, c, x[ 9], S12, 0x8b44f7af); /* 10 */
138 |     FF (c, d, a, b, x[10], S13, 0xffff5bb1); /* 11 */
139 |     FF (b, c, d, a, x[11], S14, 0x895cd7be); /* 12 */
140 |     FF (a, b, c, d, x[12], S11, 0x6b901122); /* 13 */
141 |     FF (d, a, b, c, x[13], S12, 0xfd987193); /* 14 */
142 |     FF (c, d, a, b, x[14], S13, 0xa679438e); /* 15 */
143 |     FF (b, c, d, a, x[15], S14, 0x49b40821); /* 16 */
144 | 
145 |     /* Round 2 */
146 |     GG (a, b, c, d, x[ 1], S21, 0xf61e2562); /* 17 */
147 |     GG (d, a, b, c, x[ 6], S22, 0xc040b340); /* 18 */
148 |     GG (c, d, a, b, x[11], S23, 0x265e5a51); /* 19 */
149 |     GG (b, c, d, a, x[ 0], S24, 0xe9b6c7aa); /* 20 */
150 |     GG (a, b, c, d, x[ 5], S21, 0xd62f105d); /* 21 */
151 |     GG (d, a, b, c, x[10], S22,  0x2441453); /* 22 */
152 |     GG (c, d, a, b, x[15], S23, 0xd8a1e681); /* 23 */
153 |     GG (b, c, d, a, x[ 4], S24, 0xe7d3fbc8); /* 24 */
154 |     GG (a, b, c, d, x[ 9], S21, 0x21e1cde6); /* 25 */
155 |     GG (d, a, b, c, x[14], S22, 0xc33707d6); /* 26 */
156 |     GG (c, d, a, b, x[ 3], S23, 0xf4d50d87); /* 27 */
157 |     GG (b, c, d, a, x[ 8], S24, 0x455a14ed); /* 28 */
158 |     GG (a, b, c, d, x[13], S21, 0xa9e3e905); /* 29 */
159 |     GG (d, a, b, c, x[ 2], S22, 0xfcefa3f8); /* 30 */
160 |     GG (c, d, a, b, x[ 7], S23, 0x676f02d9); /* 31 */
161 |     GG (b, c, d, a, x[12], S24, 0x8d2a4c8a); /* 32 */
162 | 
163 |     /* Round 3 */
164 |     HH (a, b, c, d, x[ 5], S31, 0xfffa3942); /* 33 */
165 |     HH (d, a, b, c, x[ 8], S32, 0x8771f681); /* 34 */
166 |     HH (c, d, a, b, x[11], S33, 0x6d9d6122); /* 35 */
167 |     HH (b, c, d, a, x[14], S34, 0xfde5380c); /* 36 */
168 |     HH (a, b, c, d, x[ 1], S31, 0xa4beea44); /* 37 */
169 |     HH (d, a, b, c, x[ 4], S32, 0x4bdecfa9); /* 38 */
170 |     HH (c, d, a, b, x[ 7], S33, 0xf6bb4b60); /* 39 */
171 |     HH (b, c, d, a, x[10], S34, 0xbebfbc70); /* 40 */
172 |     HH (a, b, c, d, x[13], S31, 0x289b7ec6); /* 41 */
173 |     HH (d, a, b, c, x[ 0], S32, 0xeaa127fa); /* 42 */
174 |     HH (c, d, a, b, x[ 3], S33, 0xd4ef3085); /* 43 */
175 |     HH (b, c, d, a, x[ 6], S34,  0x4881d05); /* 44 */
176 |     HH (a, b, c, d, x[ 9], S31, 0xd9d4d039); /* 45 */
177 |     HH (d, a, b, c, x[12], S32, 0xe6db99e5); /* 46 */
178 |     HH (c, d, a, b, x[15], S33, 0x1fa27cf8); /* 47 */
179 |     HH (b, c, d, a, x[ 2], S34, 0xc4ac5665); /* 48 */
180 | 
181 |     /* Round 4 */
182 |     II (a, b, c, d, x[ 0], S41, 0xf4292244); /* 49 */
183 |     II (d, a, b, c, x[ 7], S42, 0x432aff97); /* 50 */
184 |     II (c, d, a, b, x[14], S43, 0xab9423a7); /* 51 */
185 |     II (b, c, d, a, x[ 5], S44, 0xfc93a039); /* 52 */
186 |     II (a, b, c, d, x[12], S41, 0x655b59c3); /* 53 */
187 |     II (d, a, b, c, x[ 3], S42, 0x8f0ccc92); /* 54 */
188 |     II (c, d, a, b, x[10], S43, 0xffeff47d); /* 55 */
189 |     II (b, c, d, a, x[ 1], S44, 0x85845dd1); /* 56 */
190 |     II (a, b, c, d, x[ 8], S41, 0x6fa87e4f); /* 57 */
191 |     II (d, a, b, c, x[15], S42, 0xfe2ce6e0); /* 58 */
192 |     II (c, d, a, b, x[ 6], S43, 0xa3014314); /* 59 */
193 |     II (b, c, d, a, x[13], S44, 0x4e0811a1); /* 60 */
194 |     II (a, b, c, d, x[ 4], S41, 0xf7537e82); /* 61 */
195 |     II (d, a, b, c, x[11], S42, 0xbd3af235); /* 62 */
196 |     II (c, d, a, b, x[ 2], S43, 0x2ad7d2bb); /* 63 */
197 |     II (b, c, d, a, x[ 9], S44, 0xeb86d391); /* 64 */
198 | 
199 |     state[0] += a;
200 |     state[1] += b;
201 |     state[2] += c;
202 |     state[3] += d;
203 | 
204 |     // Zeroize sensitive information.
205 |     memset((POINTER)x, 0, sizeof (x));
206 |   }
207 | 
208 |   // Encodes input (UINT4) into output (unsigned char). Assumes len is
209 |   // a multiple of 4.
210 |   static void Encode( unsigned char *output, UINT4 *input, unsigned int len )
211 |   {
212 |     unsigned int i, j;
213 | 
214 |     for (i = 0, j = 0; j < len; i++, j += 4) {
215 |       output[j] = (unsigned char)(input[i] & 0xff);
216 |       output[j+1] = (unsigned char)((input[i] >> 8) & 0xff);
217 |       output[j+2] = (unsigned char)((input[i] >> 16) & 0xff);
218 |       output[j+3] = (unsigned char)((input[i] >> 24) & 0xff);
219 |     }
220 |   }
221 | 
222 |   // Decodes input (unsigned char) into output (UINT4). Assumes len is
223 |   // a multiple of 4.
224 |   static void Decode( UINT4 *output, unsigned char *input, unsigned int len )
225 |   {
226 |     unsigned int i, j;
227 | 
228 |     for (i = 0, j = 0; j < len; i++, j += 4)
229 |       output[i] = ((UINT4)input[j]) | (((UINT4)input[j+1]) << 8) |
230 |       (((UINT4)input[j+2]) << 16) | (((UINT4)input[j+3]) << 24);
231 |   }
232 |   //#pragma endregion
233 | 
234 | 
235 | public:
236 |   // MAIN FUNCTIONS
237 |   MD5()
238 |   {
239 |     Init() ;
240 |   }
241 | 
242 |   // MD5 initialization. Begins an MD5 operation, writing a new context.
243 |   void Init()
244 |   {
245 |     context.count[0] = context.count[1] = 0;
246 |   
247 |     // Load magic initialization constants.
248 |     context.state[0] = 0x67452301;
249 |     context.state[1] = 0xefcdab89;
250 |     context.state[2] = 0x98badcfe;
251 |     context.state[3] = 0x10325476;
252 |   }
253 | 
254 |   // MD5 block update operation. Continues an MD5 message-digest
255 |   // operation, processing another message block, and updating the
256 |   // context.
257 |   void Update(
258 |     unsigned char *input,   // input block
259 |     unsigned int inputLen ) // length of input block
260 |   {
261 |     unsigned int i, index, partLen;
262 | 
263 |     // Compute number of bytes mod 64
264 |     index = (unsigned int)((context.count[0] >> 3) & 0x3F);
265 | 
266 |     // Update number of bits
267 |     if ((context.count[0] += ((UINT4)inputLen << 3))
268 |       < ((UINT4)inputLen << 3))
269 |       context.count[1]++;
270 |     context.count[1] += ((UINT4)inputLen >> 29);
271 | 
272 |     partLen = 64 - index;
273 | 
274 |     // Transform as many times as possible.
275 |     if (inputLen >= partLen) {
276 |       memcpy((POINTER)&context.buffer[index], (POINTER)input, partLen);
277 |       MD5Transform (context.state, context.buffer);
278 | 
279 |       for (i = partLen; i + 63 < inputLen; i += 64)
280 |         MD5Transform (context.state, &input[i]);
281 | 
282 |       index = 0;
283 |     }
284 |     else
285 |       i = 0;
286 | 
287 |     /* Buffer remaining input */
288 |     memcpy((POINTER)&context.buffer[index], (POINTER)&input[i], inputLen-i);
289 |   }
290 | 
291 |   // MD5 finalization. Ends an MD5 message-digest operation, writing the
292 |   // the message digest and zeroizing the context.
293 |   // Writes to digestRaw
294 |   void Final()
295 |   {
296 |     unsigned char bits[8];
297 |     unsigned int index, padLen;
298 | 
299 |     // Save number of bits
300 |     Encode( bits, context.count, 8 );
301 | 
302 |     // Pad out to 56 mod 64.
303 |     index = (unsigned int)((context.count[0] >> 3) & 0x3f);
304 |     padLen = (index < 56) ? (56 - index) : (120 - index);
305 |     Update( PADDING, padLen );
306 | 
307 |     // Append length (before padding)
308 |     Update( bits, 8 );
309 | 
310 |     // Store state in digest
311 |     Encode( digestRaw, context.state, 16);
312 | 
313 |     // Zeroize sensitive information.
314 |     memset((POINTER)&context, 0, sizeof (context));
315 | 
316 |     writeToString() ;
317 |   }
318 | 
319 |   /// Buffer must be 32+1 (nul) = 33 chars long at least 
320 |   void writeToString()
321 |   {
322 |     int pos ;
323 | 
324 |     for( pos = 0 ; pos < 16 ; pos++ )
325 |       sprintf( digestChars+(pos*2), "%02x", digestRaw[pos] ) ;
326 |   }
327 | 
328 | 
329 | public:
330 |   // an MD5 digest is a 16-byte number (32 hex digits)
331 |   BYTE digestRaw[ 16 ] ;
332 | 
333 |   // This version of the digest is actually
334 |   // a "printf'd" version of the digest.
335 |   char digestChars[ 33 ] ;
336 | 
337 |   /// Load a file from disk and digest it
338 |   // Digests a file and returns the result.
339 |   const char* digestFile( const char *filename )
340 |   {
341 |     if (NULL == filename || strcmp(filename, "") == 0)
342 |         return NULL;
343 | 
344 |     Init() ;
345 | 
346 |     FILE *file;
347 |     
348 |     unsigned char buffer[1024] ;
349 | 
350 |     if((file = fopen (filename, "rb")) == NULL)
351 |     {
352 |       return NULL;
353 |     }
354 |     int len;
355 |     while( (len = fread( buffer, 1, 1024, file )) )
356 |       Update( buffer, len ) ;
357 |     Final();
358 | 
359 |     fclose( file );
360 | 
361 |     return digestChars ;
362 |   }
363 | 
364 |   /// Digests a byte-array already in memory
365 |   const char* digestMemory( BYTE *memchunk, int len )
366 |   {
367 |     if (NULL == memchunk)
368 |         return NULL;
369 | 
370 |     Init() ;
371 |     Update( memchunk, len ) ;
372 |     Final() ;
373 |     
374 |     return digestChars ;
375 |   }
376 | 
377 |   // Digests a string and prints the result.
378 |   const char* digestString(const char *string )
379 |   {
380 |     if (string == NULL)
381 |         return NULL;
382 | 
383 |     Init() ;
384 |     Update( (unsigned char*)string, strlen(string) ) ;
385 |     Final() ;
386 | 
387 |     return digestChars ;
388 |   }
389 | };
390 | 
391 | inline bool md5String(const char* str, std::string& res)
392 | {
393 |     if (NULL == str)
394 |     {
395 |         res = "";
396 |         return false;
397 |     }
398 | 
399 |     MD5 md5;
400 |     const char *pRes = md5.digestString(str);
401 |     if (NULL == pRes)
402 |     {
403 |         res = "";
404 |         return false;
405 |     }
406 | 
407 |     res = pRes;
408 |     return true;
409 | }
410 | 
411 | inline bool md5File(const char* filepath, std::string& res)
412 | {
413 |     if (NULL == filepath || strcmp(filepath, "") == 0)
414 |     {
415 |         res = "";
416 |         return false;
417 |     }
418 | 
419 |     MD5 md5;
420 |     const char *pRes = md5.digestFile(filepath);
421 | 
422 |     if (NULL == pRes)
423 |     {
424 |         res = "";
425 |         return false;
426 |     }
427 | 
428 |     res = pRes;
429 |     return true;
430 | }
431 | }
432 | #endif
433 | 


--------------------------------------------------------------------------------