├── index.js ├── .travis.yml ├── .gitignore ├── .npmignore ├── binding.gyp ├── test ├── segment.js ├── pos_tagger.js └── query_segment.js ├── src ├── CppJieba │ ├── ISegment.hpp │ ├── Limonp │ │ ├── InitOnOff.hpp │ │ ├── NonCopyable.hpp │ │ ├── HandyMacro.hpp │ │ ├── Condition.hpp │ │ ├── Thread.hpp │ │ ├── MutexLock.hpp │ │ ├── BoundedQueue.hpp │ │ ├── ArgvContext.hpp │ │ ├── Logger.hpp │ │ ├── CastFloat.hpp │ │ ├── ThreadPool.hpp │ │ ├── Config.hpp │ │ ├── StdExtension.hpp │ │ ├── BlockingQueue.hpp │ │ ├── MysqlClient.hpp │ │ ├── LocalVector.hpp │ │ ├── StringUtil.hpp │ │ └── Md5.hpp │ ├── TransCode.hpp │ ├── SegmentBase.hpp │ ├── PosTagger.hpp │ ├── QuerySegment.hpp │ ├── MixSegment.hpp │ ├── MPSegment.hpp │ ├── FullSegment.hpp │ ├── KeywordExtractor.hpp │ ├── DictTrie.hpp │ ├── Trie.hpp │ └── HMMSegment.hpp ├── utils.h ├── mix_segment.h ├── pos_tagger.h ├── query_segment.h ├── mix_segment.cpp ├── pos_tagger.cpp ├── segment.cpp └── query_segment.cpp ├── package.json ├── ChangeLog.md ├── LICENSE └── README.md /index.js: -------------------------------------------------------------------------------- 1 | var segment = require("./build/Release/segment"); 2 | module.exports = segment; 3 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: node_js 2 | node_js: 3 | - "0.10" 4 | notifications: 5 | recipients: 6 | - wuyanyi09@foxmail.com 7 | email: 8 | on_success: change 9 | on_failure: always 10 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | tags 2 | build 3 | *.demo 4 | *swp 5 | *.out 6 | *.o 7 | *.d 8 | *.ut 9 | log 10 | main 11 | lib*.a 12 | *_demo 13 | segdict* 14 | tmp 15 | t.* 16 | *.pid 17 | node_modules 18 | npm-debug.log 19 | 
-------------------------------------------------------------------------------- /.npmignore: -------------------------------------------------------------------------------- 1 | *.swp 2 | .*.swp 3 | npm-debug.log 4 | node_modules 5 | 6 | # don't need these in the npm package. 7 | html/*.png 8 | 9 | # don't ignore .npmignore files 10 | # these are used in some tests. 11 | !.npmignore 12 | 13 | *.pyc 14 | -------------------------------------------------------------------------------- /binding.gyp: -------------------------------------------------------------------------------- 1 | { 2 | "targets": [ 3 | { 4 | "target_name": "segment", 5 | "sources": [ "./src/segment.cpp", "./src/mix_segment.cpp", "./src/query_segment.cpp", "./src/pos_tagger.cpp" ], 6 | "cflags": [ 7 | "-DLOGGER_LEVEL=LL_WARN" 8 | ], 9 | "include_dirs" : [ 10 | " " + tl[i]); 6 | } 7 | }); 8 | var tl = segment.cutSync("阻塞的南京市长江大桥"); 9 | for(var i = 0; i < tl.length; i++) { 10 | console.log(i + " == " + tl[i]); 11 | } 12 | 13 | -------------------------------------------------------------------------------- /test/pos_tagger.js: -------------------------------------------------------------------------------- 1 | var segment = require("../index.js"); 2 | segment.taggerLoadDict("./dict/jieba.dict.utf8", "./dict/hmm_model.utf8"); 3 | segment.tag("非阻塞的南京市长江大桥", function(tl){ 4 | for(var i = 0; i < tl.length; i++) { 5 | console.log(i + " => " + tl[i]); 6 | } 7 | }); 8 | var tl = segment.tagSync("阻塞的南京市长江大桥"); 9 | for(var i = 0; i < tl.length; i++) { 10 | console.log(i + " == " + tl[i]); 11 | } 12 | 13 | -------------------------------------------------------------------------------- /src/CppJieba/ISegment.hpp: -------------------------------------------------------------------------------- 1 | #ifndef CPPJIEBA_SEGMENTINTERFACE_H 2 | #define CPPJIEBA_SEGMENTINTERFACE_H 3 | 4 | 5 | namespace CppJieba 6 | { 7 | class ISegment 8 | { 9 | public: 10 | virtual ~ISegment(){}; 11 | public: 12 | virtual bool 
cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector& res) const = 0; 13 | virtual bool cut(const string& str, vector& res) const = 0; 14 | }; 15 | } 16 | 17 | #endif 18 | -------------------------------------------------------------------------------- /test/query_segment.js: -------------------------------------------------------------------------------- 1 | var segment = require("../index.js"); 2 | // 第三个参数是分词的粒度阈值,当词长大于3时,会进行细粒度的再切割,不填时默认阈值是4。 3 | segment.queryLoadDict("./dict/jieba.dict.utf8", "./dict/hmm_model.utf8", 3); 4 | console.log("非阻塞的:"); 5 | segment.queryCut("小明硕士毕业于中国科学院计算所,后在日本京都大学深造", function(tl){ 6 | for(var i = 0; i < tl.length; i++) { 7 | console.log(i + " => " + tl[i]); 8 | } 9 | }); 10 | console.log("阻塞的:"); 11 | var tl = segment.queryCutSync("小明硕士毕业于中国科学院计算所,后在日本京都大学深造"); 12 | for(var i = 0; i < tl.length; i++) { 13 | console.log(i + " == " + tl[i]); 14 | } 15 | 16 | -------------------------------------------------------------------------------- /src/CppJieba/Limonp/InitOnOff.hpp: -------------------------------------------------------------------------------- 1 | #ifndef LIMONP_INITONOFF_H 2 | #define LIMONP_INITONOFF_H 3 | 4 | namespace Limonp 5 | { 6 | class InitOnOff 7 | { 8 | public: 9 | InitOnOff():isInited_(false){}; 10 | ~InitOnOff(){}; 11 | protected: 12 | bool isInited_; 13 | bool getInitFlag_()const{return isInited_;}; 14 | bool setInitFlag_(bool flag){return isInited_ = flag;}; 15 | public: 16 | operator bool() const {return getInitFlag_();}; 17 | 18 | }; 19 | } 20 | 21 | #endif 22 | -------------------------------------------------------------------------------- /src/CppJieba/Limonp/NonCopyable.hpp: -------------------------------------------------------------------------------- 1 | /************************************ 2 | ************************************/ 3 | #ifndef LIMONP_NONCOPYABLE_H 4 | #define LIMONP_NONCOPYABLE_H 5 | 6 | #include 7 | #include 8 | 9 | namespace Limonp 10 | { 11 | class NonCopyable 
12 | { 13 | protected: 14 | NonCopyable(){}; 15 | ~NonCopyable(){}; 16 | private: 17 | NonCopyable(const NonCopyable& ); 18 | const NonCopyable& operator=(const NonCopyable& ); 19 | }; 20 | } 21 | 22 | #endif 23 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "nodejieba", 3 | "description": "chinese segment for node", 4 | "version": "0.2.5", 5 | "author": "Yanyi Wu ", 6 | "maintainers": [ 7 | "aszxqw " 8 | ], 9 | "main": "./index.js", 10 | "engines": { 11 | "node": "0.10.x" 12 | }, 13 | "repository": { 14 | "type": "git", 15 | "url": "http://github.com/aszxqw/nodejieba.git" 16 | }, 17 | "keywords": [ 18 | "chinese", 19 | "segment", 20 | "cppjieba", 21 | "jieba" 22 | ], 23 | "dependencies": { 24 | "nan": "~1.2.0" 25 | }, 26 | "devDependencies": {}, 27 | "scripts": { 28 | "test": "node test/segment.js && node test/query_segment.js && node test/pos_tagger.js" 29 | }, 30 | "license": "MIT" 31 | } 32 | -------------------------------------------------------------------------------- /ChangeLog.md: -------------------------------------------------------------------------------- 1 | ## v0.2.5 2 | 3 | * 增加词性标注功能 4 | 5 | ## v0.2.4 6 | 7 | * 更新 package 兼容更低版本的 npm 8 | 9 | ## v0.2.3 10 | 11 | * 更新 cppjieba ,减少内存使用。 12 | 13 | ## v0.2.2 14 | 15 | * 在queryLoadDict 函数中增加query模式的粒度阈值作为可选参数。 16 | 17 | ## v0.2.1 18 | 19 | * 增加搜索引擎分词模式,分别对应的调用函数是 `queryLoadDict, queryCutSync, queryCut`。 20 | 21 | ## v0.2.0 22 | 23 | * 将原来的 cut 阻塞分词模式改为非阻塞模式 24 | * 阻塞分词模型的函数名为 cutSync 25 | 26 | ## v0.1.4 27 | 28 | * 修复关于较低版本编译器需要使用`tr1/unordered_map`导致和`node-gyp`编译选项`-fno-rtti`冲突的编译错误问题。 29 | 30 | ## v0.1.3 31 | 32 | * 更新[CppJieba],支持更低版本的g++。 33 | 34 | ## v0.1.2 35 | 36 | * 更新[CppJieba],使用`less_memory`这个branch来减少Trie树内存的开销。 37 | 38 | ## v0.1.1 39 | 40 | * 依照node的c++扩展的常规写法,对CppJieba进行简单的包装,并已`npm publish` 41 | 42 | 
[CppJieba]:http://github.com/aszxqw/cppjieba.git 43 | -------------------------------------------------------------------------------- /src/utils.h: -------------------------------------------------------------------------------- 1 | #ifndef NODEJIEBA_SRC_UTLS_H 2 | #define NODEJIEBA_SRC_UTLS_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | using namespace std; 13 | using namespace v8; 14 | 15 | inline void WrapVector(vector &ov, Local &array) { 16 | array = Array::New(ov.size()); 17 | for(size_t i = 0; i < ov.size(); i++) { 18 | array->Set(i, String::New(ov[i].c_str())); 19 | } 20 | } 21 | 22 | inline void WrapPairVector(vector > &ov, Local &array) { 23 | array = Array::New(ov.size()); 24 | for(size_t i = 0; i < ov.size(); i++) { 25 | array->Set(i, String::New((ov[i].first + ":" + ov[i].second).c_str())); 26 | } 27 | } 28 | 29 | inline string ValueToString(Local val) { 30 | String::Utf8Value su(val); 31 | return string(*su); 32 | } 33 | 34 | #endif 35 | -------------------------------------------------------------------------------- /src/mix_segment.h: -------------------------------------------------------------------------------- 1 | #ifndef NODEJIEAB_SRC_MIX_SEGMENT_H 2 | #define NODEJIEAB_SRC_MIX_SEGMENT_H 3 | #include "utils.h" 4 | #include "CppJieba/MixSegment.hpp" 5 | 6 | extern CppJieba::MixSegment segment; 7 | 8 | extern NAN_METHOD(loadDict); 9 | extern NAN_METHOD(cutSync); 10 | extern NAN_METHOD(cut); 11 | 12 | class CutWorker : public NanAsyncWorker { 13 | public: 14 | CutWorker(NanCallback *callback, string inputStr) 15 | : NanAsyncWorker(callback), inputStr(inputStr) {} 16 | 17 | ~CutWorker() {} 18 | 19 | 20 | void Execute () { 21 | segment.cut(inputStr, outputWords); 22 | } 23 | 24 | void HandleOKCallback () { 25 | NanScope(); 26 | Local args[1]; 27 | Local wordList; 28 | WrapVector(outputWords, wordList); 29 | args[0] = wordList; 30 | callback->Call(1, args); 31 | } 32 | 33 | private: 34 | 
string inputStr; 35 | vector outputWords; 36 | }; 37 | 38 | #endif 39 | -------------------------------------------------------------------------------- /src/pos_tagger.h: -------------------------------------------------------------------------------- 1 | #ifndef NODEJIEAB_SRC_POSTAGGER_H 2 | #define NODEJIEAB_SRC_POSTAGGER_H 3 | #include "utils.h" 4 | #include "CppJieba/PosTagger.hpp" 5 | 6 | extern CppJieba::PosTagger tagger; 7 | 8 | extern NAN_METHOD(taggerLoadDict); 9 | extern NAN_METHOD(tagSync); 10 | extern NAN_METHOD(tag); 11 | 12 | class TaggerWorker : public NanAsyncWorker { 13 | public: 14 | TaggerWorker(NanCallback *callback, string inputStr) 15 | : NanAsyncWorker(callback), inputStr(inputStr) {} 16 | 17 | ~TaggerWorker() {} 18 | 19 | 20 | void Execute () { 21 | tagger.tag(inputStr, outputWords); 22 | } 23 | 24 | void HandleOKCallback () { 25 | NanScope(); 26 | Local args[1]; 27 | Local wordList; 28 | WrapPairVector(outputWords, wordList); 29 | args[0] = wordList; 30 | callback->Call(1, args); 31 | } 32 | 33 | private: 34 | string inputStr; 35 | vector > outputWords; 36 | }; 37 | 38 | #endif 39 | -------------------------------------------------------------------------------- /src/query_segment.h: -------------------------------------------------------------------------------- 1 | #ifndef NODEJIEAB_SRC_QUERY_SEGMENT_H 2 | #define NODEJIEAB_SRC_QUERY_SEGMENT_H 3 | 4 | #include "utils.h" 5 | #include "CppJieba/QuerySegment.hpp" 6 | 7 | extern CppJieba::QuerySegment querySegment; 8 | 9 | extern NAN_METHOD(queryLoadDict); 10 | extern NAN_METHOD(queryCutSync); 11 | extern NAN_METHOD(queryCut); 12 | 13 | class QueryCutWorker : public NanAsyncWorker { 14 | public: 15 | QueryCutWorker(NanCallback *callback, string inputStr) 16 | : NanAsyncWorker(callback), inputStr(inputStr) {} 17 | 18 | ~QueryCutWorker() {} 19 | 20 | 21 | void Execute () { 22 | querySegment.cut(inputStr, outputWords); 23 | } 24 | 25 | void HandleOKCallback () { 26 | NanScope(); 27 | Local 
args[1]; 28 | Local wordList; 29 | WrapVector(outputWords, wordList); 30 | args[0] = wordList; 31 | callback->Call(1, args); 32 | } 33 | 34 | private: 35 | string inputStr; 36 | vector outputWords; 37 | }; 38 | 39 | #endif 40 | -------------------------------------------------------------------------------- /src/mix_segment.cpp: -------------------------------------------------------------------------------- 1 | #include "mix_segment.h" 2 | 3 | CppJieba::MixSegment segment; 4 | 5 | NAN_METHOD (cutSync) { 6 | NanScope(); 7 | 8 | String::Utf8Value param1(args[0]->ToString()); 9 | vector words; 10 | 11 | segment.cut(*param1, words); 12 | 13 | Local outArray; 14 | WrapVector(words, outArray); 15 | 16 | NanReturnValue(outArray); 17 | } 18 | NAN_METHOD (loadDict) { 19 | NanScope(); 20 | String::Utf8Value param0(args[0]->ToString()); 21 | String::Utf8Value param1(args[1]->ToString()); 22 | NanReturnValue (Boolean::New(segment.init(*param0, *param1))); 23 | } 24 | 25 | NAN_METHOD (cut) { 26 | NanScope(); 27 | if (args.Length() == 2){ 28 | string inputStr = ValueToString(args[0]); 29 | Local callback = args[1].As(); 30 | 31 | NanCallback* nanCallback = new NanCallback(callback); 32 | CutWorker* worker = new CutWorker(nanCallback, inputStr); 33 | NanAsyncQueueWorker(worker); 34 | } 35 | else { 36 | NanThrowTypeError("argc must equals to 2"); 37 | } 38 | NanReturnUndefined(); 39 | } 40 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2013 Yanyi Wu 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | this software and associated documentation files (the "Software"), to deal in 7 | the Software without restriction, including without limitation the rights to 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to 
permit persons to whom the Software is furnished to do so, 10 | subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 17 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 18 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 19 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 20 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 21 | -------------------------------------------------------------------------------- /src/CppJieba/Limonp/HandyMacro.hpp: -------------------------------------------------------------------------------- 1 | #ifndef LIMONP_HANDY_MACRO_HPP 2 | #define LIMONP_HANDY_MACRO_HPP 3 | 4 | #include 5 | #include 6 | 7 | #define LIMONP_CHECK(exp) \ 8 | if(!(exp)){fprintf(stderr, "File:%s, Line:%d Exp:[" #exp "] is true, abort.\n", __FILE__, __LINE__); abort();} 9 | 10 | #define print(x) cout<< #x": " << x <ToString()); 9 | vector > words; 10 | 11 | tagger.tag(*param1, words); 12 | 13 | Local outArray; 14 | WrapPairVector(words, outArray); 15 | 16 | NanReturnValue(outArray); 17 | } 18 | NAN_METHOD (taggerLoadDict) { 19 | NanScope(); 20 | String::Utf8Value param0(args[0]->ToString()); 21 | String::Utf8Value param1(args[1]->ToString()); 22 | tagger.init(*param0, *param1); 23 | NanReturnValue (Boolean::New(true)); 24 | } 25 | 26 | NAN_METHOD (tag) { 27 | NanScope(); 28 | if (args.Length() == 2){ 29 | string inputStr = ValueToString(args[0]); 30 | Local callback = args[1].As(); 31 | 32 | NanCallback* nanCallback = new NanCallback(callback); 33 | TaggerWorker* worker = new TaggerWorker(nanCallback, inputStr); 34 | NanAsyncQueueWorker(worker); 35 | } 36 | 
else { 37 | NanThrowTypeError("argc must equals to 2"); 38 | } 39 | NanReturnUndefined(); 40 | } 41 | -------------------------------------------------------------------------------- /src/segment.cpp: -------------------------------------------------------------------------------- 1 | #include "mix_segment.h" 2 | #include "query_segment.h" 3 | #include "pos_tagger.h" 4 | 5 | void init(Handle exports) { 6 | exports->Set(NanNew("loadDict"), 7 | NanNew(loadDict)->GetFunction()); 8 | exports->Set(NanNew("cutSync"), 9 | NanNew(cutSync)->GetFunction()); 10 | exports->Set(NanNew("cut"), 11 | NanNew(cut)->GetFunction()); 12 | 13 | exports->Set(NanNew("queryLoadDict"), 14 | NanNew(queryLoadDict)->GetFunction()); 15 | exports->Set(NanNew("queryCutSync"), 16 | NanNew(queryCutSync)->GetFunction()); 17 | exports->Set(NanNew("queryCut"), 18 | NanNew(queryCut)->GetFunction()); 19 | 20 | exports->Set(NanNew("taggerLoadDict"), 21 | NanNew(taggerLoadDict)->GetFunction()); 22 | exports->Set(NanNew("tagSync"), 23 | NanNew(tagSync)->GetFunction()); 24 | exports->Set(NanNew("tag"), 25 | NanNew(tag)->GetFunction()); 26 | } 27 | 28 | NODE_MODULE(segment, init) 29 | -------------------------------------------------------------------------------- /src/CppJieba/Limonp/Condition.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * https://github.com/chenshuo/muduo/blob/master/muduo/base/Condition.h 3 | */ 4 | 5 | #ifndef LIMONP_CONDITION_HPP 6 | #define LIMONP_CONDITION_HPP 7 | 8 | #include "MutexLock.hpp" 9 | 10 | namespace Limonp 11 | { 12 | class Condition : NonCopyable 13 | { 14 | public: 15 | explicit Condition(MutexLock& mutex) 16 | : mutex_(mutex) 17 | { 18 | LIMONP_CHECK(!pthread_cond_init(&pcond_, NULL)); 19 | } 20 | 21 | ~Condition() 22 | { 23 | LIMONP_CHECK(!pthread_cond_destroy(&pcond_)); 24 | } 25 | 26 | void wait() 27 | { 28 | LIMONP_CHECK(!pthread_cond_wait(&pcond_, mutex_.getPthreadMutex())); 29 | } 30 | 31 | void notify() 32 | { 
33 | LIMONP_CHECK(!pthread_cond_signal(&pcond_)); 34 | } 35 | 36 | void notifyAll() 37 | { 38 | LIMONP_CHECK(!pthread_cond_broadcast(&pcond_)); 39 | } 40 | 41 | private: 42 | MutexLock& mutex_; 43 | pthread_cond_t pcond_; 44 | }; 45 | 46 | } 47 | 48 | #endif 49 | -------------------------------------------------------------------------------- /src/query_segment.cpp: -------------------------------------------------------------------------------- 1 | #include "query_segment.h" 2 | 3 | CppJieba::QuerySegment querySegment; 4 | 5 | NAN_METHOD (queryLoadDict) { 6 | NanScope(); 7 | String::Utf8Value param0(args[0]->ToString()); 8 | String::Utf8Value param1(args[1]->ToString()); 9 | int param2 = args.Length() >= 3 ? args[2]->Int32Value() : 4; 10 | NanReturnValue (Boolean::New(querySegment.init(*param0, *param1, param2))); 11 | } 12 | 13 | NAN_METHOD (queryCutSync) { 14 | NanScope(); 15 | 16 | String::Utf8Value param1(args[0]->ToString()); 17 | vector words; 18 | 19 | querySegment.cut(*param1, words); 20 | 21 | Local outArray; 22 | WrapVector(words, outArray); 23 | 24 | NanReturnValue(outArray); 25 | } 26 | 27 | NAN_METHOD (queryCut) { 28 | NanScope(); 29 | if (args.Length() == 2){ 30 | string inputStr = ValueToString(args[0]); 31 | Local callback = args[1].As(); 32 | 33 | NanCallback* nanCallback = new NanCallback(callback); 34 | QueryCutWorker* worker = new QueryCutWorker(nanCallback, inputStr); 35 | NanAsyncQueueWorker(worker); 36 | } 37 | else { 38 | NanThrowTypeError("argc must equals to 2"); 39 | } 40 | NanReturnUndefined(); 41 | } 42 | -------------------------------------------------------------------------------- /src/CppJieba/Limonp/Thread.hpp: -------------------------------------------------------------------------------- 1 | #ifndef LIMONP_THREAD_HPP 2 | #define LIMONP_THREAD_HPP 3 | 4 | #include "HandyMacro.hpp" 5 | #include "NonCopyable.hpp" 6 | 7 | namespace Limonp 8 | { 9 | class IThread: NonCopyable 10 | { 11 | private: 12 | pthread_t thread_; 13 | bool 
isStarted; 14 | bool isJoined; 15 | public: 16 | IThread(): isStarted(false), isJoined(false) 17 | { 18 | } 19 | virtual ~IThread() 20 | { 21 | if(isStarted && !isJoined) 22 | { 23 | LIMONP_CHECK(!pthread_detach(thread_)); 24 | } 25 | }; 26 | public: 27 | virtual void run() = 0; 28 | void start() 29 | { 30 | assert(!isStarted); 31 | LIMONP_CHECK(!pthread_create(&thread_, NULL, worker_, this)); 32 | isStarted = true; 33 | } 34 | void join() 35 | { 36 | assert(!isJoined); 37 | LIMONP_CHECK(!pthread_join(thread_, NULL)); 38 | isJoined = true; 39 | } 40 | private: 41 | static void * worker_(void * data) 42 | { 43 | IThread * ptr = (IThread* ) data; 44 | ptr->run(); 45 | return NULL; 46 | } 47 | }; 48 | } 49 | 50 | #endif 51 | -------------------------------------------------------------------------------- /src/CppJieba/Limonp/MutexLock.hpp: -------------------------------------------------------------------------------- 1 | #ifndef LIMONP_MUTEX_LOCK_HPP 2 | #define LIMONP_MUTEX_LOCK_HPP 3 | 4 | #include 5 | #include "NonCopyable.hpp" 6 | #include "HandyMacro.hpp" 7 | 8 | namespace Limonp 9 | { 10 | class MutexLock: NonCopyable 11 | { 12 | private: 13 | pthread_mutex_t mutex_; 14 | public: 15 | pthread_mutex_t* getPthreadMutex() 16 | { 17 | return &mutex_; 18 | } 19 | public: 20 | MutexLock() 21 | { 22 | LIMONP_CHECK(!pthread_mutex_init(&mutex_, NULL)); 23 | } 24 | ~MutexLock() 25 | { 26 | LIMONP_CHECK(!pthread_mutex_destroy(&mutex_)); 27 | } 28 | private: 29 | void lock() 30 | { 31 | LIMONP_CHECK(!pthread_mutex_lock(&mutex_)); 32 | } 33 | void unlock() 34 | { 35 | LIMONP_CHECK(!pthread_mutex_unlock(&mutex_)); 36 | } 37 | friend class MutexLockGuard; 38 | }; 39 | class MutexLockGuard: NonCopyable 40 | { 41 | public: 42 | explicit MutexLockGuard(MutexLock & mutex) 43 | : mutex_(mutex) 44 | { 45 | mutex_.lock(); 46 | } 47 | ~MutexLockGuard() 48 | { 49 | mutex_.unlock(); 50 | } 51 | private: 52 | MutexLock & mutex_; 53 | }; 54 | #define MutexLockGuard(x) assert(false); 55 
| } 56 | 57 | #endif 58 | -------------------------------------------------------------------------------- /src/CppJieba/TransCode.hpp: -------------------------------------------------------------------------------- 1 | /************************************ 2 | * file enc : utf-8 3 | * author : wuyanyi09@gmail.com 4 | ************************************/ 5 | #ifndef CPPJIEBA_TRANSCODE_H 6 | #define CPPJIEBA_TRANSCODE_H 7 | 8 | 9 | #include "Limonp/StringUtil.hpp" 10 | #include "Limonp/LocalVector.hpp" 11 | 12 | namespace CppJieba 13 | { 14 | 15 | using namespace Limonp; 16 | typedef uint16_t UnicodeValueType; 17 | typedef Limonp::LocalVector Unicode; 18 | namespace TransCode 19 | { 20 | inline bool decode(const string& str, Unicode& res) 21 | { 22 | #ifdef CPPJIEBA_GBK 23 | return gbkTrans(str, res); 24 | #else 25 | return utf8ToUnicode(str, res); 26 | #endif 27 | } 28 | 29 | inline bool encode(Unicode::const_iterator begin, Unicode::const_iterator end, string& res) 30 | { 31 | #ifdef CPPJIEBA_GBK 32 | return gbkTrans(begin, end, res); 33 | #else 34 | return unicodeToUtf8(begin, end, res); 35 | #endif 36 | } 37 | 38 | inline bool encode(const Unicode& uni, string& res) 39 | { 40 | return encode(uni.begin(), uni.end(), res); 41 | } 42 | 43 | // the compiler is expected to optimize this function to avoid a return-value copy 44 | inline string encode(Unicode::const_iterator begin, Unicode::const_iterator end) 45 | { 46 | string res; 47 | res.reserve(end - begin); 48 | encode(begin, end, res); 49 | return res; 50 | } 51 | 52 | // the compiler is expected to optimize this function to avoid a return-value copy 53 | inline Unicode decode(const string& str) 54 | { 55 | Unicode unicode; 56 | unicode.reserve(str.size()); 57 | decode(str, unicode); 58 | return unicode; 59 | } 60 | } 61 | } 62 | 63 | #endif 64 | -------------------------------------------------------------------------------- /src/CppJieba/Limonp/BoundedQueue.hpp: 
-------------------------------------------------------------------------------- 1 | #ifndef LIMONP_BOUNDED_QUEUE_HPP 2 | #define LIMONP_BOUNDED_QUEUE_HPP 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | namespace Limonp 9 | { 10 | using namespace std; 11 | template 12 | class BoundedQueue 13 | { 14 | private: 15 | size_t head_; 16 | size_t tail_; 17 | size_t size_; 18 | const size_t capacity_; 19 | vector circular__buffer; 20 | public: 21 | explicit BoundedQueue(size_t capacity): capacity_(capacity), circular__buffer(capacity) 22 | { 23 | head_ = 0; 24 | tail_ = 0; 25 | size_ = 0; 26 | assert(capacity_); 27 | } 28 | ~BoundedQueue(){} 29 | public: 30 | void clear() 31 | { 32 | head_ = 0; 33 | tail_ = 0; 34 | size_ = 0; 35 | } 36 | bool empty() const 37 | { 38 | return !size_; 39 | } 40 | bool full() const 41 | { 42 | return capacity_ == size_; 43 | } 44 | size_t size() const 45 | { 46 | return size_; 47 | } 48 | size_t capacity() const 49 | { 50 | return capacity_; 51 | } 52 | 53 | void push(const T& t) 54 | { 55 | assert(!full()); 56 | circular__buffer[tail_] = t; 57 | tail_ = (tail_ + 1) % capacity_; 58 | size_ ++; 59 | } 60 | 61 | T pop() 62 | { 63 | assert(!empty()); 64 | size_t oldPos = head_; 65 | head_ = (head_ + 1) % capacity_; 66 | size_ --; 67 | return circular__buffer[oldPos]; 68 | } 69 | 70 | }; 71 | } 72 | 73 | #endif 74 | -------------------------------------------------------------------------------- /src/CppJieba/Limonp/ArgvContext.hpp: -------------------------------------------------------------------------------- 1 | /************************************ 2 | * file enc : ascii 3 | * author : wuyanyi09@gmail.com 4 | ************************************/ 5 | 6 | #ifndef LIMONP_ARGV_FUNCTS_H 7 | #define LIMONP_ARGV_FUNCTS_H 8 | 9 | #include 10 | #include 11 | #include "StringUtil.hpp" 12 | 13 | namespace Limonp 14 | { 15 | using namespace std; 16 | class ArgvContext 17 | { 18 | public : 19 | ArgvContext(int argc, const char* const * argv) 20 | { 
21 | 22 | for(int i = 0; i < argc; i++) 23 | { 24 | if(startsWith(argv[i], "-")) 25 | { 26 | if(i + 1 < argc && !startsWith(argv[i + 1], "-")) 27 | { 28 | mpss_[argv[i]] = argv[i+1]; 29 | i++; 30 | } 31 | else 32 | { 33 | sset_.insert(argv[i]); 34 | } 35 | } 36 | else 37 | { 38 | args_.push_back(argv[i]); 39 | } 40 | } 41 | } 42 | ~ArgvContext(){}; 43 | public: 44 | friend ostream& operator << (ostream& os, const ArgvContext& args); 45 | string operator [](size_t i) const 46 | { 47 | if(i < args_.size()) 48 | { 49 | return args_[i]; 50 | } 51 | return ""; 52 | } 53 | string operator [](const string& key) const 54 | { 55 | map::const_iterator it = mpss_.find(key); 56 | if(it != mpss_.end()) 57 | { 58 | return it->second; 59 | } 60 | return ""; 61 | } 62 | public: 63 | bool hasKey(const string& key) const 64 | { 65 | if(mpss_.find(key) != mpss_.end() || sset_.find(key) != sset_.end()) 66 | { 67 | return true; 68 | } 69 | return false; 70 | } 71 | private: 72 | vector args_; 73 | map mpss_; 74 | set sset_; 75 | 76 | }; 77 | 78 | inline ostream& operator << (ostream& os, const ArgvContext& args) 79 | { 80 | return os< 10 | 11 | 12 | namespace CppJieba 13 | { 14 | using namespace Limonp; 15 | 16 | //const char* const SPECIAL_CHARS = " \t\n"; 17 | #ifndef CPPJIEBA_GBK 18 | const UnicodeValueType SPECIAL_SYMBOL[] = {32u, 9u, 10u, 12290u, 65292u}; 19 | #else 20 | const UnicodeValueType SPECIAL_SYMBOL[] = {32u, 9u, 10u}; 21 | #endif 22 | 23 | class SegmentBase: public ISegment, public NonCopyable 24 | { 25 | public: 26 | SegmentBase(){_loadSpecialSymbols();}; 27 | virtual ~SegmentBase(){}; 28 | private: 29 | unordered_set _specialSymbols; 30 | private: 31 | void _loadSpecialSymbols() 32 | { 33 | size_t size = sizeof(SPECIAL_SYMBOL)/sizeof(*SPECIAL_SYMBOL); 34 | for(size_t i = 0; i < size; i ++) 35 | { 36 | _specialSymbols.insert(SPECIAL_SYMBOL[i]); 37 | } 38 | assert(_specialSymbols.size()); 39 | } 40 | 41 | public: 42 | virtual bool cut(Unicode::const_iterator begin, 
Unicode::const_iterator end, vector& res) const = 0; 43 | virtual bool cut(const string& str, vector& res) const 44 | { 45 | res.clear(); 46 | 47 | Unicode unicode; 48 | unicode.reserve(str.size()); 49 | 50 | TransCode::decode(str, unicode); 51 | 52 | Unicode::const_iterator left = unicode.begin(); 53 | Unicode::const_iterator right; 54 | 55 | for(right = unicode.begin(); right != unicode.end(); right++) 56 | { 57 | if(isIn(_specialSymbols, *right)) 58 | { 59 | if(left != right) 60 | { 61 | cut(left, right, res); 62 | } 63 | res.resize(res.size() + 1); 64 | TransCode::encode(right, right + 1, res.back()); 65 | left = right + 1; 66 | } 67 | } 68 | if(left != right) 69 | { 70 | cut(left, right, res); 71 | } 72 | 73 | return true; 74 | } 75 | }; 76 | } 77 | 78 | #endif 79 | -------------------------------------------------------------------------------- /src/CppJieba/Limonp/Logger.hpp: -------------------------------------------------------------------------------- 1 | /************************************ 2 | * file enc : utf8 3 | * author : wuyanyi09@gmail.com 4 | ************************************/ 5 | #ifndef LIMONP_LOGGER_H 6 | #define LIMONP_LOGGER_H 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | 19 | #define FILE_BASENAME strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__ 20 | 21 | #define LogDebug(fmt, ...) Limonp::Logger::LoggingF(Limonp::LL_DEBUG, FILE_BASENAME, __LINE__, fmt, ## __VA_ARGS__) 22 | #define LogInfo(fmt, ...) Limonp::Logger::LoggingF(Limonp::LL_INFO, FILE_BASENAME, __LINE__, fmt, ## __VA_ARGS__) 23 | #define LogWarn(fmt, ...) Limonp::Logger::LoggingF(Limonp::LL_WARN, FILE_BASENAME, __LINE__, fmt, ## __VA_ARGS__) 24 | #define LogError(fmt, ...) Limonp::Logger::LoggingF(Limonp::LL_ERROR, FILE_BASENAME, __LINE__, fmt, ## __VA_ARGS__) 25 | #define LogFatal(fmt, ...) 
Limonp::Logger::LoggingF(Limonp::LL_FATAL, FILE_BASENAME, __LINE__, fmt, ## __VA_ARGS__) 26 | 27 | namespace Limonp 28 | { 29 | using namespace std; 30 | enum {LL_DEBUG = 0, LL_INFO = 1, LL_WARN = 2, LL_ERROR = 3, LL_FATAL = 4, LEVEL_ARRAY_SIZE = 5, CSTR_BUFFER_SIZE = 32}; 31 | static const char * LOG_LEVEL_ARRAY[LEVEL_ARRAY_SIZE]= {"DEBUG","INFO","WARN","ERROR","FATAL"}; 32 | static const char * LOG_FORMAT = "%s %s:%d %s %s\n"; 33 | static const char * LOG_TIME_FORMAT = "%Y-%m-%d %H:%M:%S"; 34 | 35 | class Logger 36 | { 37 | public: 38 | static void Logging(size_t level, const string& msg, const char* fileName, int lineno) 39 | { 40 | assert(level <= LL_FATAL); 41 | char buf[CSTR_BUFFER_SIZE]; 42 | time_t timeNow; 43 | time(&timeNow); 44 | strftime(buf, sizeof(buf), LOG_TIME_FORMAT, localtime(&timeNow)); 45 | fprintf(stderr, LOG_FORMAT, buf, fileName, lineno,LOG_LEVEL_ARRAY[level], msg.c_str()); 46 | } 47 | static void LoggingF(size_t level, const char* fileName, int lineno, const char* const fmt, ...) 
48 | { 49 | #ifdef LOGGER_LEVEL 50 | if(level < LOGGER_LEVEL) return; 51 | #endif 52 | int size = 256; 53 | string msg; 54 | va_list ap; 55 | while (1) { 56 | msg.resize(size); 57 | va_start(ap, fmt); 58 | int n = vsnprintf((char *)msg.c_str(), size, fmt, ap); 59 | va_end(ap); 60 | if (n > -1 && n < size) { 61 | msg.resize(n); 62 | break; 63 | } 64 | if (n > -1) 65 | size = n + 1; 66 | else 67 | size *= 2; 68 | } 69 | Logging(level, msg, fileName, lineno); 70 | } 71 | }; 72 | } 73 | 74 | #endif 75 | -------------------------------------------------------------------------------- /src/CppJieba/Limonp/CastFloat.hpp: -------------------------------------------------------------------------------- 1 | #ifndef LIMONP_CAST_FUNCTS_H 2 | #define LIMONP_CAST_FUNCTS_H 3 | 4 | namespace Limonp 5 | { 6 | namespace CastFloat 7 | { 8 | //logical and or 9 | static const int sign_32 = 0xC0000000; 10 | static const int exponent_32 = 0x07800000; 11 | static const int mantissa_32 = 0x007FE000; 12 | static const int sign_exponent_32 = 0x40000000; 13 | static const int loss_32 = 0x38000000; 14 | 15 | static const short sign_16 = (short)0xC000; 16 | static const short exponent_16 = (short)0x3C00; 17 | static const short mantissa_16 = (short)0x03FF; 18 | static const short sign_exponent_16 = (short)0x4000; 19 | static const int exponent_fill_32 = 0x38000000; 20 | 21 | //infinite 22 | static const short infinite_16 = (short) 0x7FFF; 23 | static const short infinitesmall_16 = (short) 0x0000; 24 | 25 | inline float intBitsToFloat(unsigned int x) 26 | { 27 | union 28 | { 29 | float f; 30 | int i; 31 | }u; 32 | u.i = x; 33 | return u.f; 34 | } 35 | 36 | inline int floatToIntBits(float f) 37 | { 38 | union 39 | { 40 | float f; 41 | int i ; 42 | }u; 43 | u.f = f; 44 | return u.i; 45 | } 46 | 47 | inline short floatToShortBits(float f) 48 | { 49 | int fi = floatToIntBits(f); 50 | 51 | // 提取关键信息 52 | short sign = (short) ((unsigned int)(fi & sign_32) >> 16); 53 | short exponent = (short) 
((unsigned int)(fi & exponent_32) >> 13); 54 | short mantissa = (short) ((unsigned int)(fi & mantissa_32) >> 13); 55 | // 生成编码结果 56 | short code = (short) (sign | exponent | mantissa); 57 | // 无穷大量、无穷小量的处理 58 | if ((fi & loss_32) > 0 && (fi & sign_exponent_32) > 0) { 59 | // 当指数符号为1时(正次方),且左234位为1,返回无穷大量 60 | return (short) (code | infinite_16); 61 | } 62 | if (((fi & loss_32) ^ loss_32) > 0 && (fi & sign_exponent_32) == 0) { 63 | // 当指数符号位0时(负次方),且左234位为0(与111异或>0),返回无穷小量 64 | return infinitesmall_16; 65 | } 66 | 67 | return code; 68 | } 69 | 70 | inline float shortBitsToFloat(short s) 71 | { 72 | /* 73 | * 指数空余3位:若符号位为1,补0;若符号位为0,补1。 尾数位在后补0(13个) 74 | */ 75 | int sign = ((int) (s & sign_16)) << 16; 76 | int exponent = ((int) (s & exponent_16)) << 13; 77 | // 指数符号位为0,234位补1 78 | if ((s & sign_exponent_16) == 0 && s != 0) { 79 | exponent |= exponent_fill_32; 80 | } 81 | int mantissa = ((int) (s & mantissa_16)) << 13; 82 | // 生成解码结果 83 | int code = sign | exponent | mantissa; 84 | return intBitsToFloat(code); 85 | 86 | } 87 | } 88 | } 89 | 90 | #endif 91 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Build Status](https://travis-ci.org/aszxqw/nodejieba.png?branch=master)](https://travis-ci.org/aszxqw/nodejieba) 2 | [![Dependency Status](https://david-dm.org/aszxqw/nodejieba.png?theme=shields.io)](https://david-dm.org/aszxqw/nodejieba) 3 | [![devDependency Status](https://david-dm.org/aszxqw/nodejieba/dev-status.png?theme=shields.io)](https://david-dm.org/aszxqw/nodejieba#info=devDependencies) 4 | [![NpmDownload Status](http://img.shields.io/npm/dm/nodejieba.svg)](https://www.npmjs.org/package/nodejieba) 5 | - - - 6 | 7 | # NodeJieba "结巴"分词的Node.js版本 8 | 9 | ## Introduction 10 | 11 | `NodeJieba`只是[CppJieba]简单包装而成的`node`扩展,用来进行中文分词。 12 | 13 | 详见[NodeJiebaBlog] 14 | 15 | ## Install 16 | 17 | ```sh 18 | npm install nodejieba 19 | ``` 20 
| 21 | 因为`npm`速度很慢而且经常因为墙的原因出现莫名其妙的问题,在此强烈建议使用[cnpm],命令如下: 22 | 23 | ```sh 24 | npm --registry=http://r.cnpmjs.org install nodejieba 25 | ``` 26 | 27 | ## Usage 28 | 29 | ### 默认分词算法 30 | 31 | #### 初始化 32 | 33 | ```js 34 | var segment = require("nodejieba"); 35 | segment.loadDict("./node_modules/nodejieba/dict/jieba.dict.utf8", "./node_modules/nodejieba/dict/hmm_model.utf8"); 36 | ``` 37 | 38 | #### 阻塞式调用 39 | 40 | ```js 41 | var wordList = segment.cutSync("阻塞模式分词"); 42 | if (wordList.constructor == Array) // for illustration only; this is always true 43 | { 44 | wordList.forEach(function(word) { 45 | console.log(word); 46 | }); 47 | } 48 | ``` 49 | 50 | #### 非阻塞式调用 51 | 52 | ```js 53 | segment.cut("非阻塞模式分词", function(wordList) { 54 | wordList.forEach(function(word) { 55 | console.log(word); 56 | }); 57 | }); 58 | ``` 59 | 60 | ### 搜索引擎分词算法 61 | 62 | #### 初始化 63 | 64 | ```js 65 | var segment = require("nodejieba"); 66 | segment.queryLoadDict("./node_modules/nodejieba/dict/jieba.dict.utf8", "./node_modules/nodejieba/dict/hmm_model.utf8"); 67 | ``` 68 | 69 | #### 阻塞式调用 70 | 71 | ```js 72 | var wordList = segment.queryCutSync("阻塞模式分词"); 73 | if (wordList.constructor == Array) // for illustration only; this is always true 74 | { 75 | wordList.forEach(function(word) { 76 | console.log(word); 77 | }); 78 | } 79 | ``` 80 | 81 | #### 非阻塞式调用 82 | 83 | ```js 84 | segment.queryCut("非阻塞模式分词", function(wordList) { 85 | wordList.forEach(function(word) { 86 | console.log(word); 87 | }); 88 | }); 89 | ``` 90 | 91 | 具体用法可以参考 `test/segment.js test/query_segment.js` 92 | 93 | ### 词性标注 94 | 95 | 具体用法可以参考 `test/pos_tagger.js` 96 | 97 | ## Testing 98 | 99 | 在`node v0.10.2`下测试通过 100 | 101 | ## Demo 102 | 103 | http://cppjieba-webdemo.herokuapp.com/ 104 | (Chrome is recommended) 105 | 106 | ## Thanks 107 | 108 | [Jieba中文分词] 109 | 110 | ## Author 111 | 112 | - aszxqw https://github.com/aszxqw wuyanyi09@gmail.com 113 | - myl2821 https://github.com/myl2821 myl2821@gmail.com 114 | 115 | 
[NodeJiebaBlog]:http://www.aszxqw.com/work/2014/02/22/nodejs-cpp-addon-nodejieba.html 116 | [CppJieba]:https://github.com/aszxqw/cppjieba.git 117 | [cnpm]:http://cnpmjs.org 118 | [Jieba中文分词]:https://github.com/fxsjy/jieba 119 | -------------------------------------------------------------------------------- /src/CppJieba/Limonp/ThreadPool.hpp: -------------------------------------------------------------------------------- 1 | #ifndef LIMONP_THREAD_POOL_HPP 2 | #define LIMONP_THREAD_POOL_HPP 3 | 4 | #include "Thread.hpp" 5 | #include "BlockingQueue.hpp" 6 | 7 | namespace Limonp 8 | { 9 | class ITask 10 | { 11 | public: 12 | virtual void run() = 0; 13 | virtual ~ITask() {} 14 | }; 15 | 16 | template 17 | ITask* CreateTask(ArgType arg) 18 | { 19 | return new TaskType(arg); 20 | } 21 | template 22 | ITask* CreateTask(ArgType0 arg0, ArgType1 arg1) 23 | { 24 | return new TaskType(arg0, arg1); 25 | } 26 | 27 | //class ThreadPool; 28 | class ThreadPool: NonCopyable 29 | { 30 | private: 31 | class Worker: public IThread 32 | { 33 | private: 34 | ThreadPool * ptThreadPool_; 35 | public: 36 | Worker(ThreadPool* pool): ptThreadPool_(pool) 37 | { 38 | assert(ptThreadPool_); 39 | } 40 | virtual ~Worker() 41 | { 42 | } 43 | public: 44 | virtual void run() 45 | { 46 | while(true) 47 | { 48 | ITask * task = ptThreadPool_->queue_.pop(); 49 | if(task == NULL) 50 | { 51 | break; 52 | } 53 | task->run(); 54 | delete task; 55 | } 56 | } 57 | }; 58 | private: 59 | friend class Worker; 60 | private: 61 | vector threads_; 62 | BoundedBlockingQueue queue_; 63 | //mutable MutexLock mutex_; 64 | //Condition isEmpty__; 65 | public: 66 | ThreadPool(size_t threadNum, size_t queueMaxSize): threads_(threadNum), queue_(queueMaxSize)//, mutex_(), isEmpty__(mutex_) 67 | { 68 | assert(threadNum); 69 | assert(queueMaxSize); 70 | for(size_t i = 0; i < threads_.size(); i ++) 71 | { 72 | threads_[i] = new Worker(this); 73 | } 74 | } 75 | ~ThreadPool() 76 | { 77 | for(size_t i = 0; i < threads_.size(); i 
++) 78 | { 79 | queue_.push(NULL); 80 | } 81 | for(size_t i = 0; i < threads_.size(); i ++) 82 | { 83 | threads_[i]->join(); 84 | delete threads_[i]; 85 | } 86 | } 87 | 88 | public: 89 | void start() 90 | { 91 | for(size_t i = 0; i < threads_.size(); i++) 92 | { 93 | threads_[i]->start(); 94 | } 95 | } 96 | 97 | void add(ITask* task) 98 | { 99 | assert(task); 100 | queue_.push(task); 101 | } 102 | }; 103 | } 104 | 105 | #endif 106 | -------------------------------------------------------------------------------- /src/CppJieba/PosTagger.hpp: -------------------------------------------------------------------------------- 1 | #ifndef CPPJIEBA_POS_TAGGING_H 2 | #define CPPJIEBA_POS_TAGGING_H 3 | 4 | #include "MixSegment.hpp" 5 | #include "Limonp/StringUtil.hpp" 6 | #include "DictTrie.hpp" 7 | 8 | namespace CppJieba 9 | { 10 | using namespace Limonp; 11 | 12 | static const char* const POS_M = "m"; 13 | static const char* const POS_ENG = "eng"; 14 | static const char* const POS_X = "x"; 15 | 16 | class PosTagger 17 | { 18 | private: 19 | MixSegment _segment; 20 | const DictTrie * _dictTrie; 21 | 22 | public: 23 | PosTagger() 24 | {} 25 | PosTagger( 26 | const string& dictPath, 27 | const string& hmmFilePath, 28 | const string& userDictPath = "" 29 | ) 30 | { 31 | init(dictPath, hmmFilePath, userDictPath); 32 | }; 33 | ~PosTagger(){}; 34 | public: 35 | void init( 36 | const string& dictPath, 37 | const string& hmmFilePath, 38 | const string& userDictPath = "" 39 | ) 40 | { 41 | LIMONP_CHECK(_segment.init(dictPath, hmmFilePath, userDictPath)); 42 | _dictTrie = _segment.getDictTrie(); 43 | LIMONP_CHECK(_dictTrie); 44 | }; 45 | 46 | 47 | bool tag(const string& src, vector >& res) const 48 | { 49 | vector cutRes; 50 | if (!_segment.cut(src, cutRes)) 51 | { 52 | LogError("_mixSegment cut failed"); 53 | return false; 54 | } 55 | 56 | const DictUnit *tmp = NULL; 57 | Unicode unico; 58 | for (vector::iterator itr = cutRes.begin(); itr != cutRes.end(); ++itr) 59 | { 60 | if 
(!TransCode::decode(*itr, unico)) 61 | { 62 | LogError("decode failed."); 63 | return false; 64 | } 65 | tmp = _dictTrie->find(unico.begin(), unico.end()); 66 | if(tmp == NULL || tmp->tag.empty()) 67 | { 68 | res.push_back(make_pair(*itr, _specialRule(unico))); 69 | } 70 | else 71 | { 72 | res.push_back(make_pair(*itr, tmp->tag)); 73 | } 74 | } 75 | return !res.empty(); 76 | } 77 | private: 78 | const char* _specialRule(const Unicode& unicode) const 79 | { 80 | size_t m = 0; 81 | size_t eng = 0; 82 | for(size_t i = 0; i < unicode.size() && eng < unicode.size() / 2; i++) 83 | { 84 | if(unicode[i] < 0x80) 85 | { 86 | eng ++; 87 | if('0' <= unicode[i] && unicode[i] <= '9') 88 | { 89 | m++; 90 | } 91 | } 92 | } 93 | // ascii char is not found 94 | if(eng == 0) 95 | { 96 | return POS_X; 97 | } 98 | // all the ascii is number char 99 | if(m == eng) 100 | { 101 | return POS_M; 102 | } 103 | // the ascii chars contain english letter 104 | return POS_ENG; 105 | } 106 | }; 107 | } 108 | 109 | #endif 110 | -------------------------------------------------------------------------------- /src/CppJieba/Limonp/Config.hpp: -------------------------------------------------------------------------------- 1 | /************************************ 2 | * file enc : utf8 3 | * author : wuyanyi09@gmail.com 4 | ************************************/ 5 | #ifndef LIMONP_CONFIG_H 6 | #define LIMONP_CONFIG_H 7 | 8 | 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include "StringUtil.hpp" 14 | 15 | namespace Limonp 16 | { 17 | using namespace std; 18 | class Config 19 | { 20 | public: 21 | explicit Config(const string& filePath) 22 | { 23 | loadFile_(filePath); 24 | } 25 | public: 26 | operator bool () 27 | { 28 | return !map_.empty(); 29 | } 30 | private: 31 | void loadFile_(const string& filePath) 32 | { 33 | ifstream ifs(filePath.c_str()); 34 | assert(ifs); 35 | string line; 36 | vector vecBuf; 37 | size_t lineno = 0; 38 | while(getline(ifs, line)) 39 | { 40 | lineno ++; 41 | 
trim(line); 42 | if(line.empty() || startsWith(line, "#")) 43 | { 44 | continue; 45 | } 46 | vecBuf.clear(); 47 | if(!split(line, vecBuf, "=") || 2 != vecBuf.size()) 48 | { 49 | fprintf(stderr, "line[%s] illegal.\n", line.c_str()); 50 | assert(false); 51 | continue; 52 | } 53 | string& key = vecBuf[0]; 54 | string& value = vecBuf[1]; 55 | trim(key); 56 | trim(value); 57 | if(!map_.insert(make_pair(key, value)).second) 58 | { 59 | fprintf(stderr, "key[%s] already exits.\n", key.c_str()); 60 | assert(false); 61 | continue; 62 | } 63 | } 64 | ifs.close(); 65 | } 66 | public: 67 | bool get(const string& key, string& value) const 68 | { 69 | map::const_iterator it = map_.find(key); 70 | if(map_.end() != it) 71 | { 72 | value = it->second; 73 | return true; 74 | } 75 | return false; 76 | } 77 | bool get(const string& key, int & value) const 78 | { 79 | string str; 80 | if(!get(key, str)) { 81 | return false; 82 | } 83 | value = atoi(str.c_str()); 84 | return true; 85 | } 86 | const char* operator [] (const char* key) const 87 | { 88 | if(NULL == key) 89 | { 90 | return NULL; 91 | } 92 | map::const_iterator it = map_.find(key); 93 | if(map_.end() != it) 94 | { 95 | return it->second.c_str(); 96 | } 97 | return NULL; 98 | } 99 | public: 100 | string getConfigInfo() const 101 | { 102 | string res; 103 | res << *this; 104 | return res; 105 | } 106 | private: 107 | map map_; 108 | private: 109 | friend ostream& operator << (ostream& os, const Config& config); 110 | }; 111 | 112 | inline ostream& operator << (ostream& os, const Config& config) 113 | { 114 | return os << config.map_; 115 | } 116 | } 117 | 118 | #endif 119 | -------------------------------------------------------------------------------- /src/CppJieba/QuerySegment.hpp: -------------------------------------------------------------------------------- 1 | #ifndef CPPJIEBA_QUERYSEGMENT_H 2 | #define CPPJIEBA_QUERYSEGMENT_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include "Limonp/Logger.hpp" 8 | #include 
"DictTrie.hpp" 9 | #include "ISegment.hpp" 10 | #include "SegmentBase.hpp" 11 | #include "FullSegment.hpp" 12 | #include "MixSegment.hpp" 13 | #include "TransCode.hpp" 14 | #include "DictTrie.hpp" 15 | 16 | namespace CppJieba 17 | { 18 | class QuerySegment: public SegmentBase 19 | { 20 | private: 21 | MixSegment _mixSeg; 22 | FullSegment _fullSeg; 23 | size_t _maxWordLen; 24 | 25 | public: 26 | QuerySegment(){}; 27 | QuerySegment(const string& dict, const string& model, size_t maxWordLen) 28 | { 29 | init(dict, model, maxWordLen); 30 | }; 31 | virtual ~QuerySegment(){}; 32 | public: 33 | bool init(const string& dict, const string& model, size_t maxWordLen) 34 | { 35 | LIMONP_CHECK(_mixSeg.init(dict, model)); 36 | LIMONP_CHECK(_fullSeg.init(_mixSeg.getDictTrie())); 37 | assert(maxWordLen); 38 | _maxWordLen = maxWordLen; 39 | return true; 40 | } 41 | 42 | public: 43 | using SegmentBase::cut; 44 | 45 | public: 46 | bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res) const 47 | { 48 | if (begin >= end) 49 | { 50 | LogError("begin >= end"); 51 | return false; 52 | } 53 | 54 | //use mix cut first 55 | vector mixRes; 56 | if (!_mixSeg.cut(begin, end, mixRes)) 57 | { 58 | LogError("_mixSeg cut failed."); 59 | return false; 60 | } 61 | 62 | vector fullRes; 63 | for (vector::const_iterator mixResItr = mixRes.begin(); mixResItr != mixRes.end(); mixResItr++) 64 | { 65 | 66 | // if it's too long, cut with _fullSeg, put fullRes in res 67 | if (mixResItr->size() > _maxWordLen) 68 | { 69 | if (_fullSeg.cut(mixResItr->begin(), mixResItr->end(), fullRes)) 70 | { 71 | for (vector::const_iterator fullResItr = fullRes.begin(); fullResItr != fullRes.end(); fullResItr++) 72 | { 73 | res.push_back(*fullResItr); 74 | } 75 | 76 | //clear tmp res 77 | fullRes.clear(); 78 | } 79 | } 80 | else // just use the mix result 81 | { 82 | res.push_back(*mixResItr); 83 | } 84 | } 85 | 86 | return true; 87 | } 88 | 89 | 90 | bool cut(Unicode::const_iterator begin, 
Unicode::const_iterator end, vector& res) const 91 | { 92 | if (begin >= end) 93 | { 94 | LogError("begin >= end"); 95 | return false; 96 | } 97 | 98 | vector uRes; 99 | if (!cut(begin, end, uRes)) 100 | { 101 | LogError("get unicode cut result error."); 102 | return false; 103 | } 104 | 105 | string tmp; 106 | for (vector::const_iterator uItr = uRes.begin(); uItr != uRes.end(); uItr++) 107 | { 108 | if (TransCode::encode(*uItr, tmp)) 109 | { 110 | res.push_back(tmp); 111 | } 112 | else 113 | { 114 | LogError("encode failed."); 115 | } 116 | } 117 | 118 | return true; 119 | } 120 | }; 121 | } 122 | 123 | #endif 124 | -------------------------------------------------------------------------------- /src/CppJieba/Limonp/StdExtension.hpp: -------------------------------------------------------------------------------- 1 | #ifndef LIMONP_STD_EXTEMSION_HPP 2 | #define LIMONP_STD_EXTEMSION_HPP 3 | 4 | #include 5 | 6 | #if(__cplusplus == 201103L) 7 | #include 8 | #include 9 | #else 10 | #include 11 | #include 12 | namespace std 13 | { 14 | using std::tr1::unordered_map; 15 | using std::tr1::unordered_set; 16 | } 17 | 18 | #endif 19 | 20 | #include 21 | #include 22 | #include 23 | #include 24 | 25 | 26 | namespace std 27 | { 28 | template 29 | ostream& operator << (ostream& os, const vector& vec) 30 | { 31 | if(vec.empty()) 32 | { 33 | return os << "[]"; 34 | } 35 | os<<"[\""< 44 | ostream& operator << (ostream& os, const pair& pr) 45 | { 46 | os << pr.first << ":" << pr.second ; 47 | return os; 48 | } 49 | 50 | 51 | template 52 | string& operator << (string& str, const T& obj) 53 | { 54 | stringstream ss; 55 | ss << obj; // call ostream& operator << (ostream& os, 56 | return str = ss.str(); 57 | } 58 | 59 | template 60 | ostream& operator << (ostream& os, const map& mp) 61 | { 62 | if(mp.empty()) 63 | { 64 | os<<"{}"; 65 | return os; 66 | } 67 | os<<'{'; 68 | typename map::const_iterator it = mp.begin(); 69 | os<<*it; 70 | it++; 71 | while(it != mp.end()) 72 | { 73 | 
os<<", "<<*it; 74 | it++; 75 | } 76 | os<<'}'; 77 | return os; 78 | } 79 | template 80 | ostream& operator << (ostream& os, const std::unordered_map& mp) 81 | { 82 | if(mp.empty()) 83 | { 84 | return os << "{}"; 85 | } 86 | os<<'{'; 87 | typename std::unordered_map::const_iterator it = mp.begin(); 88 | os<<*it; 89 | it++; 90 | while(it != mp.end()) 91 | { 92 | os<<", "<<*it++; 93 | } 94 | return os<<'}'; 95 | } 96 | 97 | template 98 | ostream& operator << (ostream& os, const set& st) 99 | { 100 | if(st.empty()) 101 | { 102 | os << "{}"; 103 | return os; 104 | } 105 | os<<'{'; 106 | typename set::const_iterator it = st.begin(); 107 | os<<*it; 108 | it++; 109 | while(it != st.end()) 110 | { 111 | os<<", "<<*it; 112 | it++; 113 | } 114 | os<<'}'; 115 | return os; 116 | } 117 | 118 | template 119 | bool isIn(const ContainType& contain, const KeyType& key) 120 | { 121 | return contain.end() != contain.find(key); 122 | } 123 | 124 | template 125 | basic_string & operator << (basic_string & s, ifstream & ifs) 126 | { 127 | return s.assign((istreambuf_iterator(ifs)), istreambuf_iterator()); 128 | } 129 | 130 | template 131 | ofstream & operator << (ofstream & ofs, const basic_string& s) 132 | { 133 | ostreambuf_iterator itr (ofs); 134 | copy(s.begin(), s.end(), itr); 135 | return ofs; 136 | } 137 | } 138 | 139 | #endif 140 | -------------------------------------------------------------------------------- /src/CppJieba/Limonp/BlockingQueue.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | https://github.com/chenshuo/muduo/blob/master/muduo/base/BlockingQueue.h 3 | */ 4 | 5 | #ifndef LIMONP_BLOCKINGQUEUE_HPP 6 | #define LIMONP_BLOCKINGQUEUE_HPP 7 | 8 | #include 9 | #include "BoundedQueue.hpp" 10 | #include "Condition.hpp" 11 | 12 | namespace Limonp 13 | { 14 | template 15 | class BlockingQueue: NonCopyable 16 | { 17 | public: 18 | BlockingQueue() 19 | : mutex_(), notEmpty_(mutex_), queue_() 20 | { 21 | } 22 | 23 | void 
push(const T& x) 24 | { 25 | MutexLockGuard lock(mutex_); 26 | queue_.push(x); 27 | notEmpty_.notify(); // wait morphing saves us 28 | } 29 | 30 | T pop() 31 | { 32 | MutexLockGuard lock(mutex_); 33 | // always use a while-loop, due to spurious wakeup 34 | while (queue_.empty()) 35 | { 36 | notEmpty_.wait(); 37 | } 38 | assert(!queue_.empty()); 39 | T front(queue_.front()); 40 | queue_.pop(); 41 | return front; 42 | } 43 | 44 | size_t size() const 45 | { 46 | MutexLockGuard lock(mutex_); 47 | return queue_.size(); 48 | } 49 | bool empty() const 50 | { 51 | return size() == 0; 52 | } 53 | 54 | private: 55 | mutable MutexLock mutex_; 56 | Condition notEmpty_; 57 | std::queue queue_; 58 | }; 59 | 60 | template 61 | class BoundedBlockingQueue : NonCopyable 62 | { 63 | public: 64 | explicit BoundedBlockingQueue(size_t maxSize) 65 | : mutex_(), 66 | notEmpty_(mutex_), 67 | notFull_(mutex_), 68 | queue_(maxSize) 69 | {} 70 | 71 | void push(const T& x) 72 | { 73 | MutexLockGuard lock(mutex_); 74 | while (queue_.full()) 75 | { 76 | notFull_.wait(); 77 | } 78 | assert(!queue_.full()); 79 | queue_.push(x); 80 | notEmpty_.notify(); 81 | } 82 | 83 | T pop() 84 | { 85 | MutexLockGuard lock(mutex_); 86 | while (queue_.empty()) 87 | { 88 | notEmpty_.wait(); 89 | } 90 | assert(!queue_.empty()); 91 | T res = queue_.pop(); 92 | notFull_.notify(); 93 | return res; 94 | } 95 | 96 | bool empty() const 97 | { 98 | MutexLockGuard lock(mutex_); 99 | return queue_.empty(); 100 | } 101 | 102 | bool full() const 103 | { 104 | MutexLockGuard lock(mutex_); 105 | return queue_.full(); 106 | } 107 | 108 | size_t size() const 109 | { 110 | MutexLockGuard lock(mutex_); 111 | return queue_.size(); 112 | } 113 | 114 | size_t capacity() const 115 | { 116 | return queue_.capacity(); 117 | } 118 | 119 | private: 120 | mutable MutexLock mutex_; 121 | Condition notEmpty_; 122 | Condition notFull_; 123 | BoundedQueue queue_; 124 | }; 125 | 126 | } 127 | 128 | #endif 129 | 
-------------------------------------------------------------------------------- /src/CppJieba/MixSegment.hpp: -------------------------------------------------------------------------------- 1 | #ifndef CPPJIEBA_MIXSEGMENT_H 2 | #define CPPJIEBA_MIXSEGMENT_H 3 | 4 | #include 5 | #include "MPSegment.hpp" 6 | #include "HMMSegment.hpp" 7 | #include "Limonp/StringUtil.hpp" 8 | 9 | namespace CppJieba 10 | { 11 | class MixSegment: public SegmentBase 12 | { 13 | private: 14 | MPSegment _mpSeg; 15 | HMMSegment _hmmSeg; 16 | public: 17 | MixSegment(){}; 18 | MixSegment(const string& mpSegDict, const string& hmmSegDict, const string& userDict = "") 19 | { 20 | LIMONP_CHECK(init(mpSegDict, hmmSegDict, userDict)); 21 | } 22 | virtual ~MixSegment(){} 23 | public: 24 | bool init(const string& mpSegDict, const string& hmmSegDict, const string& userDict = "") 25 | { 26 | LIMONP_CHECK(_mpSeg.init(mpSegDict, userDict)); 27 | LIMONP_CHECK(_hmmSeg.init(hmmSegDict)); 28 | LogInfo("MixSegment init(%s, %s)", mpSegDict.c_str(), hmmSegDict.c_str()); 29 | return true; 30 | } 31 | public: 32 | using SegmentBase::cut; 33 | public: 34 | virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res) const 35 | { 36 | vector words; 37 | words.reserve(end - begin); 38 | if(!_mpSeg.cut(begin, end, words)) 39 | { 40 | LogError("mpSeg cutDAG failed."); 41 | return false; 42 | } 43 | 44 | vector hmmRes; 45 | hmmRes.reserve(end - begin); 46 | Unicode piece; 47 | piece.reserve(end - begin); 48 | for (size_t i = 0, j = 0; i < words.size(); i++) 49 | { 50 | //if mp get a word, it's ok, put it into result 51 | if (1 != words[i].size() || (words[i].size() == 1 && _mpSeg.isUserDictSingleChineseWord(words[i][0]))) 52 | { 53 | res.push_back(words[i]); 54 | continue; 55 | } 56 | 57 | // if mp get a single one and it is not in userdict, collect it in sequence 58 | j = i; 59 | while (j < words.size() && 1 == words[j].size() && !_mpSeg.isUserDictSingleChineseWord(words[j][0])) 60 | { 
61 | piece.push_back(words[j][0]); 62 | j++; 63 | } 64 | 65 | // cut the sequence with hmm 66 | if (!_hmmSeg.cut(piece.begin(), piece.end(), hmmRes)) 67 | { 68 | LogError("_hmmSeg cut failed."); 69 | return false; 70 | } 71 | 72 | //put hmm result to result 73 | for (size_t k = 0; k < hmmRes.size(); k++) 74 | { 75 | res.push_back(hmmRes[k]); 76 | } 77 | 78 | //clear tmp vars 79 | piece.clear(); 80 | hmmRes.clear(); 81 | 82 | //let i jump over this piece 83 | i = j - 1; 84 | } 85 | return true; 86 | } 87 | 88 | virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res)const 89 | { 90 | if(begin == end) 91 | { 92 | return false; 93 | } 94 | 95 | vector uRes; 96 | uRes.reserve(end - begin); 97 | if (!cut(begin, end, uRes)) 98 | { 99 | return false; 100 | } 101 | 102 | size_t offset = res.size(); 103 | res.resize(res.size() + uRes.size()); 104 | for(size_t i = 0; i < uRes.size(); i ++, offset++) 105 | { 106 | if(!TransCode::encode(uRes[i], res[offset])) 107 | { 108 | LogError("encode failed."); 109 | } 110 | } 111 | return true; 112 | } 113 | 114 | const DictTrie* getDictTrie() const 115 | { 116 | return _mpSeg.getDictTrie(); 117 | } 118 | }; 119 | } 120 | 121 | #endif 122 | -------------------------------------------------------------------------------- /src/CppJieba/Limonp/MysqlClient.hpp: -------------------------------------------------------------------------------- 1 | #ifndef LIMONP_MYSQLCLIENT_H 2 | #define LIMONP_MYSQLCLIENT_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "Logger.hpp" 9 | #include "InitOnOff.hpp" 10 | 11 | namespace Limonp 12 | { 13 | using namespace std; 14 | class MysqlClient: public InitOnOff 15 | { 16 | public: 17 | typedef vector< vector > RowsType; 18 | private: 19 | const string host_; 20 | const size_t port_; 21 | const string user_; 22 | const string passwd_; 23 | const string db_; 24 | const string charset_; 25 | public: 26 | MysqlClient(const string& host, size_t port, const 
string& user, const string& passwd, const string& db, const string& charset = "utf8"): host_(host), port_(port), user_(user), passwd_(passwd), db_(db), charset_(charset), conn_(NULL) 27 | { 28 | setInitFlag_(init_()); 29 | } 30 | ~MysqlClient() 31 | { 32 | if(conn_) 33 | { 34 | mysql_close(conn_); 35 | } 36 | }; 37 | private: 38 | bool init_() 39 | { 40 | //cout<& vals) 80 | { 81 | size_t retn = 0; 82 | string sql; 83 | for(size_t i = 0; i < vals.size(); i ++) 84 | { 85 | sql.clear(); 86 | string_format(sql, "insert into %s (%s) values %s", tableName.c_str(), keys.c_str(), vals[i].c_str()); 87 | retn += executeSql(sql.c_str()); 88 | } 89 | return retn; 90 | } 91 | bool select(const string& sql, RowsType& rows) 92 | { 93 | if(!executeSql(sql)) 94 | { 95 | LogError("executeSql failed. [%s]", sql.c_str()); 96 | return false; 97 | } 98 | MYSQL_RES * result = mysql_store_result(conn_); 99 | if(!result) 100 | { 101 | LogError("mysql_store_result failed.[%d]", mysql_error(conn_)); 102 | return false; 103 | } 104 | size_t num_fields = mysql_num_fields(result); 105 | MYSQL_ROW row; 106 | while((row = mysql_fetch_row(result))) 107 | { 108 | vector vec; 109 | for(size_t i = 0; i < num_fields; i ++) 110 | { 111 | row[i] ? 
vec.push_back(row[i]) : vec.push_back("NULL"); 112 | } 113 | rows.push_back(vec); 114 | } 115 | mysql_free_result(result); 116 | return true; 117 | } 118 | 119 | private: 120 | MYSQL * conn_; 121 | 122 | }; 123 | } 124 | 125 | #endif 126 | -------------------------------------------------------------------------------- /src/CppJieba/MPSegment.hpp: -------------------------------------------------------------------------------- 1 | #ifndef CPPJIEBA_MPSEGMENT_H 2 | #define CPPJIEBA_MPSEGMENT_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include "Limonp/Logger.hpp" 8 | #include "DictTrie.hpp" 9 | #include "ISegment.hpp" 10 | #include "SegmentBase.hpp" 11 | 12 | namespace CppJieba 13 | { 14 | 15 | class MPSegment: public SegmentBase 16 | { 17 | private: 18 | DictTrie _dictTrie; 19 | 20 | public: 21 | MPSegment(){}; 22 | MPSegment(const string& dictPath, const string& userDictPath = "") 23 | { 24 | LIMONP_CHECK(init(dictPath, userDictPath)); 25 | }; 26 | virtual ~MPSegment(){}; 27 | public: 28 | bool init(const string& dictPath, const string& userDictPath = "") 29 | { 30 | LIMONP_CHECK(_dictTrie.init(dictPath, userDictPath)); 31 | LogInfo("MPSegment init(%s) ok", dictPath.c_str()); 32 | return true; 33 | } 34 | bool isUserDictSingleChineseWord(const Unicode::value_type & value) const 35 | { 36 | return _dictTrie.isUserDictSingleChineseWord(value); 37 | } 38 | public: 39 | using SegmentBase::cut; 40 | virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res)const 41 | { 42 | if(begin == end) 43 | { 44 | return false; 45 | } 46 | 47 | vector words; 48 | words.reserve(end - begin); 49 | if(!cut(begin, end, words)) 50 | { 51 | return false; 52 | } 53 | size_t offset = res.size(); 54 | res.resize(res.size() + words.size()); 55 | for(size_t i = 0; i < words.size(); i++) 56 | { 57 | if(!TransCode::encode(words[i], res[i + offset])) 58 | { 59 | LogError("encode failed."); 60 | res[i + offset].clear(); 61 | } 62 | } 63 | return true; 64 | } 65 
| 66 | bool cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector& res) const 67 | { 68 | if(end == begin) 69 | { 70 | return false; 71 | } 72 | vector segmentChars; 73 | 74 | _dictTrie.find(begin, end, segmentChars); 75 | 76 | _calcDP(segmentChars); 77 | 78 | _cut(segmentChars, res); 79 | 80 | return true; 81 | } 82 | const DictTrie* getDictTrie() const 83 | { 84 | return &_dictTrie; 85 | } 86 | 87 | private: 88 | void _calcDP(vector& segmentChars) const 89 | { 90 | size_t nextPos; 91 | const DictUnit* p; 92 | double val; 93 | 94 | for(ssize_t i = segmentChars.size() - 1; i >= 0; i--) 95 | { 96 | segmentChars[i].pInfo = NULL; 97 | segmentChars[i].weight = MIN_DOUBLE; 98 | assert(!segmentChars[i].dag.empty()); 99 | for(DagType::const_iterator it = segmentChars[i].dag.begin(); it != segmentChars[i].dag.end(); it++) 100 | { 101 | nextPos = it->first; 102 | p = it->second; 103 | val = 0.0; 104 | if(nextPos + 1 < segmentChars.size()) 105 | { 106 | val += segmentChars[nextPos + 1].weight; 107 | } 108 | 109 | if(p) 110 | { 111 | val += p->weight; 112 | } 113 | else 114 | { 115 | val += _dictTrie.getMinWeight(); 116 | } 117 | if(val > segmentChars[i].weight) 118 | { 119 | segmentChars[i].pInfo = p; 120 | segmentChars[i].weight = val; 121 | } 122 | } 123 | } 124 | } 125 | void _cut(const vector& segmentChars, vector& res) const 126 | { 127 | size_t i = 0; 128 | while(i < segmentChars.size()) 129 | { 130 | const DictUnit* p = segmentChars[i].pInfo; 131 | if(p) 132 | { 133 | res.push_back(p->word); 134 | i += p->word.size(); 135 | } 136 | else//single chinese word 137 | { 138 | res.push_back(Unicode(1, segmentChars[i].uniCh)); 139 | i++; 140 | } 141 | } 142 | } 143 | 144 | 145 | }; 146 | } 147 | 148 | #endif 149 | -------------------------------------------------------------------------------- /src/CppJieba/FullSegment.hpp: -------------------------------------------------------------------------------- 1 | #ifndef CPPJIEBA_FULLSEGMENT_H 2 | #define 
CPPJIEBA_FULLSEGMENT_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include "Limonp/Logger.hpp" 8 | #include "DictTrie.hpp" 9 | #include "ISegment.hpp" 10 | #include "SegmentBase.hpp" 11 | #include "TransCode.hpp" 12 | 13 | namespace CppJieba 14 | { 15 | class FullSegment: public SegmentBase 16 | { 17 | private: 18 | const DictTrie* _dictTrie; 19 | bool _isBorrowed; 20 | public: 21 | FullSegment() 22 | { 23 | _dictTrie = NULL; 24 | _isBorrowed = false; 25 | } 26 | explicit FullSegment(const string& dictPath) 27 | { 28 | _dictTrie = NULL; 29 | init(dictPath); 30 | } 31 | explicit FullSegment(const DictTrie* dictTrie) 32 | { 33 | _dictTrie = NULL; 34 | init(dictTrie); 35 | } 36 | virtual ~FullSegment() 37 | { 38 | if(_dictTrie && ! _isBorrowed) 39 | { 40 | delete _dictTrie; 41 | } 42 | 43 | }; 44 | public: 45 | bool init(const string& dictPath) 46 | { 47 | assert(_dictTrie == NULL); 48 | _dictTrie = new DictTrie(dictPath); 49 | _isBorrowed = false; 50 | return true; 51 | } 52 | bool init(const DictTrie* dictTrie) 53 | { 54 | assert(_dictTrie == NULL); 55 | assert(dictTrie); 56 | _dictTrie = dictTrie; 57 | _isBorrowed = true; 58 | return true; 59 | } 60 | 61 | public: 62 | using SegmentBase::cut; 63 | 64 | public: 65 | bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res) const 66 | { 67 | assert(_dictTrie); 68 | if (begin >= end) 69 | { 70 | LogError("begin >= end"); 71 | return false; 72 | } 73 | 74 | //resut of searching in trie tree 75 | DagType tRes; 76 | 77 | //max index of res's words 78 | int maxIdx = 0; 79 | 80 | // always equals to (uItr - begin) 81 | int uIdx = 0; 82 | 83 | //tmp variables 84 | int wordLen = 0; 85 | for (Unicode::const_iterator uItr = begin; uItr != end; uItr++) 86 | { 87 | //find word start from uItr 88 | if (_dictTrie->find(uItr, end, tRes, 0)) 89 | { 90 | for(DagType::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++) 91 | //for (vector >::const_iterator itr = tRes.begin(); itr != tRes.end(); 
itr++) 92 | { 93 | wordLen = itr->second->word.size(); 94 | if (wordLen >= 2 || (tRes.size() == 1 && maxIdx <= uIdx)) 95 | { 96 | res.push_back(itr->second->word); 97 | } 98 | maxIdx = uIdx+wordLen > maxIdx ? uIdx+wordLen : maxIdx; 99 | } 100 | tRes.clear(); 101 | } 102 | else // not found word start from uItr 103 | { 104 | if (maxIdx <= uIdx) // never exist in prev results 105 | { 106 | //put itr itself in res 107 | res.push_back(Unicode(1, *uItr)); 108 | 109 | //mark it exits 110 | ++maxIdx; 111 | } 112 | } 113 | ++uIdx; 114 | } 115 | 116 | return true; 117 | } 118 | 119 | bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res) const 120 | { 121 | assert(_dictTrie); 122 | if (begin >= end) 123 | { 124 | LogError("begin >= end"); 125 | return false; 126 | } 127 | 128 | vector uRes; 129 | if (!cut(begin, end, uRes)) 130 | { 131 | LogError("get unicode cut result error."); 132 | return false; 133 | } 134 | 135 | string tmp; 136 | for (vector::const_iterator uItr = uRes.begin(); uItr != uRes.end(); uItr++) 137 | { 138 | if (TransCode::encode(*uItr, tmp)) 139 | { 140 | res.push_back(tmp); 141 | } 142 | else 143 | { 144 | LogError("encode failed."); 145 | } 146 | } 147 | 148 | return true; 149 | } 150 | }; 151 | } 152 | 153 | #endif 154 | -------------------------------------------------------------------------------- /src/CppJieba/Limonp/LocalVector.hpp: -------------------------------------------------------------------------------- 1 | #ifndef LIMONP_LOCAL_VECTOR_HPP 2 | #define LIMONP_LOCAL_VECTOR_HPP 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | namespace Limonp 10 | { 11 | using namespace std; 12 | /* 13 | * LocalVector : T must be primitive type (char , int, size_t), if T is struct or class, LocalVector may be dangerous.. 14 | * LocalVector is simple and not well-tested. 
// Number of elements stored inline before LocalVector switches to the heap.
const size_t LOCAL_VECTOR_BUFFER_SIZE = 16;

/*
 * LocalVector<T>: a minimal vector that keeps up to
 * LOCAL_VECTOR_BUFFER_SIZE elements in an inline buffer and moves to
 * malloc'ed storage once it grows beyond that.
 *
 * Elements are copied with memcpy and never destroyed individually, so T
 * must be a primitive / trivially copyable type.
 */
template <class T>
class LocalVector
{
    public:
        typedef const T* const_iterator ;
        typedef T value_type;
        typedef size_t size_type;
    private:
        T buffer_[LOCAL_VECTOR_BUFFER_SIZE]; // inline storage
        T * ptr_;                            // buffer_ or a malloc'ed block
        size_t size_;                        // number of elements in use
        size_t capacity_;                    // number of elements ptr_ can hold
    public:
        LocalVector()
        {
            init_();
        };
        LocalVector(const LocalVector<T>& vec)
        {
            init_();
            *this = vec;
        }
        LocalVector(const_iterator begin, const_iterator end) // TODO: make it faster
        {
            init_();
            while(begin != end)
            {
                push_back(*begin++);
            }
        }
        // Construct with `size` copies of t.
        LocalVector(size_t size, const T& t) // TODO: make it faster
        {
            init_();
            while(size--)
            {
                push_back(t);
            }
        }
        ~LocalVector()
        {
            if(ptr_ != buffer_)
            {
                free(ptr_);
            }
        };
    public:
        LocalVector<T>& operator = (const LocalVector<T>& vec)
        {
            // Self-assignment must be a no-op: clear() below would otherwise
            // discard the very data we are about to copy.
            if(this == &vec)
            {
                return *this;
            }
            clear();
            size_ = vec.size();
            capacity_ = vec.capacity();
            if(vec.buffer_ == vec.ptr_)
            {
                memcpy(buffer_, vec.buffer_, sizeof(T) * size_);
                ptr_ = buffer_;
            }
            else
            {
                ptr_ = (T*) malloc(vec.capacity() * sizeof(T));
                assert(ptr_);
                memcpy(ptr_, vec.ptr_, vec.size() * sizeof(T));
            }
            return *this;
        }
    private:
        // Reset to the empty, inline-buffer state (does not free anything).
        void init_()
        {
            ptr_ = buffer_;
            size_ = 0;
            capacity_ = LOCAL_VECTOR_BUFFER_SIZE;
        }
    public:
        // Unchecked element access.
        T& operator [] (size_t i)
        {
            return ptr_[i];
        }
        const T& operator [] (size_t i) const
        {
            return ptr_[i];
        }
        void push_back(const T& t)
        {
            if(size_ == capacity_)
            {
                assert(capacity_);
                // t may refer to one of our own elements; reserve() can free
                // the storage it lives in, so take a copy first.
                const T value = t;
                reserve(capacity_ * 2);
                ptr_[size_ ++ ] = value;
                return;
            }
            ptr_[size_ ++ ] = t;
        }
        // Grow the capacity to at least `size` elements; never shrinks.
        void reserve(size_t size)
        {
            if(size <= capacity_)
            {
                return;
            }
            T * next = (T*)malloc(sizeof(T) * size);
            assert(next);
            T * old = ptr_;
            ptr_ = next;
            // Only the elements in use need to be carried over.
            memcpy(ptr_, old, sizeof(T) * size_);
            capacity_ = size;
            if(old != buffer_)
            {
                free(old);
            }
        }
        bool empty() const
        {
            return 0 == size();
        }
        size_t size() const
        {
            return size_;
        }
        size_t capacity() const
        {
            return capacity_;
        }
        const_iterator begin() const
        {
            return ptr_;
        }
        const_iterator end() const
        {
            return ptr_ + size_;
        }
        // Drop all elements and release heap storage, if any.
        void clear()
        {
            if(ptr_ != buffer_)
            {
                free(ptr_);
            }
            init_();
        }
};
keywords, size_t topN) const 55 | { 56 | vector words; 57 | if(!_segment.cut(str, words)) 58 | { 59 | LogError("segment cut(%s) failed.", str.c_str()); 60 | return false; 61 | } 62 | 63 | map wordmap; 64 | for(vector::iterator iter = words.begin(); iter != words.end(); iter++) 65 | { 66 | if(_isSingleWord(*iter)) 67 | { 68 | continue; 69 | } 70 | wordmap[*iter] += 1.0; 71 | } 72 | 73 | for(map::iterator itr = wordmap.begin(); itr != wordmap.end(); ) 74 | { 75 | if(_stopWords.end() != _stopWords.find(itr->first)) 76 | { 77 | wordmap.erase(itr++); 78 | continue; 79 | } 80 | 81 | unordered_map::const_iterator cit = _idfMap.find(itr->first); 82 | if(cit != _idfMap.end()) 83 | { 84 | itr->second *= cit->second; 85 | } 86 | else 87 | { 88 | itr->second *= _idfAverage; 89 | } 90 | itr ++; 91 | } 92 | 93 | keywords.clear(); 94 | std::copy(wordmap.begin(), wordmap.end(), std::inserter(keywords, keywords.begin())); 95 | topN = min(topN, keywords.size()); 96 | partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), _cmp); 97 | keywords.resize(topN); 98 | return true; 99 | } 100 | private: 101 | void _loadIdfDict(const string& idfPath) 102 | { 103 | ifstream ifs(idfPath.c_str()); 104 | if(!ifs) 105 | { 106 | LogError("open %s failed.", idfPath.c_str()); 107 | assert(false); 108 | } 109 | string line ; 110 | vector buf; 111 | double idf = 0.0; 112 | double idfSum = 0.0; 113 | size_t lineno = 0; 114 | for(;getline(ifs, line); lineno++) 115 | { 116 | buf.clear(); 117 | if(line.empty()) 118 | { 119 | LogError("line[%d] empty. skipped.", lineno); 120 | continue; 121 | } 122 | if(!split(line, buf, " ") || buf.size() != 2) 123 | { 124 | LogError("line %d [%s] illegal. 
skipped.", lineno, line.c_str()); 125 | continue; 126 | } 127 | idf = atof(buf[1].c_str()); 128 | _idfMap[buf[0]] = idf; 129 | idfSum += idf; 130 | 131 | } 132 | 133 | assert(lineno); 134 | _idfAverage = idfSum / lineno; 135 | assert(_idfAverage > 0.0); 136 | } 137 | void _loadStopWordDict(const string& filePath) 138 | { 139 | ifstream ifs(filePath.c_str()); 140 | if(!ifs) 141 | { 142 | LogError("open %s failed.", filePath.c_str()); 143 | assert(false); 144 | } 145 | string line ; 146 | while(getline(ifs, line)) 147 | { 148 | _stopWords.insert(line); 149 | } 150 | assert(_stopWords.size()); 151 | } 152 | private: 153 | bool _isSingleWord(const string& str) const 154 | { 155 | Unicode unicode; 156 | TransCode::decode(str, unicode); 157 | if(unicode.size() == 1) 158 | return true; 159 | return false; 160 | } 161 | 162 | private: 163 | static bool _cmp(const pair& lhs, const pair& rhs) 164 | { 165 | return lhs.second > rhs.second; 166 | } 167 | 168 | }; 169 | } 170 | 171 | #endif 172 | 173 | 174 | -------------------------------------------------------------------------------- /src/CppJieba/DictTrie.hpp: -------------------------------------------------------------------------------- 1 | #ifndef CPPJIEBA_DICT_TRIE_HPP 2 | #define CPPJIEBA_DICT_TRIE_HPP 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include "Limonp/StringUtil.hpp" 12 | #include "Limonp/Logger.hpp" 13 | #include "TransCode.hpp" 14 | #include "Trie.hpp" 15 | 16 | 17 | 18 | namespace CppJieba 19 | { 20 | using namespace Limonp; 21 | const double MIN_DOUBLE = -3.14e+100; 22 | const double MAX_DOUBLE = 3.14e+100; 23 | const size_t DICT_COLUMN_NUM = 3; 24 | const char* const UNKNOWN_TAG = ""; 25 | 26 | class DictTrie 27 | { 28 | private: 29 | vector _nodeInfos; 30 | Trie * _trie; 31 | 32 | double _minWeight; 33 | private: 34 | unordered_set _userDictSingleChineseWord; 35 | public: 36 | bool isUserDictSingleChineseWord(const Unicode::value_type& word) 
const 37 | { 38 | return isIn(_userDictSingleChineseWord, word); 39 | } 40 | public: 41 | double getMinWeight() const {return _minWeight;}; 42 | 43 | public: 44 | DictTrie() 45 | { 46 | _trie = NULL; 47 | _minWeight = MAX_DOUBLE; 48 | } 49 | DictTrie(const string& dictPath, const string& userDictPath = "") 50 | { 51 | new (this) DictTrie(); 52 | init(dictPath, userDictPath); 53 | } 54 | ~DictTrie() 55 | { 56 | if(_trie) 57 | { 58 | delete _trie; 59 | } 60 | } 61 | 62 | public: 63 | bool init(const string& dictPath, const string& userDictPath = "") 64 | { 65 | assert(!_trie); 66 | _loadDict(dictPath); 67 | _calculateWeight(_nodeInfos); 68 | _minWeight = _findMinWeight(_nodeInfos); 69 | 70 | if(userDictPath.size()) 71 | { 72 | double maxWeight = _findMaxWeight(_nodeInfos); 73 | _loadUserDict(userDictPath, maxWeight, UNKNOWN_TAG); 74 | } 75 | _shrink(_nodeInfos); 76 | _trie = _createTrie(_nodeInfos); 77 | assert(_trie); 78 | return true; 79 | } 80 | 81 | public: 82 | const DictUnit* find(Unicode::const_iterator begin, Unicode::const_iterator end) const 83 | { 84 | return _trie->find(begin, end); 85 | } 86 | bool find(Unicode::const_iterator begin, Unicode::const_iterator end, DagType& dag, size_t offset = 0) const 87 | { 88 | return _trie->find(begin, end, dag, offset); 89 | } 90 | void find( 91 | Unicode::const_iterator begin, 92 | Unicode::const_iterator end, 93 | vector& res 94 | ) const 95 | { 96 | _trie->find(begin, end, res); 97 | } 98 | 99 | 100 | private: 101 | Trie * _createTrie(const vector& dictUnits) 102 | { 103 | assert(dictUnits.size()); 104 | vector words; 105 | vector valuePointers; 106 | for(size_t i = 0 ; i < dictUnits.size(); i ++) 107 | { 108 | words.push_back(dictUnits[i].word); 109 | valuePointers.push_back(&dictUnits[i]); 110 | } 111 | 112 | Trie * trie = new Trie(words, valuePointers); 113 | return trie; 114 | } 115 | void _loadUserDict(const string& filePath, double defaultWeight, const string& defaultTag) 116 | { 117 | ifstream 
ifs(filePath.c_str()); 118 | assert(ifs); 119 | string line; 120 | DictUnit nodeInfo; 121 | vector buf; 122 | size_t lineno; 123 | for(lineno = 0; getline(ifs, line); lineno++) 124 | { 125 | buf.clear(); 126 | split(line, buf, " "); 127 | assert(buf.size() >= 1); 128 | if(!TransCode::decode(buf[0], nodeInfo.word)) 129 | { 130 | LogError("line[%u:%s] illegal.", lineno, line.c_str()); 131 | continue; 132 | } 133 | if(nodeInfo.word.size() == 1) 134 | { 135 | _userDictSingleChineseWord.insert(nodeInfo.word[0]); 136 | } 137 | nodeInfo.weight = defaultWeight; 138 | nodeInfo.tag = (buf.size() == 2 ? buf[1] : defaultTag); 139 | _nodeInfos.push_back(nodeInfo); 140 | } 141 | LogInfo("load userdict[%s] ok. lines[%u]", filePath.c_str(), lineno); 142 | } 143 | void _loadDict(const string& filePath) 144 | { 145 | ifstream ifs(filePath.c_str()); 146 | assert(ifs); 147 | string line; 148 | vector buf; 149 | 150 | DictUnit nodeInfo; 151 | for(size_t lineno = 0 ; getline(ifs, line); lineno++) 152 | { 153 | split(line, buf, " "); 154 | assert(buf.size() == DICT_COLUMN_NUM); 155 | 156 | if(!TransCode::decode(buf[0], nodeInfo.word)) 157 | { 158 | LogError("line[%u:%s] illegal.", lineno, line.c_str()); 159 | continue; 160 | } 161 | nodeInfo.weight = atof(buf[1].c_str()); 162 | nodeInfo.tag = buf[2]; 163 | 164 | _nodeInfos.push_back(nodeInfo); 165 | } 166 | } 167 | double _findMinWeight(const vector& nodeInfos) const 168 | { 169 | double ret = MAX_DOUBLE; 170 | for(size_t i = 0; i < nodeInfos.size(); i++) 171 | { 172 | ret = min(nodeInfos[i].weight, ret); 173 | } 174 | return ret; 175 | } 176 | double _findMaxWeight(const vector& nodeInfos) const 177 | { 178 | double ret = MIN_DOUBLE; 179 | for(size_t i = 0; i < nodeInfos.size(); i++) 180 | { 181 | ret = max(nodeInfos[i].weight, ret); 182 | } 183 | return ret; 184 | } 185 | 186 | void _calculateWeight(vector& nodeInfos) const 187 | { 188 | double sum = 0.0; 189 | for(size_t i = 0; i < nodeInfos.size(); i++) 190 | { 191 | sum += 
nodeInfos[i].weight; 192 | } 193 | assert(sum); 194 | for(size_t i = 0; i < nodeInfos.size(); i++) 195 | { 196 | DictUnit& nodeInfo = nodeInfos[i]; 197 | assert(nodeInfo.weight); 198 | nodeInfo.weight = log(double(nodeInfo.weight)/double(sum)); 199 | } 200 | } 201 | 202 | void _shrink(vector& units) const 203 | { 204 | vector(units.begin(), units.end()).swap(units); 205 | } 206 | 207 | 208 | }; 209 | } 210 | 211 | #endif 212 | -------------------------------------------------------------------------------- /src/CppJieba/Limonp/StringUtil.hpp: -------------------------------------------------------------------------------- 1 | /************************************ 2 | * file enc : ascii 3 | * author : wuyanyi09@gmail.com 4 | ************************************/ 5 | #ifndef LIMONP_STR_FUNCTS_H 6 | #define LIMONP_STR_FUNCTS_H 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include "StdExtension.hpp" 25 | 26 | namespace Limonp 27 | { 28 | using namespace std; 29 | inline string string_format(const char* fmt, ...) 
/*
 * Split src at every character contained in `pattern` (each character of
 * pattern is a separator on its own, as with find_first_of).
 *
 * res     receives the tokens; it is always cleared first, also when src
 *         is empty, so a caller reusing one vector never sees stale tokens.
 * offset  number of leading tokens to drop.
 * len     maximum number of tokens to store.
 *
 * Adjacent separators produce empty tokens; a single trailing separator
 * does not. Returns false only when src is empty.
 */
inline bool split(const std::string& src, std::vector<std::string>& res, const std::string& pattern, size_t offset = 0, size_t len = std::string::npos)
{
    res.clear();
    if(src.empty())
    {
        return false;
    }

    size_t start = 0;
    size_t end = 0;
    size_t cnt = 0; // index of the token currently being read
    while(start < src.size() && res.size() < len)
    {
        end = src.find_first_of(pattern, start);
        if(std::string::npos == end)
        {
            // Last token: everything up to the end of src.
            if(cnt >= offset)
            {
                res.push_back(src.substr(start));
            }
            return true;
        }
        if(cnt >= offset)
        {
            res.push_back(src.substr(start, end - start));
        }
        cnt ++;
        start = end + 1;
    }
    return true;
}
/*
 * Remove trailing whitespace (as classified by std::isspace) from s in
 * place and return s.
 *
 * Each byte is converted to unsigned char before classification: passing a
 * negative char value (any byte >= 0x80 where char is signed, i.e. every
 * byte of a multi-byte UTF-8 sequence) to std::isspace is undefined.
 */
inline std::string &rtrim(std::string &s)
{
    std::string::size_type keep = s.size();
    while(keep > 0 && std::isspace(static_cast<unsigned char>(s[keep - 1])))
    {
        --keep;
    }
    s.erase(keep);
    return s;
}
/*
 * Encode the 16-bit code units in [begin, end) as UTF-8 into res.
 *
 * res is cleared first on every call, including the failure path, so a
 * reused output string never keeps the result of an earlier call.
 * Values up to 0x7f produce one byte, up to 0x7ff two bytes, everything
 * else three bytes.
 *
 * Returns false when the range is empty (begin >= end).
 */
template <class Uint16ContainerConIter>
bool unicodeToUtf8(Uint16ContainerConIter begin, Uint16ContainerConIter end, std::string& res)
{
    res.clear();
    if(begin >= end)
    {
        return false;
    }
    uint16_t ui;
    while(begin != end)
    {
        ui = *begin;
        if(ui <= 0x7f)
        {
            res += char(ui);
        }
        else if(ui <= 0x7ff)
        {
            res += char(((ui>>6) & 0x1f) | 0xc0);
            res += char((ui & 0x3f) | 0x80);
        }
        else
        {
            res += char(((ui >> 12) & 0x0f )| 0xe0);
            res += char(((ui>>6) & 0x3f )| 0x80 );
            res += char((ui & 0x3f) | 0x80);
        }
        begin ++;
    }
    return true;
}
/*
 * Format the current local time with a strftime format string.
 * format example: "%Y-%m-%d %H:%M:%S"
 *
 * timeStr receives the formatted text. It is left empty when the result
 * (plus terminator) does not fit into 64 bytes or when the local time
 * cannot be obtained.
 *
 * The text is formatted into a local buffer instead of writing through
 * the pointer returned by timeStr.c_str().
 * NOTE(review): localtime() returns a pointer to shared static storage;
 * confirm whether callers can reach this from more than one thread.
 */
inline void getTime(const std::string& format, std::string& timeStr)
{
    time_t timeNow;
    time(&timeNow);
    char buf[64];
    const struct tm* local = localtime(&timeNow);
    size_t len = 0;
    if(local)
    {
        len = strftime(buf, sizeof(buf), format.c_str(), local);
    }
    timeStr.assign(buf, len);
}
unit.tag.c_str(), unit.weight); 25 | } 26 | 27 | typedef LocalVector > DagType; 28 | 29 | struct SegmentChar 30 | { 31 | uint16_t uniCh; 32 | DagType dag; 33 | const DictUnit * pInfo; 34 | double weight; 35 | size_t nextPos; 36 | SegmentChar():uniCh(0), pInfo(NULL), weight(0.0), nextPos(0) 37 | {} 38 | ~SegmentChar() 39 | {} 40 | }; 41 | 42 | typedef Unicode::value_type TrieKey; 43 | 44 | class TrieNode 45 | { 46 | public: 47 | typedef unordered_map NextMap; 48 | public: 49 | TrieNode * fail; 50 | NextMap * next; 51 | const DictUnit * ptValue; 52 | public: 53 | TrieNode(): fail(NULL), next(NULL), ptValue(NULL) 54 | {} 55 | const TrieNode * findNext(TrieKey key) const 56 | { 57 | if(next == NULL) 58 | { 59 | return NULL; 60 | } 61 | NextMap::const_iterator iter = next->find(key); 62 | if(iter == next->end()) 63 | { 64 | return NULL; 65 | } 66 | return iter->second; 67 | } 68 | }; 69 | 70 | class Trie 71 | { 72 | private: 73 | TrieNode* _root; 74 | public: 75 | Trie(const vector& keys, const vector & valuePointers) 76 | { 77 | _root = new TrieNode; 78 | _createTrie(keys, valuePointers); 79 | _build();// build automation 80 | } 81 | ~Trie() 82 | { 83 | if(_root) 84 | { 85 | _deleteNode(_root); 86 | } 87 | } 88 | public: 89 | const DictUnit* find(Unicode::const_iterator begin, Unicode::const_iterator end) const 90 | { 91 | TrieNode::NextMap::const_iterator citer; 92 | const TrieNode* ptNode = _root; 93 | for(Unicode::const_iterator it = begin; it != end; it++) 94 | {// build automation 95 | assert(ptNode); 96 | if(NULL == ptNode->next || ptNode->next->end() == (citer = ptNode->next->find(*it))) 97 | { 98 | return NULL; 99 | } 100 | ptNode = citer->second; 101 | } 102 | return ptNode->ptValue; 103 | } 104 | // aho-corasick-automation 105 | void find( 106 | Unicode::const_iterator begin, 107 | Unicode::const_iterator end, 108 | vector& res 109 | ) const 110 | { 111 | res.resize(end - begin); 112 | const TrieNode * now = _root; 113 | const TrieNode* node; 114 | // 
compiler will complain warnings if only "i < end - begin" . 115 | for (size_t i = 0; i < size_t(end - begin); i++) 116 | { 117 | Unicode::value_type ch = *(begin + i); 118 | res[i].uniCh = ch; 119 | assert(res[i].dag.empty()); 120 | res[i].dag.push_back(pair::size_type, const DictUnit* >(i, NULL)); 121 | bool flag = false; 122 | 123 | // rollback 124 | while( now != _root ) 125 | { 126 | node = now->findNext(ch); 127 | if (node != NULL) 128 | { 129 | flag = true; 130 | break; 131 | } 132 | else 133 | { 134 | now = now->fail; 135 | } 136 | } 137 | 138 | if(!flag) 139 | { 140 | node = now->findNext(ch); 141 | } 142 | if(node == NULL) 143 | { 144 | now = _root; 145 | } 146 | else 147 | { 148 | now = node; 149 | const TrieNode * temp = now; 150 | while(temp != _root) 151 | { 152 | if (temp->ptValue) 153 | { 154 | size_t pos = i - temp->ptValue->word.size() + 1; 155 | res[pos].dag.push_back(pair::size_type, const DictUnit* >(i, temp->ptValue)); 156 | if(pos == i) 157 | { 158 | res[pos].dag[0].second = temp->ptValue; 159 | } 160 | } 161 | temp = temp->fail; 162 | assert(temp); 163 | } 164 | } 165 | } 166 | } 167 | bool find( 168 | Unicode::const_iterator begin, 169 | Unicode::const_iterator end, 170 | DagType & res, 171 | size_t offset = 0) const 172 | { 173 | const TrieNode * ptNode = _root; 174 | TrieNode::NextMap::const_iterator citer; 175 | for(Unicode::const_iterator itr = begin; itr != end ; itr++) 176 | { 177 | assert(ptNode); 178 | if(NULL == ptNode->next || ptNode->next->end() == (citer = ptNode->next->find(*itr))) 179 | { 180 | break; 181 | } 182 | ptNode = citer->second; 183 | if(ptNode->ptValue) 184 | { 185 | if(itr == begin && res.size() == 1) // first singleword 186 | { 187 | res[0].second = ptNode->ptValue; 188 | } 189 | else 190 | { 191 | res.push_back(pair::size_type, const DictUnit* >(itr - begin + offset, ptNode->ptValue)); 192 | } 193 | } 194 | } 195 | return !res.empty(); 196 | } 197 | private: 198 | void _build() 199 | { 200 | queue que; 201 | 
assert(_root->ptValue == NULL); 202 | assert(_root->next); 203 | _root->fail = NULL; 204 | for(TrieNode::NextMap::iterator iter = _root->next->begin(); iter != _root->next->end(); iter++) { 205 | iter->second->fail = _root; 206 | que.push(iter->second); 207 | } 208 | TrieNode* back = NULL; 209 | TrieNode::NextMap::iterator backiter; 210 | while(!que.empty()) { 211 | TrieNode * now = que.front(); 212 | que.pop(); 213 | if(now->next == NULL) { 214 | continue; 215 | } 216 | for(TrieNode::NextMap::iterator iter = now->next->begin(); iter != now->next->end(); iter++) { 217 | back = now->fail; 218 | while(back != NULL) { 219 | if(back->next && (backiter = back->next->find(iter->first)) != back->next->end()) 220 | { 221 | iter->second->fail = backiter->second; 222 | break; 223 | } 224 | back = back->fail; 225 | } 226 | if(back == NULL) { 227 | iter->second->fail = _root; 228 | } 229 | que.push(iter->second); 230 | } 231 | } 232 | } 233 | private: 234 | void _createTrie(const vector& keys, const vector & valuePointers) 235 | { 236 | if(valuePointers.empty() || keys.empty()) 237 | { 238 | return; 239 | } 240 | assert(keys.size() == valuePointers.size()); 241 | 242 | for(size_t i = 0; i < keys.size(); i++) 243 | { 244 | _insertNode(keys[i], valuePointers[i]); 245 | } 246 | } 247 | private: 248 | void _insertNode(const Unicode& key, const DictUnit* ptValue) 249 | { 250 | TrieNode* ptNode = _root; 251 | 252 | TrieNode::NextMap::const_iterator kmIter; 253 | 254 | for(Unicode::const_iterator citer = key.begin(); citer != key.end(); citer++) 255 | { 256 | if(NULL == ptNode->next) 257 | { 258 | ptNode->next = new TrieNode::NextMap; 259 | } 260 | kmIter = ptNode->next->find(*citer); 261 | if(ptNode->next->end() == kmIter) 262 | { 263 | TrieNode * nextNode = new TrieNode; 264 | nextNode->next = NULL; 265 | nextNode->ptValue = NULL; 266 | 267 | (*ptNode->next)[*citer] = nextNode; 268 | ptNode = nextNode; 269 | } 270 | else 271 | { 272 | ptNode = kmIter->second; 273 | } 274 | } 275 | 
ptNode->ptValue = ptValue; 276 | } 277 | void _deleteNode(TrieNode* node) 278 | { 279 | if(!node) 280 | { 281 | return; 282 | } 283 | if(node->next) 284 | { 285 | TrieNode::NextMap::iterator it; 286 | for(it = node->next->begin(); it != node->next->end(); it++) 287 | { 288 | _deleteNode(it->second); 289 | } 290 | delete node->next; 291 | } 292 | delete node; 293 | } 294 | }; 295 | } 296 | 297 | #endif 298 | -------------------------------------------------------------------------------- /src/CppJieba/HMMSegment.hpp: -------------------------------------------------------------------------------- 1 | #ifndef CPPJIBEA_HMMSEGMENT_H 2 | #define CPPJIBEA_HMMSEGMENT_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "Limonp/StringUtil.hpp" 9 | #include "Limonp/Logger.hpp" 10 | #include "TransCode.hpp" 11 | #include "ISegment.hpp" 12 | #include "SegmentBase.hpp" 13 | #include "DictTrie.hpp" 14 | 15 | namespace CppJieba 16 | { 17 | using namespace Limonp; 18 | typedef unordered_map EmitProbMap; 19 | class HMMSegment: public SegmentBase 20 | { 21 | public: 22 | /* 23 | * STATUS: 24 | * 0:B, 1:E, 2:M, 3:S 25 | * */ 26 | enum {B = 0, E = 1, M = 2, S = 3, STATUS_SUM = 4}; 27 | private: 28 | char _statMap[STATUS_SUM]; 29 | double _startProb[STATUS_SUM]; 30 | double _transProb[STATUS_SUM][STATUS_SUM]; 31 | EmitProbMap _emitProbB; 32 | EmitProbMap _emitProbE; 33 | EmitProbMap _emitProbM; 34 | EmitProbMap _emitProbS; 35 | vector _emitProbVec; 36 | 37 | public: 38 | HMMSegment(){} 39 | explicit HMMSegment(const string& filePath) 40 | { 41 | LIMONP_CHECK(init(filePath)); 42 | } 43 | virtual ~HMMSegment(){} 44 | public: 45 | bool init(const string& filePath) 46 | { 47 | memset(_startProb, 0, sizeof(_startProb)); 48 | memset(_transProb, 0, sizeof(_transProb)); 49 | _statMap[0] = 'B'; 50 | _statMap[1] = 'E'; 51 | _statMap[2] = 'M'; 52 | _statMap[3] = 'S'; 53 | _emitProbVec.push_back(&_emitProbB); 54 | _emitProbVec.push_back(&_emitProbE); 55 | 
_emitProbVec.push_back(&_emitProbM); 56 | _emitProbVec.push_back(&_emitProbS); 57 | LIMONP_CHECK(_loadModel(filePath.c_str())); 58 | LogInfo("HMMSegment init(%s) ok.", filePath.c_str()); 59 | return true; 60 | } 61 | public: 62 | using SegmentBase::cut; 63 | public: 64 | bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res)const 65 | { 66 | Unicode::const_iterator left = begin; 67 | Unicode::const_iterator right = begin; 68 | while(right != end) 69 | { 70 | if(*right < 0x80) 71 | { 72 | if(left != right && !_cut(left, right, res)) 73 | { 74 | return false; 75 | } 76 | left = right; 77 | do { 78 | right = _sequentialLetterRule(left, end); 79 | if(right != left) 80 | { 81 | break; 82 | } 83 | right = _numbersRule(left, end); 84 | if(right != left) 85 | { 86 | break; 87 | } 88 | right ++; 89 | } while(false); 90 | res.push_back(Unicode(left, right)); 91 | left = right; 92 | } 93 | else 94 | { 95 | right++; 96 | } 97 | } 98 | if(left != right && !_cut(left, right, res)) 99 | { 100 | return false; 101 | } 102 | return true; 103 | } 104 | private: 105 | // sequential letters rule 106 | Unicode::const_iterator _sequentialLetterRule(Unicode::const_iterator begin, Unicode::const_iterator end) const 107 | { 108 | Unicode::value_type x; 109 | while(begin != end) 110 | { 111 | x = *begin; 112 | if(('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z')) 113 | { 114 | begin ++; 115 | } 116 | else 117 | { 118 | break; 119 | } 120 | } 121 | return begin; 122 | } 123 | // 124 | Unicode::const_iterator _numbersRule(Unicode::const_iterator begin, Unicode::const_iterator end) const 125 | { 126 | Unicode::value_type x = *begin; 127 | if('0' <= x && x <= '9') 128 | { 129 | begin ++; 130 | } 131 | else 132 | { 133 | return begin; 134 | } 135 | while(begin != end) 136 | { 137 | x = *begin; 138 | if( ('0' <= x && x <= '9') || x == '.') 139 | { 140 | begin++; 141 | } 142 | else 143 | { 144 | break; 145 | } 146 | } 147 | return begin; 148 | } 149 | bool 
_cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res) const 150 | { 151 | vector status; 152 | if(!_viterbi(begin, end, status)) 153 | { 154 | LogError("_viterbi failed."); 155 | return false; 156 | } 157 | 158 | Unicode::const_iterator left = begin; 159 | Unicode::const_iterator right; 160 | for(size_t i = 0; i < status.size(); i++) 161 | { 162 | if(status[i] % 2) //if(E == status[i] || S == status[i]) 163 | { 164 | right = begin + i + 1; 165 | res.push_back(Unicode(left, right)); 166 | left = right; 167 | } 168 | } 169 | return true; 170 | } 171 | public: 172 | virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res)const 173 | { 174 | if(begin == end) 175 | { 176 | return false; 177 | } 178 | vector words; 179 | words.reserve(end - begin); 180 | if(!cut(begin, end, words)) 181 | { 182 | return false; 183 | } 184 | size_t offset = res.size(); 185 | res.resize(res.size() + words.size()); 186 | for(size_t i = 0; i < words.size(); i++) 187 | { 188 | if(!TransCode::encode(words[i], res[offset + i])) 189 | { 190 | LogError("encode failed."); 191 | } 192 | } 193 | return true; 194 | } 195 | 196 | private: 197 | bool _viterbi(Unicode::const_iterator begin, Unicode::const_iterator end, vector& status)const 198 | { 199 | if(begin == end) 200 | { 201 | return false; 202 | } 203 | 204 | size_t Y = STATUS_SUM; 205 | size_t X = end - begin; 206 | 207 | size_t XYSize = X * Y; 208 | size_t now, old, stat; 209 | double tmp, endE, endS; 210 | 211 | vector path(XYSize); 212 | vector weight(XYSize); 213 | 214 | //start 215 | for(size_t y = 0; y < Y; y++) 216 | { 217 | weight[0 + y * X] = _startProb[y] + _getEmitProb(_emitProbVec[y], *begin, MIN_DOUBLE); 218 | path[0 + y * X] = -1; 219 | } 220 | 221 | 222 | double emitProb; 223 | 224 | for(size_t x = 1; x < X; x++) 225 | { 226 | for(size_t y = 0; y < Y; y++) 227 | { 228 | now = x + y*X; 229 | weight[now] = MIN_DOUBLE; 230 | path[now] = E; // warning 231 | emitProb = 
_getEmitProb(_emitProbVec[y], *(begin+x), MIN_DOUBLE); 232 | for(size_t preY = 0; preY < Y; preY++) 233 | { 234 | old = x - 1 + preY * X; 235 | tmp = weight[old] + _transProb[preY][y] + emitProb; 236 | if(tmp > weight[now]) 237 | { 238 | weight[now] = tmp; 239 | path[now] = preY; 240 | } 241 | } 242 | } 243 | } 244 | 245 | endE = weight[X-1+E*X]; 246 | endS = weight[X-1+S*X]; 247 | stat = 0; 248 | if(endE >= endS) 249 | { 250 | stat = E; 251 | } 252 | else 253 | { 254 | stat = S; 255 | } 256 | 257 | status.resize(X); 258 | for(int x = X -1 ; x >= 0; x--) 259 | { 260 | status[x] = stat; 261 | stat = path[x + stat*X]; 262 | } 263 | 264 | return true; 265 | } 266 | bool _loadModel(const char* const filePath) 267 | { 268 | LogDebug("loadModel [%s] start ...", filePath); 269 | ifstream ifile(filePath); 270 | string line; 271 | vector tmp; 272 | vector tmp2; 273 | //load _startProb 274 | if(!_getLine(ifile, line)) 275 | { 276 | return false; 277 | } 278 | split(line, tmp, " "); 279 | if(tmp.size() != STATUS_SUM) 280 | { 281 | LogError("start_p illegal"); 282 | return false; 283 | } 284 | for(size_t j = 0; j< tmp.size(); j++) 285 | { 286 | _startProb[j] = atof(tmp[j].c_str()); 287 | } 288 | 289 | //load _transProb 290 | for(size_t i = 0; i < STATUS_SUM; i++) 291 | { 292 | if(!_getLine(ifile, line)) 293 | { 294 | return false; 295 | } 296 | split(line, tmp, " "); 297 | if(tmp.size() != STATUS_SUM) 298 | { 299 | LogError("trans_p illegal"); 300 | return false; 301 | } 302 | for(size_t j =0; j < STATUS_SUM; j++) 303 | { 304 | _transProb[i][j] = atof(tmp[j].c_str()); 305 | } 306 | } 307 | 308 | //load _emitProbB 309 | if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbB)) 310 | { 311 | return false; 312 | } 313 | 314 | //load _emitProbE 315 | if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbE)) 316 | { 317 | return false; 318 | } 319 | 320 | //load _emitProbM 321 | if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbM)) 322 | { 323 | return false; 324 
| } 325 | 326 | //load _emitProbS 327 | if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbS)) 328 | { 329 | return false; 330 | } 331 | 332 | LogDebug("loadModel [%s] end.", filePath); 333 | 334 | return true; 335 | } 336 | bool _getLine(ifstream& ifile, string& line) 337 | { 338 | while(getline(ifile, line)) 339 | { 340 | trim(line); 341 | if(line.empty()) 342 | { 343 | continue; 344 | } 345 | if(startsWith(line, "#")) 346 | { 347 | continue; 348 | } 349 | return true; 350 | } 351 | return false; 352 | } 353 | bool _loadEmitProb(const string& line, EmitProbMap& mp) 354 | { 355 | if(line.empty()) 356 | { 357 | return false; 358 | } 359 | vector tmp, tmp2; 360 | Unicode unicode; 361 | split(line, tmp, ","); 362 | for(size_t i = 0; i < tmp.size(); i++) 363 | { 364 | split(tmp[i], tmp2, ":"); 365 | if(2 != tmp2.size()) 366 | { 367 | LogError("_emitProb illegal."); 368 | return false; 369 | } 370 | if(!TransCode::decode(tmp2[0], unicode) || unicode.size() != 1) 371 | { 372 | LogError("TransCode failed."); 373 | return false; 374 | } 375 | mp[unicode[0]] = atof(tmp2[1].c_str()); 376 | } 377 | return true; 378 | } 379 | double _getEmitProb(const EmitProbMap* ptMp, uint16_t key, double defVal)const 380 | { 381 | EmitProbMap::const_iterator cit = ptMp->find(key); 382 | if(cit == ptMp->end()) 383 | { 384 | return defVal; 385 | } 386 | return cit->second; 387 | 388 | } 389 | 390 | 391 | }; 392 | } 393 | 394 | #endif 395 | -------------------------------------------------------------------------------- /src/CppJieba/Limonp/Md5.hpp: -------------------------------------------------------------------------------- 1 | #ifndef __MD5_H__ 2 | #define __MD5_H__ 3 | 4 | // Copyright (C) 1991-2, RSA Data Security, Inc. Created 1991. All 5 | // rights reserved. 6 | 7 | // License to copy and use this software is granted provided that it 8 | // is identified as the "RSA Data Security, Inc. 
MD5 Message-Digest 9 | // Algorithm" in all material mentioning or referencing this software 10 | // or this function. 11 | // 12 | // License is also granted to make and use derivative works provided 13 | // that such works are identified as "derived from the RSA Data 14 | // Security, Inc. MD5 Message-Digest Algorithm" in all material 15 | // mentioning or referencing the derived work. 16 | // 17 | // RSA Data Security, Inc. makes no representations concerning either 18 | // the merchantability of this software or the suitability of this 19 | // software for any particular purpose. It is provided "as is" 20 | // without express or implied warranty of any kind. 21 | // 22 | // These notices must be retained in any copies of any part of this 23 | // documentation and/or software. 24 | 25 | 26 | 27 | // The original md5 implementation avoids external libraries. 28 | // This version has dependency on stdio.h for file input and 29 | // string.h for memcpy. 30 | #include 31 | #include 32 | #include 33 | 34 | namespace Limonp 35 | { 36 | 37 | //#pragma region MD5 defines 38 | // Constants for MD5Transform routine. 39 | #define S11 7 40 | #define S12 12 41 | #define S13 17 42 | #define S14 22 43 | #define S21 5 44 | #define S22 9 45 | #define S23 14 46 | #define S24 20 47 | #define S31 4 48 | #define S32 11 49 | #define S33 16 50 | #define S34 23 51 | #define S41 6 52 | #define S42 10 53 | #define S43 15 54 | #define S44 21 55 | 56 | 57 | // F, G, H and I are basic MD5 functions. 58 | #define F(x, y, z) (((x) & (y)) | ((~x) & (z))) 59 | #define G(x, y, z) (((x) & (z)) | ((y) & (~z))) 60 | #define H(x, y, z) ((x) ^ (y) ^ (z)) 61 | #define I(x, y, z) ((y) ^ ((x) | (~z))) 62 | 63 | // ROTATE_LEFT rotates x left n bits. 64 | #define ROTATE_LEFT(x, n) (((x) << (n)) | ((x) >> (32-(n)))) 65 | 66 | // FF, GG, HH, and II transformations for rounds 1, 2, 3, and 4. 67 | // Rotation is separate from addition to prevent recomputation. 
68 | #define FF(a, b, c, d, x, s, ac) { \ 69 | (a) += F ((b), (c), (d)) + (x) + (UINT4)(ac); \ 70 | (a) = ROTATE_LEFT ((a), (s)); \ 71 | (a) += (b); \ 72 | } 73 | #define GG(a, b, c, d, x, s, ac) { \ 74 | (a) += G ((b), (c), (d)) + (x) + (UINT4)(ac); \ 75 | (a) = ROTATE_LEFT ((a), (s)); \ 76 | (a) += (b); \ 77 | } 78 | #define HH(a, b, c, d, x, s, ac) { \ 79 | (a) += H ((b), (c), (d)) + (x) + (UINT4)(ac); \ 80 | (a) = ROTATE_LEFT ((a), (s)); \ 81 | (a) += (b); \ 82 | } 83 | #define II(a, b, c, d, x, s, ac) { \ 84 | (a) += I ((b), (c), (d)) + (x) + (UINT4)(ac); \ 85 | (a) = ROTATE_LEFT ((a), (s)); \ 86 | (a) += (b); \ 87 | } 88 | //#pragma endregion 89 | 90 | 91 | typedef unsigned char BYTE ; 92 | 93 | // POINTER defines a generic pointer type 94 | typedef unsigned char *POINTER; 95 | 96 | // UINT2 defines a two byte word 97 | typedef unsigned short int UINT2; 98 | 99 | // UINT4 defines a four byte word 100 | typedef unsigned int UINT4; 101 | 102 | static unsigned char PADDING[64] = { 103 | 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 104 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 105 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 106 | }; 107 | // convenient object that wraps 108 | // the C-functions for use in C++ only 109 | class MD5 110 | { 111 | private: 112 | struct __context_t { 113 | UINT4 state[4]; /* state (ABCD) */ 114 | UINT4 count[2]; /* number of bits, modulo 2^64 (lsb first) */ 115 | unsigned char buffer[64]; /* input buffer */ 116 | } context ; 117 | 118 | //#pragma region static helper functions 119 | // The core of the MD5 algorithm is here. 120 | // MD5 basic transformation. Transforms state based on block. 
121 | static void MD5Transform( UINT4 state[4], unsigned char block[64] ) 122 | { 123 | UINT4 a = state[0], b = state[1], c = state[2], d = state[3], x[16]; 124 | 125 | Decode (x, block, 64); 126 | 127 | /* Round 1 */ 128 | FF (a, b, c, d, x[ 0], S11, 0xd76aa478); /* 1 */ 129 | FF (d, a, b, c, x[ 1], S12, 0xe8c7b756); /* 2 */ 130 | FF (c, d, a, b, x[ 2], S13, 0x242070db); /* 3 */ 131 | FF (b, c, d, a, x[ 3], S14, 0xc1bdceee); /* 4 */ 132 | FF (a, b, c, d, x[ 4], S11, 0xf57c0faf); /* 5 */ 133 | FF (d, a, b, c, x[ 5], S12, 0x4787c62a); /* 6 */ 134 | FF (c, d, a, b, x[ 6], S13, 0xa8304613); /* 7 */ 135 | FF (b, c, d, a, x[ 7], S14, 0xfd469501); /* 8 */ 136 | FF (a, b, c, d, x[ 8], S11, 0x698098d8); /* 9 */ 137 | FF (d, a, b, c, x[ 9], S12, 0x8b44f7af); /* 10 */ 138 | FF (c, d, a, b, x[10], S13, 0xffff5bb1); /* 11 */ 139 | FF (b, c, d, a, x[11], S14, 0x895cd7be); /* 12 */ 140 | FF (a, b, c, d, x[12], S11, 0x6b901122); /* 13 */ 141 | FF (d, a, b, c, x[13], S12, 0xfd987193); /* 14 */ 142 | FF (c, d, a, b, x[14], S13, 0xa679438e); /* 15 */ 143 | FF (b, c, d, a, x[15], S14, 0x49b40821); /* 16 */ 144 | 145 | /* Round 2 */ 146 | GG (a, b, c, d, x[ 1], S21, 0xf61e2562); /* 17 */ 147 | GG (d, a, b, c, x[ 6], S22, 0xc040b340); /* 18 */ 148 | GG (c, d, a, b, x[11], S23, 0x265e5a51); /* 19 */ 149 | GG (b, c, d, a, x[ 0], S24, 0xe9b6c7aa); /* 20 */ 150 | GG (a, b, c, d, x[ 5], S21, 0xd62f105d); /* 21 */ 151 | GG (d, a, b, c, x[10], S22, 0x2441453); /* 22 */ 152 | GG (c, d, a, b, x[15], S23, 0xd8a1e681); /* 23 */ 153 | GG (b, c, d, a, x[ 4], S24, 0xe7d3fbc8); /* 24 */ 154 | GG (a, b, c, d, x[ 9], S21, 0x21e1cde6); /* 25 */ 155 | GG (d, a, b, c, x[14], S22, 0xc33707d6); /* 26 */ 156 | GG (c, d, a, b, x[ 3], S23, 0xf4d50d87); /* 27 */ 157 | GG (b, c, d, a, x[ 8], S24, 0x455a14ed); /* 28 */ 158 | GG (a, b, c, d, x[13], S21, 0xa9e3e905); /* 29 */ 159 | GG (d, a, b, c, x[ 2], S22, 0xfcefa3f8); /* 30 */ 160 | GG (c, d, a, b, x[ 7], S23, 0x676f02d9); /* 31 */ 161 | GG (b, c, d, a, x[12], 
S24, 0x8d2a4c8a); /* 32 */ 162 | 163 | /* Round 3 */ 164 | HH (a, b, c, d, x[ 5], S31, 0xfffa3942); /* 33 */ 165 | HH (d, a, b, c, x[ 8], S32, 0x8771f681); /* 34 */ 166 | HH (c, d, a, b, x[11], S33, 0x6d9d6122); /* 35 */ 167 | HH (b, c, d, a, x[14], S34, 0xfde5380c); /* 36 */ 168 | HH (a, b, c, d, x[ 1], S31, 0xa4beea44); /* 37 */ 169 | HH (d, a, b, c, x[ 4], S32, 0x4bdecfa9); /* 38 */ 170 | HH (c, d, a, b, x[ 7], S33, 0xf6bb4b60); /* 39 */ 171 | HH (b, c, d, a, x[10], S34, 0xbebfbc70); /* 40 */ 172 | HH (a, b, c, d, x[13], S31, 0x289b7ec6); /* 41 */ 173 | HH (d, a, b, c, x[ 0], S32, 0xeaa127fa); /* 42 */ 174 | HH (c, d, a, b, x[ 3], S33, 0xd4ef3085); /* 43 */ 175 | HH (b, c, d, a, x[ 6], S34, 0x4881d05); /* 44 */ 176 | HH (a, b, c, d, x[ 9], S31, 0xd9d4d039); /* 45 */ 177 | HH (d, a, b, c, x[12], S32, 0xe6db99e5); /* 46 */ 178 | HH (c, d, a, b, x[15], S33, 0x1fa27cf8); /* 47 */ 179 | HH (b, c, d, a, x[ 2], S34, 0xc4ac5665); /* 48 */ 180 | 181 | /* Round 4 */ 182 | II (a, b, c, d, x[ 0], S41, 0xf4292244); /* 49 */ 183 | II (d, a, b, c, x[ 7], S42, 0x432aff97); /* 50 */ 184 | II (c, d, a, b, x[14], S43, 0xab9423a7); /* 51 */ 185 | II (b, c, d, a, x[ 5], S44, 0xfc93a039); /* 52 */ 186 | II (a, b, c, d, x[12], S41, 0x655b59c3); /* 53 */ 187 | II (d, a, b, c, x[ 3], S42, 0x8f0ccc92); /* 54 */ 188 | II (c, d, a, b, x[10], S43, 0xffeff47d); /* 55 */ 189 | II (b, c, d, a, x[ 1], S44, 0x85845dd1); /* 56 */ 190 | II (a, b, c, d, x[ 8], S41, 0x6fa87e4f); /* 57 */ 191 | II (d, a, b, c, x[15], S42, 0xfe2ce6e0); /* 58 */ 192 | II (c, d, a, b, x[ 6], S43, 0xa3014314); /* 59 */ 193 | II (b, c, d, a, x[13], S44, 0x4e0811a1); /* 60 */ 194 | II (a, b, c, d, x[ 4], S41, 0xf7537e82); /* 61 */ 195 | II (d, a, b, c, x[11], S42, 0xbd3af235); /* 62 */ 196 | II (c, d, a, b, x[ 2], S43, 0x2ad7d2bb); /* 63 */ 197 | II (b, c, d, a, x[ 9], S44, 0xeb86d391); /* 64 */ 198 | 199 | state[0] += a; 200 | state[1] += b; 201 | state[2] += c; 202 | state[3] += d; 203 | 204 | // Zeroize sensitive 
information. 205 | memset((POINTER)x, 0, sizeof (x)); 206 | } 207 | 208 | // Encodes input (UINT4) into output (unsigned char). Assumes len is 209 | // a multiple of 4. 210 | static void Encode( unsigned char *output, UINT4 *input, unsigned int len ) 211 | { 212 | unsigned int i, j; 213 | 214 | for (i = 0, j = 0; j < len; i++, j += 4) { 215 | output[j] = (unsigned char)(input[i] & 0xff); 216 | output[j+1] = (unsigned char)((input[i] >> 8) & 0xff); 217 | output[j+2] = (unsigned char)((input[i] >> 16) & 0xff); 218 | output[j+3] = (unsigned char)((input[i] >> 24) & 0xff); 219 | } 220 | } 221 | 222 | // Decodes input (unsigned char) into output (UINT4). Assumes len is 223 | // a multiple of 4. 224 | static void Decode( UINT4 *output, unsigned char *input, unsigned int len ) 225 | { 226 | unsigned int i, j; 227 | 228 | for (i = 0, j = 0; j < len; i++, j += 4) 229 | output[i] = ((UINT4)input[j]) | (((UINT4)input[j+1]) << 8) | 230 | (((UINT4)input[j+2]) << 16) | (((UINT4)input[j+3]) << 24); 231 | } 232 | //#pragma endregion 233 | 234 | 235 | public: 236 | // MAIN FUNCTIONS 237 | MD5() 238 | { 239 | Init() ; 240 | } 241 | 242 | // MD5 initialization. Begins an MD5 operation, writing a new context. 243 | void Init() 244 | { 245 | context.count[0] = context.count[1] = 0; 246 | 247 | // Load magic initialization constants. 248 | context.state[0] = 0x67452301; 249 | context.state[1] = 0xefcdab89; 250 | context.state[2] = 0x98badcfe; 251 | context.state[3] = 0x10325476; 252 | } 253 | 254 | // MD5 block update operation. Continues an MD5 message-digest 255 | // operation, processing another message block, and updating the 256 | // context. 
257 | void Update( 258 | unsigned char *input, // input block 259 | unsigned int inputLen ) // length of input block 260 | { 261 | unsigned int i, index, partLen; 262 | 263 | // Compute number of bytes mod 64 264 | index = (unsigned int)((context.count[0] >> 3) & 0x3F); 265 | 266 | // Update number of bits 267 | if ((context.count[0] += ((UINT4)inputLen << 3)) 268 | < ((UINT4)inputLen << 3)) 269 | context.count[1]++; 270 | context.count[1] += ((UINT4)inputLen >> 29); 271 | 272 | partLen = 64 - index; 273 | 274 | // Transform as many times as possible. 275 | if (inputLen >= partLen) { 276 | memcpy((POINTER)&context.buffer[index], (POINTER)input, partLen); 277 | MD5Transform (context.state, context.buffer); 278 | 279 | for (i = partLen; i + 63 < inputLen; i += 64) 280 | MD5Transform (context.state, &input[i]); 281 | 282 | index = 0; 283 | } 284 | else 285 | i = 0; 286 | 287 | /* Buffer remaining input */ 288 | memcpy((POINTER)&context.buffer[index], (POINTER)&input[i], inputLen-i); 289 | } 290 | 291 | // MD5 finalization. Ends an MD5 message-digest operation, writing the 292 | // the message digest and zeroizing the context. 293 | // Writes to digestRaw 294 | void Final() 295 | { 296 | unsigned char bits[8]; 297 | unsigned int index, padLen; 298 | 299 | // Save number of bits 300 | Encode( bits, context.count, 8 ); 301 | 302 | // Pad out to 56 mod 64. 303 | index = (unsigned int)((context.count[0] >> 3) & 0x3f); 304 | padLen = (index < 56) ? (56 - index) : (120 - index); 305 | Update( PADDING, padLen ); 306 | 307 | // Append length (before padding) 308 | Update( bits, 8 ); 309 | 310 | // Store state in digest 311 | Encode( digestRaw, context.state, 16); 312 | 313 | // Zeroize sensitive information. 
314 | memset((POINTER)&context, 0, sizeof (context)); 315 | 316 | writeToString() ; 317 | } 318 | 319 | /// Buffer must be 32+1 (nul) = 33 chars long at least 320 | void writeToString() 321 | { 322 | int pos ; 323 | 324 | for( pos = 0 ; pos < 16 ; pos++ ) 325 | sprintf( digestChars+(pos*2), "%02x", digestRaw[pos] ) ; 326 | } 327 | 328 | 329 | public: 330 | // an MD5 digest is a 16-byte number (32 hex digits) 331 | BYTE digestRaw[ 16 ] ; 332 | 333 | // This version of the digest is actually 334 | // a "printf'd" version of the digest. 335 | char digestChars[ 33 ] ; 336 | 337 | /// Load a file from disk and digest it 338 | // Digests a file and returns the result. 339 | const char* digestFile( const char *filename ) 340 | { 341 | if (NULL == filename || strcmp(filename, "") == 0) 342 | return NULL; 343 | 344 | Init() ; 345 | 346 | FILE *file; 347 | 348 | unsigned char buffer[1024] ; 349 | 350 | if((file = fopen (filename, "rb")) == NULL) 351 | { 352 | return NULL; 353 | } 354 | int len; 355 | while( (len = fread( buffer, 1, 1024, file )) ) 356 | Update( buffer, len ) ; 357 | Final(); 358 | 359 | fclose( file ); 360 | 361 | return digestChars ; 362 | } 363 | 364 | /// Digests a byte-array already in memory 365 | const char* digestMemory( BYTE *memchunk, int len ) 366 | { 367 | if (NULL == memchunk) 368 | return NULL; 369 | 370 | Init() ; 371 | Update( memchunk, len ) ; 372 | Final() ; 373 | 374 | return digestChars ; 375 | } 376 | 377 | // Digests a string and prints the result. 
378 | const char* digestString(const char *string ) 379 | { 380 | if (string == NULL) 381 | return NULL; 382 | 383 | Init() ; 384 | Update( (unsigned char*)string, strlen(string) ) ; 385 | Final() ; 386 | 387 | return digestChars ; 388 | } 389 | }; 390 | 391 | inline bool md5String(const char* str, std::string& res) 392 | { 393 | if (NULL == str) 394 | { 395 | res = ""; 396 | return false; 397 | } 398 | 399 | MD5 md5; 400 | const char *pRes = md5.digestString(str); 401 | if (NULL == pRes) 402 | { 403 | res = ""; 404 | return false; 405 | } 406 | 407 | res = pRes; 408 | return true; 409 | } 410 | 411 | inline bool md5File(const char* filepath, std::string& res) 412 | { 413 | if (NULL == filepath || strcmp(filepath, "") == 0) 414 | { 415 | res = ""; 416 | return false; 417 | } 418 | 419 | MD5 md5; 420 | const char *pRes = md5.digestFile(filepath); 421 | 422 | if (NULL == pRes) 423 | { 424 | res = ""; 425 | return false; 426 | } 427 | 428 | res = pRes; 429 | return true; 430 | } 431 | } 432 | #endif 433 | --------------------------------------------------------------------------------