├── ChangeLog.md ├── LICENSE ├── README.md ├── dict ├── hmm_model.utf8 ├── jieba.dict.utf8 └── user.dict.utf8 └── src ├── CppJieba ├── DictTrie.hpp ├── FullSegment.hpp ├── HMMSegment.hpp ├── ISegment.hpp ├── KeywordExtractor.hpp ├── Limonp │ ├── Config.hpp │ ├── HandyMacro.hpp │ ├── InitOnOff.hpp │ ├── LocalVector.hpp │ ├── Logger.hpp │ ├── MysqlClient.hpp │ ├── NonCopyable.hpp │ ├── StdExtension.hpp │ └── StringUtil.hpp ├── MPSegment.hpp ├── MixSegment.hpp ├── PosTagger.hpp ├── QuerySegment.hpp ├── SegmentBase.hpp ├── TransCode.hpp └── Trie.hpp ├── config └── ngx_http_cppjieba_module.cpp /ChangeLog.md: -------------------------------------------------------------------------------- 1 | # ChangeLog 2 | 3 | ## 0.1.0 4 | 5 | * 支持 GET 和 POST 的分词请求 6 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2013 Yanyi Wu 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | this software and associated documentation files (the "Software"), to deal in 7 | the Software without restriction, including without limitation the rights to 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software is furnished to do so, 10 | subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 17 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR 18 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 19 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 20 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 21 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ngx\_http\_cppjieba\_module 2 | 3 | ## 简介 4 | 5 | [CppJieba] 的 `Nginx` 扩展模块。 6 | 需要了解源码的可以参看 [NginxModuleDevelopment] 。 7 | 8 | ## 支持Docker 9 | 10 | ``` 11 | docker pull docker.cn/yanyiwu/nginx_cppjieba_server 12 | ``` 13 | 14 | ## 用法 15 | 16 | 17 | ### 安装和配置 18 | 19 | 以下用法假设 `ngx_http_cppjieba_module` 下载后存放的地址是 `/tmp/ngx_http_cppjieba_module` (这个地址在 `Nginx` 编译时和词典加载时候会用到) 20 | 21 | #### 下载源码: 22 | 23 | ``` 24 | git clone git://github.com/aszxqw/ngx_http_cppjieba_module.git /tmp/ngx_http_cppjieba_module 25 | ``` 26 | 27 | #### 进入 `Nginx` 源码目录: 28 | 29 | ``` 30 | ./configure --add-module=/tmp/ngx_http_cppjieba_module/src 31 | ``` 32 | 33 | 因为 `ngx_http_cppjieba_module` 是 `C++` 源码,所以作为 `Nginx` 模块编译的时候需要修改 `objs/Makefile` 34 | 35 | ``` 36 | # 1. 在 "CC = gcc" 下面增加一行,如下 37 | CXX = g++ 38 | # 2. 修改链接器为 g++ , 如下 39 | LINK = $(CXX) 40 | # 3. 修改 ngx_http_cppjieba_module.cpp 的编译器,从 $(CC) 改为 $(CXX) , 如下 41 | $(CXX) -c $(CFLAGS) $(ALL_INCS) \ 42 | -o objs/addon/src/ngx_http_cppjieba_module.o \ 43 | /tmp/ngx_http_cppjieba_module/src/ngx_http_cppjieba_module.cpp 44 | 45 | ``` 46 | 47 | 这三步做完就可以 `make && sudo make install` nginx 了。 48 | 49 | ``` 50 | # 4. 
修改 Nginx 配置文件 /usr/local/nginx/conf/nginx.conf 51 | location /cppjieba { 52 | cppjieba /tmp/ngx_http_cppjieba_module/dict/jieba.dict.utf8 /tmp/ngx_http_cppjieba_module/dict/hmm_model.utf8 /tmp/ngx_http_cppjieba_module/dict/user.dict.utf8; 53 | } 54 | ``` 55 | 56 | 如果 `ngx_http_cppjieba_module` 的源码路径不是 `/tmp/ngx_http_cppjieba_module`, 将上述过程的 `/tmp/xxx` 改为 `/your/path/xxx` 即可。 57 | 58 | ## 启动 Nginx 59 | 60 | ``` 61 | /usr/local/nginx/sbin/nginx 62 | ``` 63 | 64 | ## 测试 65 | 66 | ### GET 67 | 68 | ``` 69 | curl "http://127.0.0.1/cppjieba?s=长春市长春药店" 70 | ``` 71 | 72 | ### POST 73 | 74 | ``` 75 | curl --data "长春市长春药店" "http://127.0.0.1/cppjieba" 76 | ``` 77 | 78 | 预期结果: 79 | 80 | ``` 81 | ["长春市", "长春", "药店"] 82 | ``` 83 | 84 | 用 `chrome` 打开上述链接也可以,不过要 **记得** 设置浏览器的页面编码方式为 `utf-8` 。 85 | 86 | 87 | ## 客服 88 | 89 | `i@yanyiwu.com` 90 | 91 | [CppJieba]:https://github.com/aszxqw/cppjieba 92 | [NginxModuleDevelopment]:http://yanyiwu.com/work/2014/09/21/nginx-module-development-stuff.html 93 | -------------------------------------------------------------------------------- /dict/user.dict.utf8: -------------------------------------------------------------------------------- 1 | 云计算 2 | 韩玉鉴赏 3 | -------------------------------------------------------------------------------- /src/CppJieba/DictTrie.hpp: -------------------------------------------------------------------------------- 1 | #ifndef CPPJIEBA_DICT_TRIE_HPP 2 | #define CPPJIEBA_DICT_TRIE_HPP 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include "Limonp/StringUtil.hpp" 12 | #include "Limonp/Logger.hpp" 13 | #include "TransCode.hpp" 14 | #include "Trie.hpp" 15 | 16 | 17 | 18 | namespace CppJieba 19 | { 20 | using namespace Limonp; 21 | const double MIN_DOUBLE = -3.14e+100; 22 | const double MAX_DOUBLE = 3.14e+100; 23 | const size_t DICT_COLUMN_NUM = 3; 24 | const char* const UNKNOWN_TAG = "x"; 25 | 26 | 27 | struct DictUnit 28 | { 29 | Unicode word; 30 | double 
weight; 31 | string tag; 32 | }; 33 | 34 | inline ostream & operator << (ostream& os, const DictUnit& unit) 35 | { 36 | string s; 37 | s << unit.word; 38 | return os << string_format("%s %s %.3lf", s.c_str(), unit.tag.c_str(), unit.weight); 39 | } 40 | 41 | typedef map DagType; 42 | 43 | class DictTrie 44 | { 45 | public: 46 | typedef Trie, vector > TrieType; 47 | private: 48 | vector _nodeInfos; 49 | TrieType * _trie; 50 | 51 | double _minWeight; 52 | private: 53 | unordered_set _userDictSingleChineseWord; 54 | public: 55 | bool isUserDictSingleChineseWord(const Unicode::value_type& word) const 56 | { 57 | return isIn(_userDictSingleChineseWord, word); 58 | } 59 | public: 60 | double getMinWeight() const {return _minWeight;}; 61 | 62 | public: 63 | DictTrie() 64 | { 65 | _trie = NULL; 66 | _minWeight = MAX_DOUBLE; 67 | } 68 | DictTrie(const string& dictPath, const string& userDictPath = "") 69 | { 70 | new (this) DictTrie(); 71 | init(dictPath, userDictPath); 72 | } 73 | ~DictTrie() 74 | { 75 | if(_trie) 76 | { 77 | delete _trie; 78 | } 79 | } 80 | 81 | public: 82 | bool init(const string& dictPath, const string& userDictPath = "") 83 | { 84 | assert(!_trie); 85 | _loadDict(dictPath, _nodeInfos); 86 | _calculateWeight(_nodeInfos); 87 | _minWeight = _findMinWeight(_nodeInfos); 88 | 89 | if(userDictPath.size()) 90 | { 91 | double maxWeight = _findMaxWeight(_nodeInfos); 92 | _loadUserDict(userDictPath, maxWeight, UNKNOWN_TAG); 93 | } 94 | _shrink(_nodeInfos); 95 | _trie = _creatTrie(_nodeInfos); 96 | assert(_trie); 97 | return true; 98 | } 99 | 100 | public: 101 | const DictUnit* find(Unicode::const_iterator begin, Unicode::const_iterator end) const 102 | { 103 | return _trie->find(begin, end); 104 | } 105 | bool find(Unicode::const_iterator begin, Unicode::const_iterator end, DagType& dag, size_t offset = 0) const 106 | { 107 | return _trie->find(begin, end, dag, offset); 108 | } 109 | 110 | 111 | private: 112 | TrieType * _creatTrie(const vector& dictUnits) 113 | { 
114 | assert(dictUnits.size()); 115 | vector words; 116 | vector valuePointers; 117 | for(size_t i = 0 ; i < dictUnits.size(); i ++) 118 | { 119 | words.push_back(dictUnits[i].word); 120 | valuePointers.push_back(&dictUnits[i]); 121 | } 122 | 123 | TrieType * trie = new TrieType(words, valuePointers); 124 | return trie; 125 | } 126 | void _loadUserDict(const string& filePath, double defaultWeight, const string& defaultTag) 127 | { 128 | ifstream ifs(filePath.c_str()); 129 | assert(ifs); 130 | string line; 131 | DictUnit nodeInfo; 132 | size_t lineno; 133 | for(lineno = 0; getline(ifs, line); lineno++) 134 | { 135 | if(!TransCode::decode(line, nodeInfo.word)) 136 | { 137 | LogError("line[%u:%s] illegal.", lineno, line.c_str()); 138 | continue; 139 | } 140 | if(nodeInfo.word.size() == 1) 141 | { 142 | _userDictSingleChineseWord.insert(nodeInfo.word[0]); 143 | } 144 | nodeInfo.weight = defaultWeight; 145 | nodeInfo.tag = defaultTag; 146 | _nodeInfos.push_back(nodeInfo); 147 | } 148 | LogInfo("load userdict[%s] ok. 
lines[%u]", filePath.c_str(), lineno); 149 | } 150 | void _loadDict(const string& filePath, vector& nodeInfos) const 151 | { 152 | ifstream ifs(filePath.c_str()); 153 | assert(ifs); 154 | string line; 155 | vector buf; 156 | 157 | DictUnit nodeInfo; 158 | for(size_t lineno = 0 ; getline(ifs, line); lineno++) 159 | { 160 | split(line, buf, " "); 161 | assert(buf.size() == DICT_COLUMN_NUM); 162 | 163 | if(!TransCode::decode(buf[0], nodeInfo.word)) 164 | { 165 | LogError("line[%u:%s] illegal.", lineno, line.c_str()); 166 | continue; 167 | } 168 | nodeInfo.weight = atof(buf[1].c_str()); 169 | nodeInfo.tag = buf[2]; 170 | 171 | nodeInfos.push_back(nodeInfo); 172 | } 173 | } 174 | double _findMinWeight(const vector& nodeInfos) const 175 | { 176 | double ret = MAX_DOUBLE; 177 | for(size_t i = 0; i < nodeInfos.size(); i++) 178 | { 179 | ret = min(nodeInfos[i].weight, ret); 180 | } 181 | return ret; 182 | } 183 | double _findMaxWeight(const vector& nodeInfos) const 184 | { 185 | double ret = MIN_DOUBLE; 186 | for(size_t i = 0; i < nodeInfos.size(); i++) 187 | { 188 | ret = max(nodeInfos[i].weight, ret); 189 | } 190 | return ret; 191 | } 192 | 193 | void _calculateWeight(vector& nodeInfos) const 194 | { 195 | double sum = 0.0; 196 | for(size_t i = 0; i < nodeInfos.size(); i++) 197 | { 198 | sum += nodeInfos[i].weight; 199 | } 200 | assert(sum); 201 | for(size_t i = 0; i < nodeInfos.size(); i++) 202 | { 203 | DictUnit& nodeInfo = nodeInfos[i]; 204 | assert(nodeInfo.weight); 205 | nodeInfo.weight = log(double(nodeInfo.weight)/double(sum)); 206 | } 207 | } 208 | 209 | void _shrink(vector& units) const 210 | { 211 | vector(units.begin(), units.end()).swap(units); 212 | } 213 | 214 | 215 | }; 216 | } 217 | 218 | #endif 219 | -------------------------------------------------------------------------------- /src/CppJieba/FullSegment.hpp: -------------------------------------------------------------------------------- 1 | #ifndef CPPJIEBA_FULLSEGMENT_H 2 | #define 
CPPJIEBA_FULLSEGMENT_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include "Limonp/Logger.hpp" 8 | #include "DictTrie.hpp" 9 | #include "ISegment.hpp" 10 | #include "SegmentBase.hpp" 11 | #include "TransCode.hpp" 12 | 13 | namespace CppJieba 14 | { 15 | class FullSegment: public SegmentBase 16 | { 17 | private: 18 | const DictTrie* _dictTrie; 19 | bool _isBorrowed; 20 | public: 21 | FullSegment() 22 | { 23 | _dictTrie = NULL; 24 | _isBorrowed = false; 25 | } 26 | explicit FullSegment(const string& dictPath) 27 | { 28 | _dictTrie = NULL; 29 | init(dictPath); 30 | } 31 | explicit FullSegment(const DictTrie* dictTrie) 32 | { 33 | _dictTrie = NULL; 34 | init(dictTrie); 35 | } 36 | virtual ~FullSegment() 37 | { 38 | if(_dictTrie && ! _isBorrowed) 39 | { 40 | delete _dictTrie; 41 | } 42 | 43 | }; 44 | public: 45 | bool init(const string& dictPath) 46 | { 47 | assert(_dictTrie == NULL); 48 | _dictTrie = new DictTrie(dictPath); 49 | _isBorrowed = false; 50 | return true; 51 | } 52 | bool init(const DictTrie* dictTrie) 53 | { 54 | assert(_dictTrie == NULL); 55 | assert(dictTrie); 56 | _dictTrie = dictTrie; 57 | _isBorrowed = true; 58 | return true; 59 | } 60 | 61 | public: 62 | using SegmentBase::cut; 63 | 64 | public: 65 | bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res) const 66 | { 67 | assert(_dictTrie); 68 | if (begin >= end) 69 | { 70 | LogError("begin >= end"); 71 | return false; 72 | } 73 | 74 | //resut of searching in trie tree 75 | DagType tRes; 76 | 77 | //max index of res's words 78 | int maxIdx = 0; 79 | 80 | // always equals to (uItr - begin) 81 | int uIdx = 0; 82 | 83 | //tmp variables 84 | int wordLen = 0; 85 | for (Unicode::const_iterator uItr = begin; uItr != end; uItr++) 86 | { 87 | //find word start from uItr 88 | if (_dictTrie->find(uItr, end, tRes, 0)) 89 | { 90 | for(DagType::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++) 91 | //for (vector >::const_iterator itr = tRes.begin(); itr != tRes.end(); 
itr++) 92 | { 93 | wordLen = itr->second->word.size(); 94 | if (wordLen >= 2 || (tRes.size() == 1 && maxIdx <= uIdx)) 95 | { 96 | res.push_back(itr->second->word); 97 | } 98 | maxIdx = uIdx+wordLen > maxIdx ? uIdx+wordLen : maxIdx; 99 | } 100 | tRes.clear(); 101 | } 102 | else // not found word start from uItr 103 | { 104 | if (maxIdx <= uIdx) // never exist in prev results 105 | { 106 | //put itr itself in res 107 | res.push_back(Unicode(1, *uItr)); 108 | 109 | //mark it exits 110 | ++maxIdx; 111 | } 112 | } 113 | ++uIdx; 114 | } 115 | 116 | return true; 117 | } 118 | 119 | bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res) const 120 | { 121 | assert(_dictTrie); 122 | if (begin >= end) 123 | { 124 | LogError("begin >= end"); 125 | return false; 126 | } 127 | 128 | vector uRes; 129 | if (!cut(begin, end, uRes)) 130 | { 131 | LogError("get unicode cut result error."); 132 | return false; 133 | } 134 | 135 | string tmp; 136 | for (vector::const_iterator uItr = uRes.begin(); uItr != uRes.end(); uItr++) 137 | { 138 | if (TransCode::encode(*uItr, tmp)) 139 | { 140 | res.push_back(tmp); 141 | } 142 | else 143 | { 144 | LogError("encode failed."); 145 | } 146 | } 147 | 148 | return true; 149 | } 150 | }; 151 | } 152 | 153 | #endif 154 | -------------------------------------------------------------------------------- /src/CppJieba/HMMSegment.hpp: -------------------------------------------------------------------------------- 1 | #ifndef CPPJIBEA_HMMSEGMENT_H 2 | #define CPPJIBEA_HMMSEGMENT_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "Limonp/StringUtil.hpp" 9 | #include "Limonp/Logger.hpp" 10 | #include "TransCode.hpp" 11 | #include "ISegment.hpp" 12 | #include "SegmentBase.hpp" 13 | #include "DictTrie.hpp" 14 | 15 | namespace CppJieba 16 | { 17 | using namespace Limonp; 18 | typedef unordered_map EmitProbMap; 19 | class HMMSegment: public SegmentBase 20 | { 21 | public: 22 | /* 23 | * STATUS: 24 | * 0:B, 1:E, 2:M, 
3:S 25 | * */ 26 | enum {B = 0, E = 1, M = 2, S = 3, STATUS_SUM = 4}; 27 | private: 28 | char _statMap[STATUS_SUM]; 29 | double _startProb[STATUS_SUM]; 30 | double _transProb[STATUS_SUM][STATUS_SUM]; 31 | EmitProbMap _emitProbB; 32 | EmitProbMap _emitProbE; 33 | EmitProbMap _emitProbM; 34 | EmitProbMap _emitProbS; 35 | vector _emitProbVec; 36 | 37 | public: 38 | HMMSegment(){} 39 | explicit HMMSegment(const string& filePath) 40 | { 41 | LIMONP_CHECK(init(filePath)); 42 | } 43 | virtual ~HMMSegment(){} 44 | public: 45 | bool init(const string& filePath) 46 | { 47 | memset(_startProb, 0, sizeof(_startProb)); 48 | memset(_transProb, 0, sizeof(_transProb)); 49 | _statMap[0] = 'B'; 50 | _statMap[1] = 'E'; 51 | _statMap[2] = 'M'; 52 | _statMap[3] = 'S'; 53 | _emitProbVec.push_back(&_emitProbB); 54 | _emitProbVec.push_back(&_emitProbE); 55 | _emitProbVec.push_back(&_emitProbM); 56 | _emitProbVec.push_back(&_emitProbS); 57 | LIMONP_CHECK(_loadModel(filePath.c_str())); 58 | LogInfo("HMMSegment init(%s) ok.", filePath.c_str()); 59 | return true; 60 | } 61 | public: 62 | using SegmentBase::cut; 63 | public: 64 | /* Cuts [begin, end) into res: each maximal run of code units below 0x80 is appended verbatim as one entry, and every stretch between such runs is segmented by _cut(). Returns false as soon as _cut() fails, true otherwise. */ bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res)const 65 | { 66 | Unicode::const_iterator left = begin; 67 | Unicode::const_iterator right = begin; 68 | while(right != end) 69 | { 70 | if(*right < 0x80) 71 | { 72 | if(left != right && !_cut(left, right, res)) 73 | { 74 | return false; 75 | } 76 | left = right; 77 | while(right != end && *right < 0x80) /* test the bound first so the past-the-end iterator is never dereferenced */ 78 | { 79 | right++; 80 | } 81 | res.push_back(Unicode(left, right)); 82 | left = right; 83 | } 84 | else 85 | { 86 | right++; 87 | } 88 | } 89 | if(left != right && !_cut(left, right, res)) 90 | { 91 | return false; 92 | } 93 | return true; 94 | } 95 | private: 96 | bool _cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res) const 97 | { 98 | vector status; 99 | if(!_viterbi(begin, end, status)) 100 | { 101 | LogError("_viterbi failed."); 102 | return false; 103 | 
} 104 | 105 | Unicode::const_iterator left = begin; 106 | Unicode::const_iterator right; 107 | for(size_t i = 0; i < status.size(); i++) 108 | { 109 | if(status[i] % 2) //if(E == status[i] || S == status[i]) 110 | { 111 | right = begin + i + 1; 112 | res.push_back(Unicode(left, right)); 113 | left = right; 114 | } 115 | } 116 | return true; 117 | } 118 | public: 119 | virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res)const 120 | { 121 | if(begin == end) 122 | { 123 | return false; 124 | } 125 | vector words; 126 | words.reserve(end - begin); 127 | if(!cut(begin, end, words)) 128 | { 129 | return false; 130 | } 131 | size_t offset = res.size(); 132 | res.resize(res.size() + words.size()); 133 | for(size_t i = 0; i < words.size(); i++) 134 | { 135 | if(!TransCode::encode(words[i], res[offset + i])) 136 | { 137 | LogError("encode failed."); 138 | } 139 | } 140 | return true; 141 | } 142 | 143 | private: 144 | bool _viterbi(Unicode::const_iterator begin, Unicode::const_iterator end, vector& status)const 145 | { 146 | if(begin == end) 147 | { 148 | return false; 149 | } 150 | 151 | size_t Y = STATUS_SUM; 152 | size_t X = end - begin; 153 | 154 | size_t XYSize = X * Y; 155 | size_t now, old, stat; 156 | double tmp, endE, endS; 157 | 158 | vector path(XYSize); 159 | vector weight(XYSize); 160 | 161 | //start 162 | for(size_t y = 0; y < Y; y++) 163 | { 164 | weight[0 + y * X] = _startProb[y] + _getEmitProb(_emitProbVec[y], *begin, MIN_DOUBLE); 165 | path[0 + y * X] = -1; 166 | } 167 | 168 | 169 | double emitProb; 170 | 171 | for(size_t x = 1; x < X; x++) 172 | { 173 | for(size_t y = 0; y < Y; y++) 174 | { 175 | now = x + y*X; 176 | weight[now] = MIN_DOUBLE; 177 | path[now] = E; // warning 178 | emitProb = _getEmitProb(_emitProbVec[y], *(begin+x), MIN_DOUBLE); 179 | for(size_t preY = 0; preY < Y; preY++) 180 | { 181 | old = x - 1 + preY * X; 182 | tmp = weight[old] + _transProb[preY][y] + emitProb; 183 | if(tmp > weight[now]) 184 | { 
185 | weight[now] = tmp; 186 | path[now] = preY; 187 | } 188 | } 189 | } 190 | } 191 | 192 | endE = weight[X-1+E*X]; 193 | endS = weight[X-1+S*X]; 194 | stat = 0; 195 | if(endE >= endS) 196 | { 197 | stat = E; 198 | } 199 | else 200 | { 201 | stat = S; 202 | } 203 | 204 | status.resize(X); 205 | for(int x = X -1 ; x >= 0; x--) 206 | { 207 | status[x] = stat; 208 | stat = path[x + stat*X]; 209 | } 210 | 211 | return true; 212 | } 213 | bool _loadModel(const char* const filePath) 214 | { 215 | LogDebug("loadModel [%s] start ...", filePath); 216 | ifstream ifile(filePath); 217 | string line; 218 | vector tmp; 219 | vector tmp2; 220 | //load _startProb 221 | if(!_getLine(ifile, line)) 222 | { 223 | return false; 224 | } 225 | split(line, tmp, " "); 226 | if(tmp.size() != STATUS_SUM) 227 | { 228 | LogError("start_p illegal"); 229 | return false; 230 | } 231 | for(size_t j = 0; j< tmp.size(); j++) 232 | { 233 | _startProb[j] = atof(tmp[j].c_str()); 234 | //cout<<_startProb[j]< tmp, tmp2; 309 | Unicode unicode; 310 | split(line, tmp, ","); 311 | for(size_t i = 0; i < tmp.size(); i++) 312 | { 313 | split(tmp[i], tmp2, ":"); 314 | if(2 != tmp2.size()) 315 | { 316 | LogError("_emitProb illegal."); 317 | return false; 318 | } 319 | if(!TransCode::decode(tmp2[0], unicode) || unicode.size() != 1) 320 | { 321 | LogError("TransCode failed."); 322 | return false; 323 | } 324 | mp[unicode[0]] = atof(tmp2[1].c_str()); 325 | } 326 | return true; 327 | } 328 | double _getEmitProb(const EmitProbMap* ptMp, uint16_t key, double defVal)const 329 | { 330 | EmitProbMap::const_iterator cit = ptMp->find(key); 331 | if(cit == ptMp->end()) 332 | { 333 | return defVal; 334 | } 335 | return cit->second; 336 | 337 | } 338 | 339 | 340 | }; 341 | } 342 | 343 | #endif 344 | -------------------------------------------------------------------------------- /src/CppJieba/ISegment.hpp: -------------------------------------------------------------------------------- 1 | #ifndef CPPJIEBA_SEGMENTINTERFACE_H 
2 | #define CPPJIEBA_SEGMENTINTERFACE_H 3 | 4 | 5 | namespace CppJieba 6 | { 7 | class ISegment 8 | { 9 | public: 10 | virtual ~ISegment(){}; 11 | public: 12 | virtual bool cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector& res) const = 0; 13 | virtual bool cut(const string& str, vector& res) const = 0; 14 | }; 15 | } 16 | 17 | #endif 18 | -------------------------------------------------------------------------------- /src/CppJieba/KeywordExtractor.hpp: -------------------------------------------------------------------------------- 1 | #ifndef CPPJIEBA_KEYWORD_EXTRACTOR_H 2 | #define CPPJIEBA_KEYWORD_EXTRACTOR_H 3 | 4 | #include "MixSegment.hpp" 5 | #include 6 | #include 7 | 8 | namespace CppJieba 9 | { 10 | using namespace Limonp; 11 | 12 | /*utf8*/ 13 | class KeywordExtractor 14 | { 15 | private: 16 | MixSegment _segment; 17 | private: 18 | unordered_map _idfMap; 19 | double _idfAverage; 20 | 21 | unordered_set _stopWords; 22 | public: 23 | KeywordExtractor(){}; 24 | KeywordExtractor(const string& dictPath, const string& hmmFilePath, const string& idfPath, const string& stopWordPath) 25 | { 26 | LIMONP_CHECK(init(dictPath, hmmFilePath, idfPath, stopWordPath)); 27 | }; 28 | ~KeywordExtractor(){}; 29 | 30 | public: 31 | bool init(const string& dictPath, const string& hmmFilePath, const string& idfPath, const string& stopWordPath) 32 | { 33 | _loadIdfDict(idfPath); 34 | _loadStopWordDict(stopWordPath); 35 | LIMONP_CHECK(_segment.init(dictPath, hmmFilePath)); 36 | return true; 37 | }; 38 | public: 39 | 40 | bool extract(const string& str, vector& keywords, size_t topN) const 41 | { 42 | vector > topWords; 43 | if(!extract(str, topWords, topN)) 44 | { 45 | return false; 46 | } 47 | for(size_t i = 0; i < topWords.size(); i++) 48 | { 49 | keywords.push_back(topWords[i].first); 50 | } 51 | return true; 52 | } 53 | 54 | bool extract(const string& str, vector >& keywords, size_t topN) const 55 | { 56 | vector words; 57 | if(!_segment.cut(str, 
words)) 58 | { 59 | LogError("segment cut(%s) failed.", str.c_str()); 60 | return false; 61 | } 62 | 63 | map wordmap; 64 | for(vector::iterator iter = words.begin(); iter != words.end(); iter++) 65 | { 66 | if(_isSingleWord(*iter)) 67 | { 68 | continue; 69 | } 70 | wordmap[*iter] += 1.0; 71 | } 72 | 73 | for(map::iterator itr = wordmap.begin(); itr != wordmap.end(); ) 74 | { 75 | if(_stopWords.end() != _stopWords.find(itr->first)) 76 | { 77 | wordmap.erase(itr++); 78 | continue; 79 | } 80 | 81 | unordered_map::const_iterator cit = _idfMap.find(itr->first); 82 | if(cit != _idfMap.end()) 83 | { 84 | itr->second *= cit->second; 85 | } 86 | else 87 | { 88 | itr->second *= _idfAverage; 89 | } 90 | itr ++; 91 | } 92 | 93 | keywords.clear(); 94 | std::copy(wordmap.begin(), wordmap.end(), std::inserter(keywords, keywords.begin())); 95 | topN = min(topN, keywords.size()); 96 | partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), _cmp); 97 | keywords.resize(topN); 98 | return true; 99 | } 100 | private: 101 | void _loadIdfDict(const string& idfPath) 102 | { 103 | ifstream ifs(idfPath.c_str()); 104 | if(!ifs) 105 | { 106 | LogError("open %s failed.", idfPath.c_str()); 107 | assert(false); 108 | } 109 | string line ; 110 | vector buf; 111 | double idf = 0.0; 112 | double idfSum = 0.0; 113 | size_t lineno = 0; 114 | for(;getline(ifs, line); lineno++) 115 | { 116 | buf.clear(); 117 | if(line.empty()) 118 | { 119 | LogError("line[%d] empty. skipped.", lineno); 120 | continue; 121 | } 122 | if(!split(line, buf, " ") || buf.size() != 2) 123 | { 124 | LogError("line %d [%s] illegal. 
skipped.", lineno, line.c_str()); 125 | continue; 126 | } 127 | idf = atof(buf[1].c_str()); 128 | _idfMap[buf[0]] = idf; 129 | idfSum += idf; 130 | 131 | } 132 | 133 | assert(lineno); 134 | _idfAverage = idfSum / lineno; 135 | assert(_idfAverage > 0.0); 136 | } 137 | void _loadStopWordDict(const string& filePath) 138 | { 139 | ifstream ifs(filePath.c_str()); 140 | if(!ifs) 141 | { 142 | LogError("open %s failed.", filePath.c_str()); 143 | assert(false); 144 | } 145 | string line ; 146 | while(getline(ifs, line)) 147 | { 148 | _stopWords.insert(line); 149 | } 150 | assert(_stopWords.size()); 151 | } 152 | private: 153 | bool _isSingleWord(const string& str) const 154 | { 155 | Unicode unicode; 156 | TransCode::decode(str, unicode); 157 | if(unicode.size() == 1) 158 | return true; 159 | return false; 160 | } 161 | 162 | private: 163 | static bool _cmp(const pair& lhs, const pair& rhs) 164 | { 165 | return lhs.second > rhs.second; 166 | } 167 | 168 | }; 169 | } 170 | 171 | #endif 172 | 173 | 174 | -------------------------------------------------------------------------------- /src/CppJieba/Limonp/Config.hpp: -------------------------------------------------------------------------------- 1 | /************************************ 2 | * file enc : utf8 3 | * author : wuyanyi09@gmail.com 4 | ************************************/ 5 | #ifndef LIMONP_CONFIG_H 6 | #define LIMONP_CONFIG_H 7 | 8 | 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include "StringUtil.hpp" 14 | 15 | namespace Limonp 16 | { 17 | using namespace std; 18 | class Config 19 | { 20 | public: 21 | explicit Config(const string& filePath) 22 | { 23 | _loadFile(filePath); 24 | } 25 | public: 26 | operator bool () 27 | { 28 | return !_map.empty(); 29 | } 30 | private: 31 | void _loadFile(const string& filePath) 32 | { 33 | ifstream ifs(filePath.c_str()); 34 | assert(ifs); 35 | string line; 36 | vector vecBuf; 37 | size_t lineno = 0; 38 | while(getline(ifs, line)) 39 | { 40 | lineno ++; 41 | 
trim(line); 42 | if(line.empty() || startsWith(line, "#")) 43 | { 44 | continue; 45 | } 46 | vecBuf.clear(); 47 | if(!split(line, vecBuf, "=") || 2 != vecBuf.size()) 48 | { 49 | fprintf(stderr, "line[%s] illegal.\n", line.c_str()); 50 | assert(false); 51 | continue; 52 | } 53 | string& key = vecBuf[0]; 54 | string& value = vecBuf[1]; 55 | trim(key); 56 | trim(value); 57 | if(!_map.insert(make_pair(key, value)).second) 58 | { 59 | fprintf(stderr, "key[%s] already exits.\n", key.c_str()); 60 | assert(false); 61 | continue; 62 | } 63 | } 64 | ifs.close(); 65 | } 66 | public: 67 | bool get(const string& key, string& value) const 68 | { 69 | map::const_iterator it = _map.find(key); 70 | if(_map.end() != it) 71 | { 72 | value = it->second; 73 | return true; 74 | } 75 | return false; 76 | } 77 | const char* operator [] (const char* key) const 78 | { 79 | if(NULL == key) 80 | { 81 | return NULL; 82 | } 83 | map::const_iterator it = _map.find(key); 84 | if(_map.end() != it) 85 | { 86 | return it->second.c_str(); 87 | } 88 | return NULL; 89 | } 90 | private: 91 | map _map; 92 | private: 93 | friend ostream& operator << (ostream& os, const Config& config); 94 | }; 95 | 96 | inline ostream& operator << (ostream& os, const Config& config) 97 | { 98 | return os << config._map; 99 | } 100 | } 101 | 102 | #endif 103 | -------------------------------------------------------------------------------- /src/CppJieba/Limonp/HandyMacro.hpp: -------------------------------------------------------------------------------- 1 | #ifndef LIMONP_MACRO_DEF_H 2 | #define LIMONP_MACRO_DEF_H 3 | 4 | #include 5 | 6 | #define LIMONP_CHECK(exp) \ 7 | if(!(exp)){fprintf(stderr, "File:%s, Line:%d Exp:[" #exp "] is true, abort.\n", __FILE__, __LINE__); abort();} 8 | 9 | #define print(x) cout<< #x": " << x < 5 | #include 6 | #include 7 | #include 8 | 9 | namespace Limonp 10 | { 11 | using namespace std; 12 | /* 13 | * LocalVector : T must be primitive type (char , int, size_t), if T is struct or class, 
LocalVector may be dangerous.. 14 | * LocalVector is simple and not well-tested. 15 | */ 16 | const size_t LOCAL_VECTOR_BUFFER_SIZE = 16; 17 | template 18 | class LocalVector 19 | { 20 | public: 21 | typedef const T* const_iterator ; 22 | typedef T value_type; 23 | typedef size_t size_type; 24 | private: 25 | T _buffer[LOCAL_VECTOR_BUFFER_SIZE]; 26 | T * _ptr; 27 | size_t _size; 28 | size_t _capacity; 29 | public: 30 | LocalVector() 31 | { 32 | _init(); 33 | }; 34 | LocalVector(const LocalVector& vec) 35 | { 36 | _init(); 37 | *this = vec; 38 | } 39 | LocalVector(const_iterator begin, const_iterator end) // TODO: make it faster 40 | { 41 | _init(); 42 | while(begin != end) 43 | { 44 | push_back(*begin++); 45 | } 46 | } 47 | LocalVector(size_t size, const T& t) // TODO: make it faster 48 | { 49 | _init(); 50 | while(size--) 51 | { 52 | push_back(t); 53 | } 54 | } 55 | ~LocalVector() 56 | { 57 | if(_ptr != _buffer) 58 | { 59 | free(_ptr); 60 | } 61 | }; 62 | public: 63 | LocalVector& operator = (const LocalVector& vec) 64 | { 65 | clear(); 66 | _size = vec.size(); 67 | _capacity = vec.capacity(); 68 | if(vec._buffer == vec._ptr) 69 | { 70 | memcpy(_buffer, vec._buffer, sizeof(T) * _size); 71 | _ptr = _buffer; 72 | } 73 | else 74 | { 75 | _ptr = (T*) malloc(vec.capacity() * sizeof(T)); 76 | assert(_ptr); 77 | memcpy(_ptr, vec._ptr, vec.size() * sizeof(T)); 78 | } 79 | return *this; 80 | } 81 | private: 82 | void _init() 83 | { 84 | _ptr = _buffer; 85 | _size = 0; 86 | _capacity = LOCAL_VECTOR_BUFFER_SIZE; 87 | } 88 | public: 89 | T& operator [] (size_t i) 90 | { 91 | return _ptr[i]; 92 | } 93 | const T& operator [] (size_t i) const 94 | { 95 | return _ptr[i]; 96 | } 97 | void push_back(const T& t) 98 | { 99 | if(_size == _capacity) 100 | { 101 | assert(_capacity); 102 | reserve(_capacity * 2); 103 | } 104 | _ptr[_size ++ ] = t; 105 | } 106 | void reserve(size_t size) 107 | { 108 | if(size <= _capacity) 109 | { 110 | return; 111 | } 112 | T * next = 
(T*)malloc(sizeof(T) * size); 113 | assert(next); 114 | T * old = _ptr; 115 | _ptr = next; 116 | memcpy(_ptr, old, sizeof(T) * _capacity); 117 | _capacity = size; 118 | if(old != _buffer) 119 | { 120 | free(old); 121 | } 122 | } 123 | bool empty() const 124 | { 125 | return 0 == size(); 126 | } 127 | size_t size() const 128 | { 129 | return _size; 130 | } 131 | size_t capacity() const 132 | { 133 | return _capacity; 134 | } 135 | const_iterator begin() const 136 | { 137 | return _ptr; 138 | } 139 | const_iterator end() const 140 | { 141 | return _ptr + _size; 142 | } 143 | void clear() 144 | { 145 | if(_ptr != _buffer) 146 | { 147 | free(_ptr); 148 | } 149 | _init(); 150 | } 151 | }; 152 | 153 | template 154 | ostream & operator << (ostream& os, const LocalVector& vec) 155 | { 156 | if(vec.empty()) 157 | { 158 | return os << "[]"; 159 | } 160 | os<<"[\""< 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | 19 | #define FILE_BASENAME strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__ 20 | 21 | #define LogDebug(fmt, ...) Limonp::Logger::LoggingF(Limonp::LL_DEBUG, FILE_BASENAME, __LINE__, fmt, ## __VA_ARGS__) 22 | #define LogInfo(fmt, ...) Limonp::Logger::LoggingF(Limonp::LL_INFO, FILE_BASENAME, __LINE__, fmt, ## __VA_ARGS__) 23 | #define LogWarn(fmt, ...) Limonp::Logger::LoggingF(Limonp::LL_WARN, FILE_BASENAME, __LINE__, fmt, ## __VA_ARGS__) 24 | #define LogError(fmt, ...) Limonp::Logger::LoggingF(Limonp::LL_ERROR, FILE_BASENAME, __LINE__, fmt, ## __VA_ARGS__) 25 | #define LogFatal(fmt, ...) 
Limonp::Logger::LoggingF(Limonp::LL_FATAL, FILE_BASENAME, __LINE__, fmt, ## __VA_ARGS__) 26 | 27 | namespace Limonp 28 | { 29 | using namespace std; 30 | enum {LL_DEBUG = 0, LL_INFO = 1, LL_WARN = 2, LL_ERROR = 3, LL_FATAL = 4, LEVEL_ARRAY_SIZE = 5, CSTR_BUFFER_SIZE = 32}; 31 | static const char * LOG_LEVEL_ARRAY[LEVEL_ARRAY_SIZE]= {"DEBUG","INFO","WARN","ERROR","FATAL"}; 32 | static const char * LOG_FORMAT = "%s %s:%d %s %s\n"; 33 | static const char * LOG_TIME_FORMAT = "%Y-%m-%d %H:%M:%S"; 34 | 35 | class Logger 36 | { 37 | public: 38 | static void Logging(size_t level, const string& msg, const char* fileName, int lineno) 39 | { 40 | assert(level <= LL_FATAL); 41 | char buf[CSTR_BUFFER_SIZE]; 42 | time_t timeNow; 43 | time(&timeNow); 44 | strftime(buf, sizeof(buf), LOG_TIME_FORMAT, localtime(&timeNow)); 45 | fprintf(stderr, LOG_FORMAT, buf, fileName, lineno,LOG_LEVEL_ARRAY[level], msg.c_str()); 46 | } 47 | static void LoggingF(size_t level, const char* fileName, int lineno, const char* const fmt, ...) 
48 | { 49 | #ifdef LOGGER_LEVEL 50 | if(level < LOGGER_LEVEL) return; 51 | #endif 52 | int size = 256; 53 | string msg; 54 | va_list ap; 55 | while (1) { 56 | msg.resize(size); 57 | va_start(ap, fmt); 58 | int n = vsnprintf((char *)msg.c_str(), size, fmt, ap); 59 | va_end(ap); 60 | if (n > -1 && n < size) { 61 | msg.resize(n); 62 | break; 63 | } 64 | if (n > -1) 65 | size = n + 1; 66 | else 67 | size *= 2; 68 | } 69 | Logging(level, msg, fileName, lineno); 70 | } 71 | }; 72 | } 73 | 74 | #endif 75 | -------------------------------------------------------------------------------- /src/CppJieba/Limonp/MysqlClient.hpp: -------------------------------------------------------------------------------- 1 | #ifndef LIMONP_MYSQLCLIENT_H 2 | #define LIMONP_MYSQLCLIENT_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "logger.hpp" 9 | #include "InitOnOff.hpp" 10 | 11 | namespace Limonp 12 | { 13 | using namespace std; 14 | class MysqlClient: public InitOnOff 15 | { 16 | public: 17 | typedef vector< vector > RowsType; 18 | private: 19 | const string _host; 20 | const size_t _port; 21 | const string _user; 22 | const string _passwd; 23 | const string _db; 24 | const string _charset; 25 | public: 26 | MysqlClient(const string& host, size_t port, const string& user, const string& passwd, const string& db, const string& charset = "utf8"): _host(host), _port(port), _user(user), _passwd(passwd), _db(db), _charset(charset), _conn(NULL) 27 | { 28 | _setInitFlag(_init()); 29 | } 30 | ~MysqlClient() 31 | { 32 | if(_conn) 33 | { 34 | mysql_close(_conn); 35 | } 36 | }; 37 | private: 38 | bool _init() 39 | { 40 | //cout<& vals) 80 | { 81 | size_t retn = 0; 82 | string sql; 83 | for(size_t i = 0; i < vals.size(); i ++) 84 | { 85 | sql.clear(); 86 | string_format(sql, "insert into %s (%s) values %s", tableName.c_str(), keys.c_str(), vals[i].c_str()); 87 | retn += executeSql(sql.c_str()); 88 | } 89 | return retn; 90 | } 91 | bool select(const string& sql, RowsType& rows) 
92 | { 93 | if(!executeSql(sql)) 94 | { 95 | LogError("executeSql failed. [%s]", sql.c_str()); 96 | return false; 97 | } 98 | MYSQL_RES * result = mysql_store_result(_conn); 99 | if(!result) 100 | { 101 | LogError("mysql_store_result failed.[%d]", mysql_error(_conn)); 102 | return false; 103 | } 104 | size_t num_fields = mysql_num_fields(result); 105 | MYSQL_ROW row; 106 | while((row = mysql_fetch_row(result))) 107 | { 108 | vector vec; 109 | for(size_t i = 0; i < num_fields; i ++) 110 | { 111 | row[i] ? vec.push_back(row[i]) : vec.push_back("NULL"); 112 | } 113 | rows.push_back(vec); 114 | } 115 | mysql_free_result(result); 116 | return true; 117 | } 118 | 119 | private: 120 | MYSQL * _conn; 121 | 122 | }; 123 | } 124 | 125 | #endif 126 | -------------------------------------------------------------------------------- /src/CppJieba/Limonp/NonCopyable.hpp: -------------------------------------------------------------------------------- 1 | /************************************ 2 | ************************************/ 3 | #ifndef LIMONP_NONCOPYABLE_H 4 | #define LIMONP_NONCOPYABLE_H 5 | 6 | #include 7 | #include 8 | 9 | namespace Limonp 10 | { 11 | class NonCopyable 12 | { 13 | protected: 14 | NonCopyable(){}; 15 | ~NonCopyable(){}; 16 | private: 17 | NonCopyable(const NonCopyable& ); 18 | const NonCopyable& operator=(const NonCopyable& ); 19 | }; 20 | } 21 | 22 | #endif 23 | -------------------------------------------------------------------------------- /src/CppJieba/Limonp/StdExtension.hpp: -------------------------------------------------------------------------------- 1 | #ifndef LIMONP_STD_EXTEMSION_HPP 2 | #define LIMONP_STD_EXTEMSION_HPP 3 | 4 | #include 5 | 6 | #if(__cplusplus == 201103L) 7 | #include 8 | #include 9 | #else 10 | #include 11 | #include 12 | namespace std 13 | { 14 | using std::tr1::unordered_map; 15 | using std::tr1::unordered_set; 16 | } 17 | 18 | #endif 19 | 20 | #include 21 | #include 22 | #include 23 | #include 24 | 25 | 26 | namespace 
/**
 * Tell whether `key` is present in `contain`.
 *
 * Works with any container that offers find() and end()
 * (set, map, unordered_set, unordered_map, ...).
 * The template parameter list had been lost from the source text and is
 * restored here from the names used in the signature.
 */
template <class ContainType, class KeyType>
bool isIn(const ContainType& contain, const KeyType& key)
{
    return contain.end() != contain.find(key);
}
/**
 * printf-style formatting into `res` (previous contents are discarded).
 *
 * The buffer starts at 256 bytes and is regrown until the output fits:
 * to the exact size when vsnprintf reports it, otherwise by doubling.
 *
 * Changes from the previous version:
 *  - output is written through &res[0] instead of a const_cast of c_str(),
 *    which the standard does not allow to be modified;
 *  - the doubling path (negative vsnprintf result) is bounded, so a
 *    persistent formatting error can no longer overflow `size` or exhaust
 *    memory; in that case `res` is left empty.
 */
inline void string_format(std::string& res, const char* fmt, ...)
{
    const int maxBlindSize = 1 << 24; // stop doubling once the buffer reaches 16 MiB
    int size = 256;
    va_list ap;
    res.clear();
    while (true) {
        res.resize(size);
        va_start(ap, fmt);
        int n = vsnprintf(&res[0], size, fmt, ap);
        va_end(ap);
        if (n > -1 && n < size) {
            res.resize(n);
            return;
        }
        if (n > -1) {
            // the required length is known: one retry with room for the NUL
            size = n + 1;
        } else {
            if (size >= maxBlindSize) {
                res.clear();
                return;
            }
            size *= 2;
        }
    }
}
/**
 * Convert `str` to upper case in place and return it.
 *
 * Each byte is passed to toupper() as an unsigned char: handing it a plain
 * (possibly negative) char is undefined behaviour, and bytes >= 0x80 occur
 * in every UTF-8 multibyte sequence.
 */
inline std::string& upper(std::string& str)
{
    for(std::string::iterator it = str.begin(); it != str.end(); ++it)
    {
        *it = static_cast<char>(toupper(static_cast<unsigned char>(*it)));
    }
    return str;
}

/**
 * Convert `str` to lower case in place and return it.
 * Bytes are converted through unsigned char for the same reason as in upper().
 */
inline std::string& lower(std::string& str)
{
    for(std::string::iterator it = str.begin(); it != str.end(); ++it)
    {
        *it = static_cast<char>(tolower(static_cast<unsigned char>(*it)));
    }
    return str;
}
/**
 * Decode `len` bytes of UTF-8 at `str` into 16-bit code units appended to
 * `vec` (which is cleared first).
 *
 * Only 1-, 2- and 3-byte sequences are supported; a 4-byte lead makes the
 * function fail, as before.
 *
 * Returns false when `str` is NULL, a sequence is truncated, a lead byte is
 * not a valid 1/2/3-byte lead, or a trailing byte is not of the form
 * 10xxxxxx. On failure `vec` holds the units decoded so far.
 *
 * Changes from the previous version: a stray continuation byte (0x80-0xBF)
 * is no longer accepted as a 2-byte lead, and trailing bytes are validated.
 * Well-formed input decodes to exactly the same values. Overlong encodings
 * are still not rejected.
 */
template <class Uint16Container>
bool utf8ToUnicode(const char * const str, size_t len, Uint16Container& vec)
{
    if(!str)
    {
        return false;
    }
    vec.clear();
    // work on unsigned bytes so comparisons and shifts never see negative values
    const unsigned char * const bytes = reinterpret_cast<const unsigned char*>(str);
    size_t i = 0;
    while(i < len)
    {
        const unsigned char lead = bytes[i];
        if(lead < 0x80) // 0xxxxxxx
        {
            vec.push_back(uint16_t(lead));
            i += 1;
        }
        else if(lead >= 0xc0 && lead <= 0xdf) // 110xxxxx 10xxxxxx
        {
            if(i + 1 >= len || (bytes[i + 1] & 0xc0) != 0x80)
            {
                return false;
            }
            vec.push_back(uint16_t(((lead & 0x1f) << 6) | (bytes[i + 1] & 0x3f)));
            i += 2;
        }
        else if(lead >= 0xe0 && lead <= 0xef) // 1110xxxx 10xxxxxx 10xxxxxx
        {
            if(i + 2 >= len || (bytes[i + 1] & 0xc0) != 0x80 || (bytes[i + 2] & 0xc0) != 0x80)
            {
                return false;
            }
            vec.push_back(uint16_t(((lead & 0x0f) << 12) | ((bytes[i + 1] & 0x3f) << 6) | (bytes[i + 2] & 0x3f)));
            i += 3;
        }
        else // continuation byte in lead position, or a 4-byte (or longer) lead
        {
            return false;
        }
    }
    return true;
}
/*
 * Format the current local time into timeStr.
 *
 * format example: "%Y-%m-%d %H:%M:%S"
 *
 * The result is limited to a 64-byte buffer (terminator included); when the
 * formatted text does not fit, or the local time cannot be obtained,
 * timeStr is left empty.
 *
 * The text is built in a local buffer rather than written through
 * timeStr.c_str(), which must not be modified.
 */
inline void getTime(const std::string& format, std::string& timeStr)
{
    char buf[64];
    time_t timeNow;
    time(&timeNow);
    const struct tm* tmNow = localtime(&timeNow);
    // localtime() may return NULL; strftime() returns 0 when the text does not fit
    const size_t len = tmNow ? strftime(buf, sizeof(buf), format.c_str(), tmNow) : 0;
    timeStr.assign(buf, len);
}
| { 22 | uint16_t uniCh; 23 | DagType dag; 24 | const DictUnit * pInfo; 25 | double weight; 26 | size_t nextPos; 27 | SegmentChar():uniCh(0), pInfo(NULL), weight(0.0), nextPos(0) 28 | {} 29 | }; 30 | 31 | class MPSegment: public SegmentBase 32 | { 33 | protected: 34 | DictTrie _dictTrie; 35 | 36 | public: 37 | MPSegment(){}; 38 | MPSegment(const string& dictPath, const string& userDictPath = "") 39 | { 40 | LIMONP_CHECK(init(dictPath, userDictPath)); 41 | }; 42 | virtual ~MPSegment(){}; 43 | public: 44 | bool init(const string& dictPath, const string& userDictPath = "") 45 | { 46 | LIMONP_CHECK(_dictTrie.init(dictPath, userDictPath)); 47 | LogInfo("MPSegment init(%s) ok", dictPath.c_str()); 48 | return true; 49 | } 50 | bool isUserDictSingleChineseWord(const Unicode::value_type & value) const 51 | { 52 | return _dictTrie.isUserDictSingleChineseWord(value); 53 | } 54 | public: 55 | using SegmentBase::cut; 56 | virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res)const 57 | { 58 | if(begin == end) 59 | { 60 | return false; 61 | } 62 | 63 | vector words; 64 | words.reserve(end - begin); 65 | if(!cut(begin, end, words)) 66 | { 67 | return false; 68 | } 69 | size_t offset = res.size(); 70 | res.resize(res.size() + words.size()); 71 | for(size_t i = 0; i < words.size(); i++) 72 | { 73 | if(!TransCode::encode(words[i], res[i + offset])) 74 | { 75 | LogError("encode failed."); 76 | res[i + offset].clear(); 77 | } 78 | } 79 | return true; 80 | } 81 | 82 | bool cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector& res) const 83 | { 84 | if(end == begin) 85 | { 86 | return false; 87 | } 88 | vector segmentChars(end - begin); 89 | 90 | //calc DAG 91 | for(size_t i = 0; i < segmentChars.size(); i ++) 92 | { 93 | segmentChars[i].uniCh = *(begin + i); 94 | segmentChars[i].dag.clear(); 95 | _dictTrie.find(begin + i, end, segmentChars[i].dag, i); 96 | segmentChars[i].dag.insert(pair(i, NULL)); 97 | } 98 | 99 | 
_calcDP(segmentChars); 100 | 101 | if(!_cut(segmentChars, res)) 102 | { 103 | LogError("_cut failed."); 104 | return false; 105 | } 106 | 107 | return true; 108 | } 109 | const DictTrie* getDictTrie() const 110 | { 111 | return &_dictTrie; 112 | } 113 | 114 | private: 115 | void _calcDP(vector& SegmentChars) const 116 | { 117 | size_t nextPos; 118 | const DictUnit* p; 119 | double val; 120 | 121 | for(int i = SegmentChars.size() - 1; i >= 0; i--) 122 | { 123 | SegmentChars[i].pInfo = NULL; 124 | SegmentChars[i].weight = MIN_DOUBLE; 125 | for(DagType::const_iterator it = SegmentChars[i].dag.begin(); it != SegmentChars[i].dag.end(); it++) 126 | { 127 | nextPos = it->first; 128 | p = it->second; 129 | val = 0.0; 130 | if(nextPos + 1 < SegmentChars.size()) 131 | { 132 | val += SegmentChars[nextPos + 1].weight; 133 | } 134 | 135 | if(p) 136 | { 137 | val += p->weight; 138 | } 139 | else 140 | { 141 | val += _dictTrie.getMinWeight(); 142 | } 143 | if(val > SegmentChars[i].weight) 144 | { 145 | SegmentChars[i].pInfo = p; 146 | SegmentChars[i].weight = val; 147 | } 148 | } 149 | } 150 | } 151 | bool _cut(const vector& segmentChars, vector& res)const 152 | { 153 | size_t i = 0; 154 | while(i < segmentChars.size()) 155 | { 156 | const DictUnit* p = segmentChars[i].pInfo; 157 | if(p) 158 | { 159 | res.push_back(p->word); 160 | i += p->word.size(); 161 | } 162 | else//single chinese word 163 | { 164 | res.push_back(Unicode(1, segmentChars[i].uniCh)); 165 | i++; 166 | } 167 | } 168 | return true; 169 | } 170 | 171 | 172 | }; 173 | } 174 | 175 | #endif 176 | -------------------------------------------------------------------------------- /src/CppJieba/MixSegment.hpp: -------------------------------------------------------------------------------- 1 | #ifndef CPPJIEBA_MIXSEGMENT_H 2 | #define CPPJIEBA_MIXSEGMENT_H 3 | 4 | #include 5 | #include "MPSegment.hpp" 6 | #include "HMMSegment.hpp" 7 | #include "Limonp/StringUtil.hpp" 8 | 9 | namespace CppJieba 10 | { 11 | class 
MixSegment: public SegmentBase 12 | { 13 | private: 14 | MPSegment _mpSeg; 15 | HMMSegment _hmmSeg; 16 | public: 17 | MixSegment(){}; 18 | MixSegment(const string& mpSegDict, const string& hmmSegDict, const string& userDict = "") 19 | { 20 | LIMONP_CHECK(init(mpSegDict, hmmSegDict, userDict)); 21 | } 22 | virtual ~MixSegment(){} 23 | public: 24 | bool init(const string& mpSegDict, const string& hmmSegDict, const string& userDict = "") 25 | { 26 | LIMONP_CHECK(_mpSeg.init(mpSegDict, userDict)); 27 | LIMONP_CHECK(_hmmSeg.init(hmmSegDict)); 28 | LogInfo("MixSegment init(%s, %s)", mpSegDict.c_str(), hmmSegDict.c_str()); 29 | return true; 30 | } 31 | public: 32 | using SegmentBase::cut; 33 | public: 34 | virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res) const 35 | { 36 | vector words; 37 | words.reserve(end - begin); 38 | if(!_mpSeg.cut(begin, end, words)) 39 | { 40 | LogError("mpSeg cutDAG failed."); 41 | return false; 42 | } 43 | 44 | vector hmmRes; 45 | hmmRes.reserve(end - begin); 46 | Unicode piece; 47 | piece.reserve(end - begin); 48 | for (size_t i = 0, j = 0; i < words.size(); i++) 49 | { 50 | //if mp get a word, it's ok, put it into result 51 | if (1 != words[i].size() || (words[i].size() == 1 && _mpSeg.isUserDictSingleChineseWord(words[i][0]))) 52 | { 53 | res.push_back(words[i]); 54 | continue; 55 | } 56 | 57 | // if mp get a single one and it is not in userdict, collect it in sequence 58 | j = i; 59 | while (j < words.size() && 1 == words[j].size() && !_mpSeg.isUserDictSingleChineseWord(words[j][0])) 60 | { 61 | piece.push_back(words[j][0]); 62 | j++; 63 | } 64 | 65 | // cut the sequence with hmm 66 | if (!_hmmSeg.cut(piece.begin(), piece.end(), hmmRes)) 67 | { 68 | LogError("_hmmSeg cut failed."); 69 | return false; 70 | } 71 | 72 | //put hmm result to result 73 | for (size_t k = 0; k < hmmRes.size(); k++) 74 | { 75 | res.push_back(hmmRes[k]); 76 | } 77 | 78 | //clear tmp vars 79 | piece.clear(); 80 | hmmRes.clear(); 
81 | 82 | //let i jump over this piece 83 | i = j - 1; 84 | } 85 | return true; 86 | } 87 | 88 | virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res)const 89 | { 90 | if(begin == end) 91 | { 92 | return false; 93 | } 94 | 95 | vector uRes; 96 | uRes.reserve(end - begin); 97 | if (!cut(begin, end, uRes)) 98 | { 99 | return false; 100 | } 101 | 102 | size_t offset = res.size(); 103 | res.resize(res.size() + uRes.size()); 104 | for(size_t i = 0; i < uRes.size(); i ++, offset++) 105 | { 106 | if(!TransCode::encode(uRes[i], res[offset])) 107 | { 108 | LogError("encode failed."); 109 | } 110 | } 111 | return true; 112 | } 113 | 114 | const DictTrie* getDictTrie() const 115 | { 116 | return _mpSeg.getDictTrie(); 117 | } 118 | }; 119 | } 120 | 121 | #endif 122 | -------------------------------------------------------------------------------- /src/CppJieba/PosTagger.hpp: -------------------------------------------------------------------------------- 1 | #ifndef CPPJIEBA_POS_TAGGING_H 2 | #define CPPJIEBA_POS_TAGGING_H 3 | 4 | #include "MixSegment.hpp" 5 | #include "Limonp/StringUtil.hpp" 6 | #include "DictTrie.hpp" 7 | 8 | namespace CppJieba 9 | { 10 | using namespace Limonp; 11 | 12 | class PosTagger 13 | { 14 | private: 15 | MixSegment _segment; 16 | DictTrie _dictTrie; 17 | 18 | public: 19 | PosTagger(){}; 20 | PosTagger(const string& dictPath, const string& hmmFilePath, const string& charStatus, const string& startProb, const string& emitProb, const string& endProb, const string& transProb) 21 | { 22 | LIMONP_CHECK(init(dictPath, hmmFilePath, charStatus, startProb, emitProb, endProb, transProb)); 23 | }; 24 | ~PosTagger(){}; 25 | public: 26 | bool init(const string& dictPath, const string& hmmFilePath, const string& charStatus, const string& startProb, const string& emitProb, const string& endProb, const string& transProb) 27 | { 28 | LIMONP_CHECK(_dictTrie.init(dictPath)); 29 | LIMONP_CHECK(_segment.init(dictPath, hmmFilePath)); 30 
| return true; 31 | }; 32 | 33 | bool tag(const string& src, vector >& res) 34 | { 35 | vector cutRes; 36 | if (!_segment.cut(src, cutRes)) 37 | { 38 | LogError("_mixSegment cut failed"); 39 | return false; 40 | } 41 | 42 | const DictUnit *tmp = NULL; 43 | Unicode unico; 44 | for (vector::iterator itr = cutRes.begin(); itr != cutRes.end(); ++itr) 45 | { 46 | if (!TransCode::decode(*itr, unico)) 47 | { 48 | LogError("decode failed."); 49 | return false; 50 | } 51 | tmp = _dictTrie.find(unico.begin(), unico.end()); 52 | res.push_back(make_pair(*itr, tmp == NULL ? "x" : tmp->tag)); 53 | } 54 | tmp = NULL; 55 | return !res.empty(); 56 | } 57 | }; 58 | } 59 | 60 | #endif 61 | -------------------------------------------------------------------------------- /src/CppJieba/QuerySegment.hpp: -------------------------------------------------------------------------------- 1 | #ifndef CPPJIEBA_QUERYSEGMENT_H 2 | #define CPPJIEBA_QUERYSEGMENT_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include "Limonp/Logger.hpp" 8 | #include "DictTrie.hpp" 9 | #include "ISegment.hpp" 10 | #include "SegmentBase.hpp" 11 | #include "FullSegment.hpp" 12 | #include "MixSegment.hpp" 13 | #include "TransCode.hpp" 14 | #include "DictTrie.hpp" 15 | 16 | namespace CppJieba 17 | { 18 | class QuerySegment: public SegmentBase 19 | { 20 | private: 21 | MixSegment _mixSeg; 22 | FullSegment _fullSeg; 23 | size_t _maxWordLen; 24 | 25 | public: 26 | QuerySegment(){}; 27 | QuerySegment(const string& dict, const string& model, size_t maxWordLen) 28 | { 29 | init(dict, model, maxWordLen); 30 | }; 31 | virtual ~QuerySegment(){}; 32 | public: 33 | bool init(const string& dict, const string& model, size_t maxWordLen) 34 | { 35 | LIMONP_CHECK(_mixSeg.init(dict, model)); 36 | LIMONP_CHECK(_fullSeg.init(_mixSeg.getDictTrie())); 37 | assert(maxWordLen); 38 | _maxWordLen = maxWordLen; 39 | return true; 40 | } 41 | 42 | public: 43 | using SegmentBase::cut; 44 | 45 | public: 46 | bool cut(Unicode::const_iterator 
begin, Unicode::const_iterator end, vector& res) const 47 | { 48 | if (begin >= end) 49 | { 50 | LogError("begin >= end"); 51 | return false; 52 | } 53 | 54 | //use mix cut first 55 | vector mixRes; 56 | if (!_mixSeg.cut(begin, end, mixRes)) 57 | { 58 | LogError("_mixSeg cut failed."); 59 | return false; 60 | } 61 | 62 | vector fullRes; 63 | for (vector::const_iterator mixResItr = mixRes.begin(); mixResItr != mixRes.end(); mixResItr++) 64 | { 65 | 66 | // if it's too long, cut with _fullSeg, put fullRes in res 67 | if (mixResItr->size() > _maxWordLen) 68 | { 69 | if (_fullSeg.cut(mixResItr->begin(), mixResItr->end(), fullRes)) 70 | { 71 | for (vector::const_iterator fullResItr = fullRes.begin(); fullResItr != fullRes.end(); fullResItr++) 72 | { 73 | res.push_back(*fullResItr); 74 | } 75 | 76 | //clear tmp res 77 | fullRes.clear(); 78 | } 79 | } 80 | else // just use the mix result 81 | { 82 | res.push_back(*mixResItr); 83 | } 84 | } 85 | 86 | return true; 87 | } 88 | 89 | 90 | bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res) const 91 | { 92 | if (begin >= end) 93 | { 94 | LogError("begin >= end"); 95 | return false; 96 | } 97 | 98 | vector uRes; 99 | if (!cut(begin, end, uRes)) 100 | { 101 | LogError("get unicode cut result error."); 102 | return false; 103 | } 104 | 105 | string tmp; 106 | for (vector::const_iterator uItr = uRes.begin(); uItr != uRes.end(); uItr++) 107 | { 108 | if (TransCode::encode(*uItr, tmp)) 109 | { 110 | res.push_back(tmp); 111 | } 112 | else 113 | { 114 | LogError("encode failed."); 115 | } 116 | } 117 | 118 | return true; 119 | } 120 | }; 121 | } 122 | 123 | #endif 124 | -------------------------------------------------------------------------------- /src/CppJieba/SegmentBase.hpp: -------------------------------------------------------------------------------- 1 | #ifndef CPPJIEBA_SEGMENTBASE_H 2 | #define CPPJIEBA_SEGMENTBASE_H 3 | 4 | #include "TransCode.hpp" 5 | #include "Limonp/Logger.hpp" 6 | #include 
"Limonp/NonCopyable.hpp" 7 | #include "Limonp/HandyMacro.hpp" 8 | #include "ISegment.hpp" 9 | #include 10 | 11 | 12 | namespace CppJieba 13 | { 14 | using namespace Limonp; 15 | 16 | //const char* const SPECIAL_CHARS = " \t\n"; 17 | #ifndef CPPJIEBA_GBK 18 | const UnicodeValueType SPECIAL_SYMBOL[] = {32u, 9u, 10u, 12290u, 65292u}; 19 | #else 20 | const UnicodeValueType SPECIAL_SYMBOL[] = {32u, 9u, 10u}; 21 | #endif 22 | 23 | class SegmentBase: public ISegment, public NonCopyable 24 | { 25 | public: 26 | SegmentBase(){_loadSpecialSymbols();}; 27 | virtual ~SegmentBase(){}; 28 | private: 29 | unordered_set _specialSymbols; 30 | private: 31 | void _loadSpecialSymbols() 32 | { 33 | size_t size = sizeof(SPECIAL_SYMBOL)/sizeof(*SPECIAL_SYMBOL); 34 | for(size_t i = 0; i < size; i ++) 35 | { 36 | _specialSymbols.insert(SPECIAL_SYMBOL[i]); 37 | } 38 | assert(_specialSymbols.size()); 39 | } 40 | 41 | public: 42 | virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res) const = 0; 43 | virtual bool cut(const string& str, vector& res) const 44 | { 45 | res.clear(); 46 | 47 | Unicode unicode; 48 | unicode.reserve(str.size()); 49 | 50 | TransCode::decode(str, unicode); 51 | 52 | Unicode::const_iterator left = unicode.begin(); 53 | Unicode::const_iterator right; 54 | 55 | for(right = unicode.begin(); right != unicode.end(); right++) 56 | { 57 | if(isIn(_specialSymbols, *right)) 58 | { 59 | if(left != right) 60 | { 61 | cut(left, right, res); 62 | } 63 | res.resize(res.size() + 1); 64 | TransCode::encode(right, right + 1, res.back()); 65 | left = right + 1; 66 | } 67 | } 68 | if(left != right) 69 | { 70 | cut(left, right, res); 71 | } 72 | 73 | return true; 74 | } 75 | }; 76 | } 77 | 78 | #endif 79 | -------------------------------------------------------------------------------- /src/CppJieba/TransCode.hpp: -------------------------------------------------------------------------------- 1 | /************************************ 2 | * file enc : 
/**
 * One node of the trie.
 *
 * ptKeyMap : children indexed by the next key element; NULL while the node
 *            has no children.
 * ptValue  : value attached to the key ending at this node; NULL when no key
 *            ends here.
 *
 * The template header and the map's template arguments had been lost from
 * the source text and are restored here. The default constructor
 * null-initialises both pointers, so a freshly created node is valid even
 * before the creating code assigns them.
 */
template <class KeyType, class ValueType>
class TrieNode
{
    public:
        typedef std::unordered_map<KeyType, TrieNode<KeyType, ValueType>* > KeyMapType;
    public:
        KeyMapType * ptKeyMap;
        const ValueType * ptValue;
    public:
        TrieNode(): ptKeyMap(NULL), ptValue(NULL)
        {}
};
~Trie() 37 | { 38 | if(_root) 39 | { 40 | _deleteNode(_root); 41 | } 42 | } 43 | public: 44 | const ValueType* find(typename KeyContainerType::const_iterator begin, typename KeyContainerType::const_iterator end) const 45 | { 46 | typename TrieNodeType::KeyMapType::const_iterator citer; 47 | const TrieNodeType* ptNode = _root; 48 | for(typename KeyContainerType::const_iterator it = begin; it != end; it++) 49 | { 50 | assert(ptNode); 51 | if(NULL == ptNode->ptKeyMap || ptNode->ptKeyMap->end() == (citer = ptNode->ptKeyMap->find(*it))) 52 | { 53 | return NULL; 54 | } 55 | ptNode = citer->second; 56 | } 57 | return ptNode->ptValue; 58 | } 59 | bool find(typename KeyContainerType::const_iterator begin, typename KeyContainerType::const_iterator end, map& ordererMap, size_t offset = 0) const 60 | { 61 | const TrieNodeType * ptNode = _root; 62 | typename TrieNodeType::KeyMapType::const_iterator citer; 63 | ordererMap.clear(); 64 | for(typename KeyContainerType::const_iterator itr = begin; itr != end ; itr++) 65 | { 66 | assert(ptNode); 67 | if(NULL == ptNode->ptKeyMap || ptNode->ptKeyMap->end() == (citer = ptNode->ptKeyMap->find(*itr))) 68 | { 69 | break; 70 | } 71 | ptNode = citer->second; 72 | if(ptNode->ptValue) 73 | { 74 | ordererMap[itr - begin + offset] = ptNode->ptValue; 75 | } 76 | } 77 | return ordererMap.size(); 78 | } 79 | private: 80 | void _createTrie(const KeysContainerType& keys, const ValueContainerType& valuePointers) 81 | { 82 | if(valuePointers.empty() || keys.empty()) 83 | { 84 | return; 85 | } 86 | assert(keys.size() == valuePointers.size()); 87 | 88 | for(size_t i = 0; i < keys.size(); i++) 89 | { 90 | _insertNode(keys[i], valuePointers[i]); 91 | } 92 | } 93 | private: 94 | void _insertNode(const KeyContainerType& key, const ValueType* ptValue) 95 | { 96 | TrieNodeType* ptNode = _root; 97 | 98 | typename TrieNodeType::KeyMapType::const_iterator kmIter; 99 | 100 | for(typename KeyContainerType::const_iterator citer = key.begin(); citer != key.end(); 
citer++) 101 | { 102 | if(NULL == ptNode->ptKeyMap) 103 | { 104 | ptNode->ptKeyMap = new typename TrieNodeType::KeyMapType; 105 | } 106 | kmIter = ptNode->ptKeyMap->find(*citer); 107 | if(ptNode->ptKeyMap->end() == kmIter) 108 | { 109 | TrieNodeType * nextNode = new TrieNodeType; 110 | nextNode->ptKeyMap = NULL; 111 | nextNode->ptValue = NULL; 112 | 113 | (*ptNode->ptKeyMap)[*citer] = nextNode; 114 | ptNode = nextNode; 115 | } 116 | else 117 | { 118 | ptNode = kmIter->second; 119 | } 120 | } 121 | ptNode->ptValue = ptValue; 122 | } 123 | void _deleteNode(TrieNodeType* node) 124 | { 125 | if(!node) 126 | { 127 | return; 128 | } 129 | if(node->ptKeyMap) 130 | { 131 | typename TrieNodeType::KeyMapType::iterator it; 132 | for(it = node->ptKeyMap->begin(); it != node->ptKeyMap->end(); it++) 133 | { 134 | _deleteNode(it->second); 135 | } 136 | delete node->ptKeyMap; 137 | } 138 | delete node; 139 | } 140 | }; 141 | } 142 | 143 | #endif 144 | -------------------------------------------------------------------------------- /src/config: -------------------------------------------------------------------------------- 1 | ngx_addon_name=ngx_http_cppjieba_module 2 | HTTP_MODULES="$HTTP_MODULES ngx_http_cppjieba_module" 3 | NGX_ADDON_SRCS="$NGX_ADDON_SRCS $ngx_addon_dir/ngx_http_cppjieba_module.cpp" 4 | -------------------------------------------------------------------------------- /src/ngx_http_cppjieba_module.cpp: -------------------------------------------------------------------------------- 1 | extern "C" { 2 | #include 3 | #include 4 | #include 5 | } 6 | 7 | #include "CppJieba/MixSegment.hpp" 8 | 9 | using std::string; 10 | using std::vector; 11 | 12 | //static ngx_str_t g_cppjieba_conf_arg1; 13 | //static ngx_str_t g_cppjieba_conf_arg2; 14 | 15 | inline unsigned char fromHex(unsigned char x) 16 | { 17 | return isdigit(x) ? x - '0' : x - 'A' + 10; 18 | } 19 | /* 20 | inline unsigned char toHex(unsigned char x) 21 | { 22 | return x > 9 ? 
x -10 + 'A': x + '0'; 23 | } 24 | inline void URLEncode(const string &sIn, string& sOut) 25 | { 26 | for( size_t ix = 0; ix < sIn.size(); ix++ ) 27 | { 28 | unsigned char buf[4]; 29 | memset( buf, 0, 4 ); 30 | if( isalnum( (unsigned char)sIn[ix] ) ) 31 | { 32 | buf[0] = sIn[ix]; 33 | } 34 | else 35 | { 36 | buf[0] = '%'; 37 | buf[1] = toHex( (unsigned char)sIn[ix] >> 4 ); 38 | buf[2] = toHex( (unsigned char)sIn[ix] % 16); 39 | } 40 | sOut += (char *)buf; 41 | } 42 | }; 43 | */ 44 | 45 | static void URLDecode(const string &sIn, string& sOut) 46 | { 47 | for( size_t ix = 0; ix < sIn.size(); ix++ ) 48 | { 49 | unsigned char ch = 0; 50 | if(sIn[ix]=='%') 51 | { 52 | ch = (fromHex(sIn[ix+1])<<4); 53 | ch |= fromHex(sIn[ix+2]); 54 | ix += 2; 55 | } 56 | else if(sIn[ix] == '+') 57 | { 58 | ch = ' '; 59 | } 60 | else 61 | { 62 | ch = sIn[ix]; 63 | } 64 | sOut += (char)ch; 65 | } 66 | } 67 | CppJieba::MixSegment * g_mix_segment;//(DICT_PATH, HMM_PATH, USER_DICT_PATH); 68 | 69 | typedef struct { 70 | ngx_str_t output_words; 71 | } ngx_http_cppjieba_loc_conf_t; 72 | 73 | // To process HelloWorld command arguments 74 | static char* ngx_http_cppjieba_set_conf(ngx_conf_t* cf, ngx_command_t* cmd, void* conf); 75 | 76 | // Allocate memory for HelloWorld command 77 | static void* ngx_http_cppjieba_create_loc_conf(ngx_conf_t* cf); 78 | 79 | // Copy HelloWorld argument to another place 80 | static char* ngx_http_cppjieba_merge_loc_conf(ngx_conf_t* cf, void* parent, void* child); 81 | 82 | //static ngx_int_t ngx_http_cppjieba_init(ngx_cycle_t *cf); 83 | //static void ngx_http_cppjieba_finalize(ngx_cycle_t *cf); 84 | 85 | static ngx_int_t get_post_content(ngx_http_request_t *r, char * data_buf, size_t content_length); 86 | // Structure for the HelloWorld command 87 | static ngx_command_t ngx_http_cppjieba_commands[] = { 88 | { 89 | ngx_string("cppjieba"), // The command name 90 | NGX_HTTP_LOC_CONF | NGX_CONF_TAKE3, 91 | ngx_http_cppjieba_set_conf, // The command handler 92 | 
NGX_HTTP_LOC_CONF_OFFSET, 93 | offsetof(ngx_http_cppjieba_loc_conf_t, output_words), 94 | NULL 95 | }, 96 | ngx_null_command 97 | }; 98 | 99 | // Structure for the HelloWorld context 100 | static ngx_http_module_t ngx_http_cppjieba_module_ctx = { 101 | NULL, 102 | NULL, 103 | NULL, 104 | NULL, 105 | NULL, 106 | NULL, 107 | ngx_http_cppjieba_create_loc_conf, 108 | ngx_http_cppjieba_merge_loc_conf 109 | }; 110 | 111 | // Structure for the HelloWorld module, the most important thing 112 | ngx_module_t ngx_http_cppjieba_module = { 113 | NGX_MODULE_V1, 114 | &ngx_http_cppjieba_module_ctx, 115 | ngx_http_cppjieba_commands, 116 | NGX_HTTP_MODULE, 117 | NULL, 118 | NULL, 119 | NULL, //ngx_http_cppjieba_init, 120 | NULL, 121 | NULL, 122 | NULL, //ngx_http_cppjieba_finalize, 123 | NULL, 124 | NGX_MODULE_V1_PADDING 125 | }; 126 | 127 | static void ngx_http_cppjieba_post_handler(ngx_http_request_t* r); 128 | 129 | static ngx_int_t ngx_http_cppjieba_handler(ngx_http_request_t* r) { 130 | ngx_int_t rc; 131 | ngx_buf_t* b; 132 | ngx_chain_t out; 133 | 134 | if(r->method & NGX_HTTP_POST) { 135 | ngx_int_t rc = ngx_http_read_client_request_body(r, ngx_http_cppjieba_post_handler); 136 | if (rc >= NGX_HTTP_SPECIAL_RESPONSE) { 137 | return rc; 138 | } 139 | return NGX_DONE; 140 | } 141 | 142 | if(!(r->method & NGX_HTTP_GET)) { 143 | return NGX_HTTP_NOT_ALLOWED; 144 | } 145 | 146 | // args is s=xxxxx 147 | ngx_str_t value; 148 | if (NGX_OK != ngx_http_arg(r, (u_char*)"s", 1, &value)) { 149 | return NGX_HTTP_BAD_REQUEST; 150 | } 151 | 152 | string sentence; 153 | URLDecode(string((char*)value.data, value.len), sentence); 154 | vector words; 155 | g_mix_segment->cut(sentence, words); 156 | string response; 157 | //string tmp; 158 | //tmp << words; 159 | //URLEncode(tmp, response); 160 | response << words; 161 | 162 | 163 | b = ngx_create_temp_buf(r->pool, response.size()); 164 | if (b == NULL) { 165 | return NGX_HTTP_INTERNAL_SERVER_ERROR; 166 | } 167 | 168 | ngx_memcpy(b->pos, 
response.c_str(), response.size()); 169 | b->last = b->pos + response.size(); 170 | b->last_buf = 1; 171 | 172 | out.buf = b; 173 | out.next = NULL; 174 | 175 | r->headers_out.status = NGX_HTTP_OK; 176 | r->headers_out.content_length_n = response.size(); 177 | ngx_str_t type = ngx_string("text/plain"); 178 | r->headers_out.content_type = type; 179 | 180 | rc = ngx_http_send_header(r); 181 | if (rc == NGX_ERROR || rc > NGX_OK || r->header_only) { 182 | return rc; 183 | } 184 | 185 | return ngx_http_output_filter(r, &out); 186 | } 187 | 188 | static void* ngx_http_cppjieba_create_loc_conf(ngx_conf_t* cf) { 189 | ngx_http_cppjieba_loc_conf_t* conf; 190 | 191 | conf = (ngx_http_cppjieba_loc_conf_t*)ngx_pcalloc(cf->pool, sizeof(ngx_http_cppjieba_loc_conf_t)); 192 | if (conf == NULL) { 193 | return NGX_CONF_ERROR; 194 | } 195 | conf->output_words.len = 0; 196 | conf->output_words.data = NULL; 197 | 198 | return conf; 199 | } 200 | 201 | static char* ngx_http_cppjieba_merge_loc_conf(ngx_conf_t* cf, void* parent, void* child) { 202 | ngx_http_cppjieba_loc_conf_t* prev = (ngx_http_cppjieba_loc_conf_t*)parent; 203 | ngx_http_cppjieba_loc_conf_t* conf = (ngx_http_cppjieba_loc_conf_t*)child; 204 | ngx_conf_merge_str_value(conf->output_words, prev->output_words, "Nginx"); 205 | return NGX_CONF_OK; 206 | } 207 | 208 | static char* ngx_http_cppjieba_set_conf(ngx_conf_t* cf, ngx_command_t* cmd, void* conf) { 209 | ngx_http_core_loc_conf_t* clcf; 210 | clcf = (ngx_http_core_loc_conf_t*)ngx_http_conf_get_module_loc_conf(cf, ngx_http_core_module); 211 | clcf->handler = ngx_http_cppjieba_handler; 212 | ngx_conf_set_str_slot(cf, cmd, conf); 213 | if (cf->args->nelts != 4) { 214 | ngx_log_error(NGX_LOG_ERR, cf->log, 0, " [the number of conf'a args is not 4] "); 215 | return (char*)NGX_CONF_ERROR; 216 | } 217 | ngx_str_t * value = (ngx_str_t *)cf->args->elts; 218 | 219 | g_mix_segment = new CppJieba::MixSegment( 220 | string((const char *)value[1].data, value[1].len), 221 | string((const 
char *)value[2].data, value[2].len), 222 | string((const char *)value[3].data, value[3].len)); 223 | return NGX_CONF_OK; 224 | } 225 | 226 | //static ngx_int_t ngx_http_cppjieba_init(ngx_cycle_t *cf) 227 | //{ 228 | // g_mix_segment = new CppJieba::MixSegment( 229 | // string((const char *)g_cppjieba_conf_arg1.data, g_cppjieba_conf_arg1.len), 230 | // string((const char *)g_cppjieba_conf_arg2.data, g_cppjieba_conf_arg2.len)); 231 | // return NGX_OK; 232 | //} 233 | 234 | //static void ngx_http_cppjieba_finalize(ngx_cycle_t *cf) 235 | //{ 236 | // delete g_mix_segment; 237 | // g_mix_segment = NULL; 238 | //} 239 | 240 | static ngx_int_t get_post_content(ngx_http_request_t *r, char * data_buf, size_t content_length) { 241 | ngx_log_error(NGX_LOG_NOTICE, r->connection->log, 0, "[get_post_content] [content_length:%d]", content_length); //DEBUG 242 | if(r->request_body == NULL) { 243 | ngx_log_error(NGX_LOG_ERR, r->connection->log, 0, "reqeust_body:null"); 244 | return NGX_ERROR; 245 | } 246 | ngx_chain_t* bufs = r->request_body->bufs; 247 | ngx_buf_t* buf = NULL; 248 | size_t body_length = 0; 249 | size_t buf_length; 250 | while(bufs) { 251 | buf = bufs->buf; 252 | bufs = bufs->next; 253 | buf_length = buf->last - buf->pos; 254 | if(body_length + buf_length > content_length) { 255 | memcpy(data_buf + body_length, buf->pos, content_length - body_length); 256 | body_length = content_length; 257 | break; 258 | } 259 | memcpy(data_buf + body_length, buf->pos, buf->last - buf->pos); 260 | body_length += buf->last - buf->pos; 261 | } 262 | if(body_length != content_length) { 263 | ngx_log_error(NGX_LOG_ERR, r->connection->log, 0, "get_post_content's body_length != content_length in headers"); 264 | return NGX_ERROR; 265 | } 266 | return NGX_OK; 267 | } 268 | 269 | 270 | static ngx_int_t ngx_http_cppjieba_send_response(ngx_http_request_t * r, const char* type, const char* data_buf, size_t len) { 271 | ngx_int_t rc; 272 | ngx_buf_t* b; 273 | ngx_chain_t out; 274 | 275 | b = 
ngx_create_temp_buf(r->pool, len); 276 | if (b == NULL) { 277 | return NGX_HTTP_INTERNAL_SERVER_ERROR; 278 | } 279 | 280 | ngx_memcpy(b->pos, data_buf, len); 281 | b->last = b->pos + len; 282 | b->last_buf = 1; 283 | 284 | out.buf = b; 285 | out.next = NULL; 286 | 287 | r->headers_out.status = NGX_HTTP_OK; 288 | r->headers_out.content_length_n = len; 289 | r->headers_out.content_type.data = (u_char*) type; 290 | r->headers_out.content_type.len = strlen(type); 291 | 292 | rc = ngx_http_send_header(r); 293 | if (rc == NGX_ERROR || rc > NGX_OK || r->header_only) { 294 | return rc; 295 | } 296 | 297 | return ngx_http_output_filter(r, &out); 298 | } 299 | 300 | static void ngx_http_cppjieba_post_handler(ngx_http_request_t* r) { 301 | if(r->headers_in.content_length_n == 0) { 302 | ngx_log_error(NGX_LOG_ERR, r->connection->log, 0, "r->headers_in.content_length_n is 0"); 303 | ngx_http_finalize_request(r, NGX_ERROR); 304 | return; 305 | } 306 | ngx_int_t rc; 307 | char * data_buf = NULL; 308 | data_buf = (char*) ngx_pcalloc(r->pool, r->headers_in.content_length_n + 1); 309 | if (data_buf == NULL) { 310 | ngx_http_finalize_request(r, NGX_ERROR); 311 | return; 312 | } 313 | 314 | if (NGX_ERROR == get_post_content(r, data_buf, r->headers_in.content_length_n)) { 315 | ngx_http_finalize_request(r, NGX_ERROR); 316 | return; 317 | } 318 | 319 | string sentence; 320 | URLDecode(data_buf, sentence); 321 | vector words; 322 | g_mix_segment->cut(sentence, words); 323 | string response; 324 | response << words; 325 | 326 | 327 | rc = ngx_http_cppjieba_send_response( 328 | r, 329 | "text/plain", 330 | response.c_str(), 331 | response.size()); 332 | 333 | ngx_log_error(NGX_LOG_INFO, r->connection->log, 0, "[ngx_http_cppjieba_send_response] [response size:%d]", response.size()); 334 | ngx_http_finalize_request(r, rc); 335 | } 336 | --------------------------------------------------------------------------------