├── ChangeLog.md
├── LICENSE
├── README.md
├── dict
    ├── hmm_model.utf8
    ├── jieba.dict.utf8
    └── user.dict.utf8
└── src
    ├── CppJieba
        ├── DictTrie.hpp
        ├── FullSegment.hpp
        ├── HMMSegment.hpp
        ├── ISegment.hpp
        ├── KeywordExtractor.hpp
        ├── Limonp
        │   ├── Config.hpp
        │   ├── HandyMacro.hpp
        │   ├── InitOnOff.hpp
        │   ├── LocalVector.hpp
        │   ├── Logger.hpp
        │   ├── MysqlClient.hpp
        │   ├── NonCopyable.hpp
        │   ├── StdExtension.hpp
        │   └── StringUtil.hpp
        ├── MPSegment.hpp
        ├── MixSegment.hpp
        ├── PosTagger.hpp
        ├── QuerySegment.hpp
        ├── SegmentBase.hpp
        ├── TransCode.hpp
        └── Trie.hpp
    ├── config
    └── ngx_http_cppjieba_module.cpp


/ChangeLog.md:
--------------------------------------------------------------------------------
1 | # ChangeLog
2 | 
3 | ## 0.1.0 
4 | 
5 | * 支持 GET 和 POST 的分词请求
6 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2013 Yanyi Wu
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of
 6 | this software and associated documentation files (the "Software"), to deal in
 7 | the Software without restriction, including without limitation the rights to
 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
 9 | the Software, and to permit persons to whom the Software is furnished to do so,
10 | subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
17 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
18 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
19 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
20 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # ngx\_http\_cppjieba\_module
 2 | 
 3 | ## 简介
 4 | 
 5 | [CppJieba] 的 `Nginx` 扩展模块。
 6 | 需要了解源码的可以参看 [NginxModuleDevelopment] 。
 7 | 
 8 | ## 支持Docker
 9 | 
10 | ```
11 | docker pull docker.cn/yanyiwu/nginx_cppjieba_server
12 | ```
13 | 
14 | ## 用法
15 | 
16 | 
17 | ### 安装和配置
18 | 
19 | 以下用法假设 `ngx_http_cppjieba_module` 下载后存放的地址是 `/tmp/ngx_http_cppjieba_module` (这个地址在 `Nginx` 编译时和词典加载时候会用到)   
20 | 
21 | #### 下载源码：
22 | 
23 | ```
24 | git clone git://github.com/aszxqw/ngx_http_cppjieba_module.git /tmp/ngx_http_cppjieba_module
25 | ```
26 | 
27 | #### 进入 `Nginx` 源码目录：
28 | 
29 | ```
30 | ./configure --add-module=/tmp/ngx_http_cppjieba_module/src
31 | ```
32 | 
33 | 因为 `ngx_http_cppjieba_module` 是 `C++` 源码，所以作为 `Nginx` 模块编译的时候需要修改 `objs/Makefile`：
34 | 
35 | ```
36 | # 1. 在 "CC = gcc" 下面增加一行，如下
37 | CXX = g++
38 | # 2. 修改链接器为 g++ ， 如下
39 | LINK = $(CXX)
40 | # 3. 修改 ngx_http_cppjieba_module.cpp 的编译器，从 $(CC) 改为 $(CXX) ， 如下
41 | $(CXX) -c $(CFLAGS)  $(ALL_INCS) \
42 |     -o objs/addon/src/ngx_http_cppjieba_module.o \
43 |     /tmp/ngx_http_cppjieba_module/src/ngx_http_cppjieba_module.cpp
44 | 
45 | ```
46 | 
47 | 这三步做完就可以 `make && sudo make install` nginx 了。
48 | 
49 | ```
50 | # 4. 修改 Nginx 配置文件 /usr/local/nginx/conf/nginx.conf
51 | location /cppjieba {
52 |     cppjieba /tmp/ngx_http_cppjieba_module/dict/jieba.dict.utf8 /tmp/ngx_http_cppjieba_module/dict/hmm_model.utf8 /tmp/ngx_http_cppjieba_module/dict/user.dict.utf8;
53 | }
54 | ```
55 | 
56 | 如果 `ngx_http_cppjieba_module` 的源码路径不是 `/tmp/ngx_http_cppjieba_module`， 将上述过程的 `/tmp/xxx` 改为 `/your/path/xxx` 即可。
57 | 
58 | ## 启动 Nginx
59 | 
60 | ```
61 | /usr/local/nginx/sbin/nginx
62 | ```
63 | 
64 | ## 测试 
65 | 
66 | ### GET
67 | 
68 | ```
69 | curl "http://127.0.0.1/cppjieba?s=长春市长春药店"
70 | ```
71 | 
72 | ### POST
73 | 
74 | ```
75 | curl --data "长春市长春药店" "http://127.0.0.1/cppjieba"
76 | ```
77 | 
78 | 预期结果：
79 | 
80 | ```
81 | ["长春市", "长春", "药店"]
82 | ```
83 | 
84 | 用 `chrome` 打开上述链接也可以，不过要 **记得** 设置浏览器的页面编码方式为 `utf-8` 。
85 | 
86 | 
87 | ## 客服
88 | 
89 | `i@yanyiwu.com`
90 | 
91 | [CppJieba]:https://github.com/aszxqw/cppjieba
92 | [NginxModuleDevelopment]:http://yanyiwu.com/work/2014/09/21/nginx-module-development-stuff.html
93 | 


--------------------------------------------------------------------------------
/dict/user.dict.utf8:
--------------------------------------------------------------------------------
1 | 云计算
2 | 韩玉鉴赏
3 | 


--------------------------------------------------------------------------------
/src/CppJieba/DictTrie.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef CPPJIEBA_DICT_TRIE_HPP
  2 | #define CPPJIEBA_DICT_TRIE_HPP
  3 | 
  4 | #include <iostream>
  5 | #include <fstream>
  6 | #include <map>
  7 | #include <cstring>
  8 | #include <stdint.h>
  9 | #include <cmath>
 10 | #include <limits>
 11 | #include "Limonp/StringUtil.hpp"
 12 | #include "Limonp/Logger.hpp"
 13 | #include "TransCode.hpp"
 14 | #include "Trie.hpp"
 15 | 
 16 | 
 17 | 
 18 | namespace CppJieba
 19 | {
 20 |     using namespace Limonp;
 21 |     const double MIN_DOUBLE = -3.14e+100;
 22 |     const double MAX_DOUBLE = 3.14e+100;
 23 |     const size_t DICT_COLUMN_NUM = 3;
 24 |     const char* const UNKNOWN_TAG = "x";
 25 | 
 26 | 
 27 |     struct DictUnit // One dictionary entry: a decoded word together with its weight and tag.
 28 |     {
 29 |         Unicode word; // the word as decoded by TransCode::decode in the loaders
 30 |         double weight; // main-dict entries: parsed from column 2, later replaced by log(weight / sum) in _calculateWeight; user-dict entries: the default weight passed to _loadUserDict
 31 |         string tag; // main-dict entries: column 3 of the line; user-dict entries: the default tag passed to _loadUserDict
 32 |     };
 33 | 
 34 |     inline ostream & operator << (ostream& os, const DictUnit& unit) // Streams a DictUnit as "<word> <tag> <weight>", the weight with three decimals.
 35 |     {
 36 |         string encodedWord; // receives unit.word through the `string << Unicode` operator declared outside this block
 37 |         encodedWord << unit.word;
 38 |         return os << string_format("%s %s %.3lf", encodedWord.c_str(), unit.tag.c_str(), unit.weight);
 39 |     }
 40 | 
 41 |     typedef map<size_t, const DictUnit*> DagType;
 42 | 
 43 |     class DictTrie
 44 |     {
 45 |         public:
 46 |             typedef Trie<Unicode::value_type, DictUnit, Unicode, vector<Unicode>, vector<const DictUnit*> > TrieType;
 47 |         private:
 48 |             vector<DictUnit> _nodeInfos;
 49 |             TrieType * _trie;
 50 | 
 51 |             double _minWeight;
 52 |         private:
 53 |             unordered_set<Unicode::value_type> _userDictSingleChineseWord;
 54 |         public:
 55 |             bool isUserDictSingleChineseWord(const Unicode::value_type& word) const // True when `word` was loaded from the user dict as a one-character entry.
 56 |             {
 57 |                 return isIn(_userDictSingleChineseWord, word); // the set is filled by _loadUserDict
 58 |             }
 59 |         public:
 60 |             double getMinWeight() const {return _minWeight;}; // minimum weight computed by init() before user-dict words are appended; MAX_DOUBLE until init() has run
 61 | 
 62 |         public:
 63 |             DictTrie()
 64 |                 : _trie(NULL)            // no trie exists until init() builds one
 65 |                 , _minWeight(MAX_DOUBLE) // placeholder until init() computes the real minimum
 66 |             {
 67 |             }
 68 |             DictTrie(const string& dictPath, const string& userDictPath = "") // Constructs a ready-to-use trie from the main dict and an optional user dict.
 69 |                 : _trie(NULL), _minWeight(MAX_DOUBLE) // same start state as DictTrie(); set directly, because placement-new on `this` re-constructed members that were already alive
 70 |             {
 71 |                 init(dictPath, userDictPath); // loads the dictionaries and builds the trie
 72 |             }
 73 |             ~DictTrie()
 74 |             {
 75 |                 // Releases the trie created by init(), if there is one.
 76 |                 // Deleting a null pointer is a no-op, so an instance that was
 77 |                 // never initialised (_trie == NULL) needs no explicit check.
 78 |                 delete _trie;
 79 |             }
 80 |             
 81 |         public:
 82 |             bool init(const string& dictPath, const string& userDictPath = "") // Loads the main dict and, if a path is given, the user dict, then builds the trie; always returns true.
 83 |             {
 84 |                 assert(!_trie); // init() must not run twice on the same object
 85 |                 _loadDict(dictPath, _nodeInfos);
 86 |                 _calculateWeight(_nodeInfos); // raw weights become log(weight / sum)
 87 |                 _minWeight = _findMinWeight(_nodeInfos); // taken before user words are appended
 88 |                 
 89 |                 if(userDictPath.size())
 90 |                 {
 91 |                     double maxWeight = _findMaxWeight(_nodeInfos); // every user word receives the largest main-dict weight
 92 |                     _loadUserDict(userDictPath, maxWeight, UNKNOWN_TAG);
 93 |                 }
 94 |                 _shrink(_nodeInfos);
 95 |                 _trie = _creatTrie(_nodeInfos); // _creatTrie hands the trie pointers to elements of _nodeInfos, so the vector must not be modified afterwards
 96 |                 assert(_trie);
 97 |                 return true;
 98 |             }
 99 | 
100 |         public:
101 |             const DictUnit* find(Unicode::const_iterator begin, Unicode::const_iterator end) const // Forwards the lookup of [begin, end) to the trie; requires a prior init(), since _trie is dereferenced unchecked.
102 |             {
103 |                 return _trie->find(begin, end); // NOTE(review): the result for a missing word is defined in Trie.hpp — presumably NULL; confirm there
104 |             }
105 |             bool find(Unicode::const_iterator begin, Unicode::const_iterator end, DagType& dag, size_t offset = 0) const // Forwards to the trie overload that fills `dag`; requires a prior init(), since _trie is dereferenced unchecked.
106 |             {
107 |                 return _trie->find(begin, end, dag, offset); // NOTE(review): the meaning of `offset` and of the dag keys is defined in Trie.hpp — confirm there
108 |             }
109 | 
110 | 
111 |         private:
112 |             TrieType * _creatTrie(const vector<DictUnit>& dictUnits) // Builds a heap-allocated trie from the units; the returned pointer is stored in _trie by init() and deleted in ~DictTrie.
113 |             {
114 |                 assert(dictUnits.size()); // an empty dictionary is treated as a programming error
115 |                 vector<Unicode> words;
116 |                 vector<const DictUnit*> valuePointers;
117 |                 for(size_t i = 0 ; i < dictUnits.size(); i ++)
118 |                 {
119 |                     words.push_back(dictUnits[i].word);
120 |                     valuePointers.push_back(&dictUnits[i]); // address of the element inside dictUnits, so that vector must stay unchanged while the trie is in use
121 |                 }
122 | 
123 |                 TrieType * trie = new TrieType(words, valuePointers); // words[i] is paired with valuePointers[i]
124 |                 return trie;
125 |             }
126 |             void _loadUserDict(const string& filePath, double defaultWeight, const string& defaultTag) // Appends each line of the user dict to _nodeInfos as one word carrying defaultWeight and defaultTag.
127 |             {
128 |                 ifstream ifs(filePath.c_str());
129 |                 assert(ifs);
130 |                 string line;
131 |                 DictUnit nodeInfo;
132 |                 size_t lineno; // declared outside the loop because the total is logged afterwards
133 |                 for(lineno = 0; getline(ifs, line); lineno++)
134 |                 {
135 |                     if(!TransCode::decode(line, nodeInfo.word))
136 |                     {
137 |                         LogError("line[%u:%s] illegal.", static_cast<unsigned int>(lineno), line.c_str()); // %u expects unsigned int; passing a size_t through varargs is undefined
138 |                         continue;
139 |                     }
140 |                     if(nodeInfo.word.size() == 1)
141 |                     {
142 |                         _userDictSingleChineseWord.insert(nodeInfo.word[0]); // remembered for isUserDictSingleChineseWord()
143 |                     }
144 |                     nodeInfo.weight = defaultWeight; 
145 |                     nodeInfo.tag = defaultTag;
146 |                     _nodeInfos.push_back(nodeInfo);
147 |                 }
148 |                 LogInfo("load userdict[%s] ok. lines[%u]", filePath.c_str(), static_cast<unsigned int>(lineno)); // same cast as above
149 |             }
150 |             void _loadDict(const string& filePath, vector<DictUnit>& nodeInfos) const // Parses "word weight tag" lines of the main dict and appends one DictUnit per valid line to nodeInfos.
151 |             {
152 |                 ifstream ifs(filePath.c_str());
153 |                 assert(ifs);
154 |                 string line;
155 |                 vector<string> buf;
156 | 
157 |                 DictUnit nodeInfo;
158 |                 for(size_t lineno = 0 ; getline(ifs, line); lineno++)
159 |                 {
160 |                     split(line, buf, " ");
161 |                     // Reject malformed lines at run time: an assert vanishes under NDEBUG,
162 |                     // and buf[1] / buf[2] below would then be read out of bounds.
163 |                     if(buf.size() != DICT_COLUMN_NUM || !TransCode::decode(buf[0], nodeInfo.word))
164 |                     {
165 |                         LogError("line[%u:%s] illegal.", static_cast<unsigned int>(lineno), line.c_str()); // %u expects unsigned int, not size_t
166 |                         continue;
167 |                     }
168 |                     nodeInfo.weight = atof(buf[1].c_str());
169 |                     nodeInfo.tag = buf[2];
170 |                     
171 |                     nodeInfos.push_back(nodeInfo);
172 |                 }
173 |             }
174 |             double _findMinWeight(const vector<DictUnit>& nodeInfos) const // Folds min() over all weights, starting from MAX_DOUBLE (which is the result for an empty vector).
175 |             {
176 |                 double lowest = MAX_DOUBLE;
177 |                 for(vector<DictUnit>::const_iterator unit = nodeInfos.begin(); unit != nodeInfos.end(); ++unit)
178 |                 {
179 |                     lowest = min(unit->weight, lowest);
180 |                 }
181 |                 return lowest;
182 |             }
183 |             double _findMaxWeight(const vector<DictUnit>& nodeInfos) const // Folds max() over all weights, starting from MIN_DOUBLE (which is the result for an empty vector).
184 |             {
185 |                 double highest = MIN_DOUBLE;
186 |                 for(vector<DictUnit>::const_iterator unit = nodeInfos.begin(); unit != nodeInfos.end(); ++unit)
187 |                 {
188 |                     highest = max(unit->weight, highest);
189 |                 }
190 |                 return highest;
191 |             }
192 | 
193 |             void _calculateWeight(vector<DictUnit>& nodeInfos) const // Replaces every weight w, in place, by log(w / sum of all weights).
194 |             {
195 |                 double sum = 0.0;
196 |                 for(size_t i = 0; i < nodeInfos.size(); i++)
197 |                 {
198 |                     sum += nodeInfos[i].weight;
199 |                 }
200 |                 assert(sum); // a zero total would make the division below meaningless
201 |                 for(size_t i = 0; i < nodeInfos.size(); i++)
202 |                 {
203 |                     DictUnit& nodeInfo = nodeInfos[i];
204 |                     assert(nodeInfo.weight); // a zero weight would yield log(0)
205 |                     nodeInfo.weight = log(double(nodeInfo.weight)/double(sum));
206 |                 }
207 |             }
208 | 
209 |             void _shrink(vector<DictUnit>& units) const // Replaces the storage of `units` with that of a freshly built copy; the elements are unchanged.
210 |             {
211 |                 vector<DictUnit>(units.begin(), units.end()).swap(units); // copy into a temporary, then swap buffers — presumably to drop spare capacity left by push_back
212 |             }
213 | 
214 | 
215 |     };
216 | }
217 | 
218 | #endif
219 | 


--------------------------------------------------------------------------------
/src/CppJieba/FullSegment.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef CPPJIEBA_FULLSEGMENT_H
  2 | #define CPPJIEBA_FULLSEGMENT_H
  3 | 
  4 | #include <algorithm>
  5 | #include <set>
  6 | #include <cassert>
  7 | #include "Limonp/Logger.hpp"
  8 | #include "DictTrie.hpp"
  9 | #include "ISegment.hpp"
 10 | #include "SegmentBase.hpp"
 11 | #include "TransCode.hpp"
 12 | 
 13 | namespace CppJieba
 14 | {
 15 |     class FullSegment: public SegmentBase
 16 |     {
 17 |         private:
 18 |             const DictTrie* _dictTrie;
 19 |             bool _isBorrowed;
 20 |         public:
 21 |             FullSegment() // Creates a segmenter without a dictionary; one of the init() overloads must be called before cut().
 22 |             {
 23 |                 _dictTrie = NULL;
 24 |                 _isBorrowed = false;
 25 |             }
 26 |             explicit FullSegment(const string& dictPath) // Builds and owns a DictTrie loaded from dictPath.
 27 |             {
 28 |                 _dictTrie = NULL; // init() asserts that no trie has been set yet
 29 |                 init(dictPath); // also sets _isBorrowed to false
 30 |             }
 31 |             explicit FullSegment(const DictTrie* dictTrie)  // Uses a caller-owned trie; this object never deletes it.
 32 |             {
 33 |                 _dictTrie = NULL; // init() asserts that no trie has been set yet
 34 |                 init(dictTrie); // also sets _isBorrowed to true
 35 |             }
 36 |             virtual ~FullSegment() // Deletes the trie only when this object allocated it, i.e. after init(const string&).
 37 |             {
 38 |                 if(_dictTrie && ! _isBorrowed) 
 39 |                 {
 40 |                     delete _dictTrie;
 41 |                 }
 42 | 
 43 |             };
 44 |         public:
 45 |             bool init(const string& dictPath) // Allocates a DictTrie from dictPath and takes ownership of it; always returns true.
 46 |             {
 47 |                 assert(_dictTrie == NULL); // initialising twice is a programming error
 48 |                 _dictTrie = new DictTrie(dictPath);
 49 |                 _isBorrowed = false; // the destructor will delete the trie
 50 |                 return true;
 51 |             }
 52 |             bool init(const DictTrie* dictTrie)  // Adopts a caller-owned trie without taking ownership; always returns true.
 53 |             {
 54 |                 assert(_dictTrie == NULL); // initialising twice is a programming error
 55 |                 assert(dictTrie);
 56 |                 _dictTrie = dictTrie;
 57 |                 _isBorrowed = true; // the destructor must leave the trie alone
 58 |                 return true;
 59 |             }
 60 | 
 61 |         public:
 62 |             using SegmentBase::cut;
 63 | 
 64 |         public:
 65 |             bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const // Appends to res the dictionary words found at every start position of [begin, end); returns false for an empty or inverted range.
 66 |             {
 67 |                 assert(_dictTrie);
 68 |                 if (begin >= end)
 69 |                 {
 70 |                     LogError("begin >= end");
 71 |                     return false;
 72 |                 }
 73 | 
 74 |                 // result of searching in the trie
 75 |                 DagType tRes;
 76 | 
 77 |                 // one past the last position already covered by a word counted so far
 78 |                 int maxIdx = 0;
 79 | 
 80 |                 // always equal to (uItr - begin)
 81 |                 int uIdx = 0;
 82 | 
 83 |                 // temporary: length of the current match
 84 |                 int wordLen = 0;
 85 |                 for (Unicode::const_iterator uItr = begin; uItr != end; uItr++)
 86 |                 {
 87 |                     // find the words that start at uItr
 88 |                     if (_dictTrie->find(uItr, end, tRes, 0))
 89 |                     {
 90 |                         for(DagType::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++)
 91 |                         //for (vector<pair<size_t, const DictUnit*> >::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++)
 92 |                         {
 93 |                             wordLen = itr->second->word.size();
 94 |                             if (wordLen >= 2 || (tRes.size() == 1 && maxIdx <= uIdx)) // keep multi-character words; keep a one-character word only when it is the sole match and the position is not yet covered
 95 |                             {
 96 |                                 res.push_back(itr->second->word);
 97 |                             }
 98 |                             maxIdx = uIdx+wordLen > maxIdx ? uIdx+wordLen : maxIdx; // extend the covered range, even for matches that were not emitted
 99 |                         }
100 |                         tRes.clear(); // reuse the container for the next start position
101 |                     }
102 |                     else // no word starts at uItr
103 |                     {
104 |                         if (maxIdx <= uIdx) // not covered by any previous result
105 |                         {
106 |                             // emit the character itself as a one-character word
107 |                             res.push_back(Unicode(1, *uItr));
108 | 
109 |                             // mark it as covered
110 |                             ++maxIdx;
111 |                         }
112 |                     }
113 |                     ++uIdx;
114 |                 }
115 | 
116 |                 return true;
117 |             }
118 | 
119 |             bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res) const // Same segmentation as the vector<Unicode> overload, with every word encoded to a string before it is appended to res.
120 |             {
121 |                 assert(_dictTrie);
122 |                 if (begin >= end) // the Unicode overload repeats this check; here it rejects the range before any work is done
123 |                 {
124 |                     LogError("begin >= end");
125 |                     return false;
126 |                 }
127 | 
128 |                 vector<Unicode> uRes;
129 |                 if (!cut(begin, end, uRes))
130 |                 {
131 |                     LogError("get unicode cut result error.");
132 |                     return false;
133 |                 }
134 | 
135 |                 string tmp; // reused as the encode target for every word
136 |                 for (vector<Unicode>::const_iterator uItr = uRes.begin(); uItr != uRes.end(); uItr++)
137 |                 {
138 |                     if (TransCode::encode(*uItr, tmp))
139 |                     {
140 |                         res.push_back(tmp);
141 |                     }
142 |                     else
143 |                     {
144 |                         LogError("encode failed."); // the word is dropped, but the call still returns true
145 |                     }
146 |                 }
147 | 
148 |                 return true;
149 |             }
150 |     };
151 | }
152 | 
153 | #endif
154 | 


--------------------------------------------------------------------------------
/src/CppJieba/HMMSegment.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef CPPJIBEA_HMMSEGMENT_H
  2 | #define CPPJIBEA_HMMSEGMENT_H
  3 | 
  4 | #include <iostream>
  5 | #include <fstream>
  6 | #include <memory.h>
  7 | #include <cassert>
  8 | #include "Limonp/StringUtil.hpp"
  9 | #include "Limonp/Logger.hpp"
 10 | #include "TransCode.hpp"
 11 | #include "ISegment.hpp"
 12 | #include "SegmentBase.hpp"
 13 | #include "DictTrie.hpp"
 14 | 
 15 | namespace CppJieba
 16 | {
 17 |     using namespace Limonp;
 18 |     typedef unordered_map<uint16_t, double> EmitProbMap;
 19 |     class HMMSegment: public SegmentBase
 20 |     {
 21 |         public:
 22 |             /*
 23 |              * STATUS:
 24 |              * 0:B, 1:E, 2:M, 3:S
 25 |              * */
 26 |             enum {B = 0, E = 1, M = 2, S = 3, STATUS_SUM = 4};
 27 |         private:
 28 |             char _statMap[STATUS_SUM];
 29 |             double _startProb[STATUS_SUM];
 30 |             double _transProb[STATUS_SUM][STATUS_SUM];
 31 |             EmitProbMap _emitProbB;
 32 |             EmitProbMap _emitProbE;
 33 |             EmitProbMap _emitProbM;
 34 |             EmitProbMap _emitProbS;
 35 |             vector<EmitProbMap* > _emitProbVec;
 36 | 
 37 |         public:
 38 |             HMMSegment(){} // Leaves the model unloaded: init() must be called before cut(), because _viterbi indexes _emitProbVec, which stays empty until init() fills it.
 39 |             explicit HMMSegment(const string& filePath) // Loads the HMM model from filePath at construction time.
 40 |             {
 41 |                 LIMONP_CHECK(init(filePath)); // NOTE(review): LIMONP_CHECK is defined in Limonp — presumably aborts when the expression is false; confirm there
 42 |             }
 43 |             virtual ~HMMSegment(){}
 44 |         public:
 45 |             bool init(const string& filePath) // Zeroes the probability tables, registers the four emission maps and loads the model file; returns true when it returns at all.
 46 |             {
 47 |                 memset(_startProb, 0, sizeof(_startProb));
 48 |                 memset(_transProb, 0, sizeof(_transProb));
 49 |                 _statMap[0] = 'B';
 50 |                 _statMap[1] = 'E';
 51 |                 _statMap[2] = 'M';
 52 |                 _statMap[3] = 'S';
 53 |                 _emitProbVec.push_back(&_emitProbB); // push order must match the enum (B = 0, E = 1, M = 2, S = 3) because _viterbi indexes the vector by state
 54 |                 _emitProbVec.push_back(&_emitProbE);
 55 |                 _emitProbVec.push_back(&_emitProbM);
 56 |                 _emitProbVec.push_back(&_emitProbS); // NOTE(review): a second init() call would append four more pointers; indices 0..3 would still be valid
 57 |                 LIMONP_CHECK(_loadModel(filePath.c_str())); // NOTE(review): failure handling depends on the LIMONP_CHECK macro defined in Limonp — confirm there
 58 |                 LogInfo("HMMSegment init(%s) ok.", filePath.c_str());
 59 |                 return true;
 60 |             }
 61 |         public:
 62 |             using SegmentBase::cut;
 63 |         public:
 64 |             bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res)const // Splits [begin, end) into runs: each run of code units below 0x80 becomes one token, every other run is segmented by _cut; returns false if _cut fails.
 65 |             {
 66 |                 Unicode::const_iterator left = begin; // start of the run not yet emitted
 67 |                 Unicode::const_iterator right = begin; // scanning position
 68 |                 while(right != end)
 69 |                 {
 70 |                     if(*right < 0x80) 
 71 |                     {
 72 |                         if(left != right && !_cut(left, right, res)) // flush the pending non-ASCII run through the HMM first
 73 |                         {
 74 |                             return false;
 75 |                         }
 76 |                         left = right;
 77 |                         while(right != end && *right < 0x80) // test the bound BEFORE dereferencing; the reversed order read one element past the range when the input ended in ASCII
 78 |                         {
 79 |                             ++right;
 80 |                         }
 81 |                         res.push_back(Unicode(left, right)); // the whole ASCII run is emitted as a single token
 82 |                         left = right;
 83 |                     }
 84 |                     else
 85 |                     {
 86 |                         ++right;
 87 |                     }
 88 |                 }
 89 |                 if(left != right && !_cut(left, right, res)) // trailing non-ASCII run, if any
 90 |                 {
 91 |                     return false;
 92 |                 }
 93 |                 return true;
 94 |             }
 95 |         private:
 96 |             bool _cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const // Labels [begin, end) with _viterbi and appends one word to res for every position labelled E or S; false if _viterbi fails.
 97 |             {
 98 |                 vector<size_t> status; 
 99 |                 if(!_viterbi(begin, end, status))
100 |                 {
101 |                     LogError("_viterbi failed.");
102 |                     return false;
103 |                 }
104 | 
105 |                 Unicode::const_iterator left = begin; // start of the word currently being collected
106 |                 Unicode::const_iterator right;
107 |                 for(size_t i = 0; i < status.size(); i++)
108 |                 {
109 |                     if(status[i] % 2) //if(E == status[i] || S == status[i]) -- E = 1 and S = 3 are the odd enum values
110 |                     {
111 |                         right = begin + i + 1; // one past the character that closes the word
112 |                         res.push_back(Unicode(left, right));
113 |                         left = right;
114 |                     }
115 |                 }
116 |                 return true;
117 |             }
118 |         public:
119 |             virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const // Segments [begin, end) with the vector<Unicode> overload and appends the encoded words to res; false for an empty range or a failed cut.
120 |             {
121 |                 if(begin == end)
122 |                 {
123 |                     return false;
124 |                 }
125 |                 vector<Unicode> words;
126 |                 words.reserve(end - begin); // there can be at most one word per input element
127 |                 if(!cut(begin, end, words))
128 |                 {
129 |                     return false;
130 |                 }
131 |                 size_t offset = res.size(); // existing content of res is kept; new words go after it
132 |                 res.resize(res.size() + words.size());
133 |                 for(size_t i = 0; i < words.size(); i++)
134 |                 {
135 |                     if(!TransCode::encode(words[i], res[offset + i])) // encodes straight into the destination slot
136 |                     {
137 |                         LogError("encode failed."); // the slot stays in res and the call still returns true
138 |                     }
139 |                 }
140 |                 return true;
141 |             }
142 | 
143 |         private:
144 |             bool _viterbi(Unicode::const_iterator begin, Unicode::const_iterator end, vector<size_t>& status)const // Dynamic-programming search for the best-scoring state sequence: fills `status` with one state (B/E/M/S) per element of [begin, end); false only for an empty range.
145 |             {
146 |                 if(begin == end)
147 |                 {
148 |                     return false;
149 |                 }
150 | 
151 |                 size_t Y = STATUS_SUM; // number of states
152 |                 size_t X = end - begin; // number of input elements
153 | 
154 |                 size_t XYSize = X * Y;
155 |                 size_t now, old, stat;
156 |                 double tmp, endE, endS;
157 | 
158 |                 vector<int> path(XYSize); // best predecessor state; the cell for position x and state y lives at index x + y*X
159 |                 vector<double> weight(XYSize); // best score of a path ending in that cell, same layout as `path`
160 | 
161 |                 //start
162 |                 for(size_t y = 0; y < Y; y++)
163 |                 {
164 |                     weight[0 + y * X] = _startProb[y] + _getEmitProb(_emitProbVec[y], *begin, MIN_DOUBLE); // scores are added, never multiplied
165 |                     path[0 + y * X] = -1; // the first position has no predecessor
166 |                 }
167 | 
168 | 
169 |                 double emitProb;
170 | 
171 |                 for(size_t x = 1; x < X; x++)
172 |                 {
173 |                     for(size_t y = 0; y < Y; y++)
174 |                     {
175 |                         now = x + y*X;
176 |                         weight[now] = MIN_DOUBLE;
177 |                         path[now] = E; // fallback predecessor, kept only when no candidate below scores above MIN_DOUBLE
178 |                         emitProb = _getEmitProb(_emitProbVec[y], *(begin+x), MIN_DOUBLE); // unseen characters fall back to MIN_DOUBLE
179 |                         for(size_t preY = 0; preY < Y; preY++)
180 |                         {
181 |                             old = x - 1 + preY * X;
182 |                             tmp = weight[old] + _transProb[preY][y] + emitProb;
183 |                             if(tmp > weight[now]) // strict comparison: on ties the lower-numbered predecessor wins
184 |                             {
185 |                                 weight[now] = tmp;
186 |                                 path[now] = preY;
187 |                             }
188 |                         }
189 |                     }
190 |                 }
191 | 
192 |                 endE = weight[X-1+E*X];
193 |                 endS = weight[X-1+S*X];
194 |                 stat = 0;
195 |                 if(endE >= endS) // only E and S are accepted as the final state
196 |                 {
197 |                     stat = E;
198 |                 }
199 |                 else
200 |                 {
201 |                     stat = S;
202 |                 }
203 | 
204 |                 status.resize(X);
205 |                 for(int x = X -1 ; x >= 0; x--) // backtrack from the last position; X is narrowed to int here
206 |                 {
207 |                     status[x] = stat;
208 |                     stat = path[x + stat*X]; // at x == 0 this reads -1, but the loop ends before `stat` is used again
209 |                 }
210 | 
211 |                 return true;
212 |             }
213 |             bool _loadModel(const char* const filePath) // Reads the model file in fixed order: 1 line of start scores, STATUS_SUM lines of transition scores, then one emission line each for B, E, M and S; false on any missing or malformed line.
214 |             {
215 |                 LogDebug("loadModel [%s] start ...", filePath);
216 |                 ifstream ifile(filePath); // the open is not checked here; a failed open surfaces as _getLine returning false
217 |                 string line;
218 |                 vector<string> tmp;
219 |                 vector<string> tmp2; // NOTE(review): not used anywhere in this function
220 |                 //load _startProb
221 |                 if(!_getLine(ifile, line))
222 |                 {
223 |                     return false;
224 |                 }
225 |                 split(line, tmp, " ");
226 |                 if(tmp.size() != STATUS_SUM)
227 |                 {
228 |                     LogError("start_p illegal");
229 |                     return false;
230 |                 }
231 |                 for(size_t j = 0; j< tmp.size(); j++)
232 |                 {
233 |                     _startProb[j] = atof(tmp[j].c_str());
234 |                     //cout<<_startProb[j]<<endl;
235 |                 }
236 | 
237 |                 //load _transProb
238 |                 for(size_t i = 0; i < STATUS_SUM; i++) // row i holds the scores for leaving state i
239 |                 {
240 |                     if(!_getLine(ifile, line))
241 |                     {
242 |                         return false;
243 |                     }
244 |                     split(line, tmp, " ");
245 |                     if(tmp.size() != STATUS_SUM)
246 |                     {
247 |                         LogError("trans_p illegal");
248 |                         return false;
249 |                     }
250 |                     for(size_t j =0; j < STATUS_SUM; j++)
251 |                     {
252 |                         _transProb[i][j] = atof(tmp[j].c_str());
253 |                         //cout<<_transProb[i][j]<<endl;
254 |                     }
255 |                 }
256 | 
257 |                 //load _emitProbB
258 |                 if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbB))
259 |                 {
260 |                     return false;
261 |                 }
262 | 
263 |                 //load _emitProbE
264 |                 if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbE))
265 |                 {
266 |                     return false;
267 |                 }
268 | 
269 |                 //load _emitProbM
270 |                 if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbM))
271 |                 {
272 |                     return false;
273 |                 }
274 | 
275 |                 //load _emitProbS
276 |                 if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbS))
277 |                 {
278 |                     return false;
279 |                 }
280 | 
281 |                 LogDebug("loadModel [%s] end.", filePath);
282 | 
283 |                 return true;
284 |             }
285 |             bool _getLine(ifstream& ifile, string& line)
286 |             {
287 |                 // Advances to the next meaningful line: each line read is trimmed,
288 |                 // then skipped if it is empty or starts with "#". Returns true with
289 |                 // the trimmed text in `line`, or false once getline fails.
290 |                 for(;;)
291 |                 {
292 |                     if(!getline(ifile, line))
293 |                     {
294 |                         return false;
295 |                     }
296 |                     trim(line);
297 |                     if(!line.empty() && !startsWith(line, "#"))
298 |                     {
299 |                         return true;
300 |                     }
301 |                 }
302 |             }
302 |             bool _loadEmitProb(const string& line, EmitProbMap& mp) // Parses one emission line of the form "char:score,char:score,..." into mp; false on an empty line or any malformed entry.
303 |             {
304 |                 if(line.empty())
305 |                 {
306 |                     return false;
307 |                 }
308 |                 vector<string> tmp, tmp2; // tmp: the comma-separated entries; tmp2: the two halves of one entry
309 |                 Unicode unicode;
310 |                 split(line, tmp, ",");
311 |                 for(size_t i = 0; i < tmp.size(); i++)
312 |                 {
313 |                     split(tmp[i], tmp2, ":");
314 |                     if(2 != tmp2.size())
315 |                     {
316 |                         LogError("_emitProb illegal.");
317 |                         return false;
318 |                     }
319 |                     if(!TransCode::decode(tmp2[0], unicode) || unicode.size() != 1) // the key must decode to exactly one element
320 |                     {
321 |                         LogError("TransCode failed.");
322 |                         return false;
323 |                     }
324 |                     mp[unicode[0]] = atof(tmp2[1].c_str()); // entries stored before a later failure remain in mp
325 |                 }
326 |                 return true;
327 |             }
328 |             double _getEmitProb(const EmitProbMap* ptMp, uint16_t key, double defVal)const 
329 |             {
330 |                 EmitProbMap::const_iterator cit = ptMp->find(key);
331 |                 if(cit == ptMp->end())
332 |                 {
333 |                     return defVal;
334 |                 }
335 |                 return cit->second;
336 | 
337 |             }
338 | 
339 | 
340 |     };
341 | }
342 | 
343 | #endif
344 | 


--------------------------------------------------------------------------------
/src/CppJieba/ISegment.hpp:
--------------------------------------------------------------------------------
 1 | #ifndef CPPJIEBA_SEGMENTINTERFACE_H
 2 | #define CPPJIEBA_SEGMENTINTERFACE_H
 3 | 
 4 | 
 5 | namespace CppJieba
 6 | {
 7 |     class ISegment
 8 |     {
 9 |         public:
10 |             virtual ~ISegment(){};
11 |         public:
12 |             virtual bool cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector<string>& res) const = 0;
13 |             virtual bool cut(const string& str, vector<string>& res) const = 0;
14 |     };
15 | }
16 | 
17 | #endif
18 | 


--------------------------------------------------------------------------------
/src/CppJieba/KeywordExtractor.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef CPPJIEBA_KEYWORD_EXTRACTOR_H
  2 | #define CPPJIEBA_KEYWORD_EXTRACTOR_H
  3 | 
  4 | #include "MixSegment.hpp"
  5 | #include <cmath>
  6 | #include <set>
  7 | 
  8 | namespace CppJieba
  9 | {
 10 |     using namespace Limonp;
 11 | 
 12 |     /*utf8*/
 13 |     class KeywordExtractor
 14 |     {
 15 |         private:
 16 |             MixSegment _segment;
 17 |         private:
 18 |             unordered_map<string, double> _idfMap;
 19 |             double _idfAverage;
 20 | 
 21 |             unordered_set<string> _stopWords;
 22 |         public:
 23 |             KeywordExtractor(){};
 24 |             KeywordExtractor(const string& dictPath, const string& hmmFilePath, const string& idfPath, const string& stopWordPath)
 25 |             {
 26 |                 LIMONP_CHECK(init(dictPath, hmmFilePath, idfPath, stopWordPath));
 27 |             };
 28 |             ~KeywordExtractor(){};
 29 | 
 30 |         public:
 31 |             bool init(const string& dictPath, const string& hmmFilePath, const string& idfPath, const string& stopWordPath)
 32 |             {
 33 |                 _loadIdfDict(idfPath);
 34 |                 _loadStopWordDict(stopWordPath);
 35 |                 LIMONP_CHECK(_segment.init(dictPath, hmmFilePath));
 36 |                 return true;
 37 |             };
 38 |         public:
 39 | 
 40 |             bool extract(const string& str, vector<string>& keywords, size_t topN) const
 41 |             {
 42 |                 vector<pair<string, double> > topWords;
 43 |                 if(!extract(str, topWords, topN))
 44 |                 {
 45 |                     return false;
 46 |                 }
 47 |                 for(size_t i = 0; i < topWords.size(); i++)
 48 |                 {
 49 |                     keywords.push_back(topWords[i].first);
 50 |                 }
 51 |                 return true;
 52 |             }
 53 | 
 54 |             bool extract(const string& str, vector<pair<string, double> >& keywords, size_t topN) const
 55 |             {
 56 |                 vector<string> words;
 57 |                 if(!_segment.cut(str, words))
 58 |                 {
 59 |                     LogError("segment cut(%s) failed.", str.c_str());
 60 |                     return false;
 61 |                 }
 62 | 
 63 |                 map<string, double> wordmap;
 64 |                 for(vector<string>::iterator iter = words.begin(); iter != words.end(); iter++)
 65 |                 {
 66 |                     if(_isSingleWord(*iter))
 67 |                     {
 68 |                         continue;
 69 |                     }
 70 |                     wordmap[*iter] += 1.0;
 71 |                 }
 72 | 
 73 |                 for(map<string, double>::iterator itr = wordmap.begin(); itr != wordmap.end(); )
 74 |                 {
 75 |                     if(_stopWords.end() != _stopWords.find(itr->first))
 76 |                     {
 77 |                         wordmap.erase(itr++);
 78 |                         continue;
 79 |                     }
 80 | 
 81 |                     unordered_map<string, double>::const_iterator cit = _idfMap.find(itr->first);
 82 |                     if(cit != _idfMap.end())
 83 |                     {
 84 |                         itr->second *= cit->second;
 85 |                     }
 86 |                     else
 87 |                     {
 88 |                         itr->second *= _idfAverage;
 89 |                     }
 90 |                     itr ++;
 91 |                 }
 92 | 
 93 |                 keywords.clear();
 94 |                 std::copy(wordmap.begin(), wordmap.end(), std::inserter(keywords, keywords.begin()));
 95 |                 topN = min(topN, keywords.size());
 96 |                 partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), _cmp);
 97 |                 keywords.resize(topN);
 98 |                 return true;
 99 |             }
100 |         private:
101 |             void _loadIdfDict(const string& idfPath)
102 |             {
103 |                 ifstream ifs(idfPath.c_str());
104 |                 if(!ifs)
105 |                 {
106 |                     LogError("open %s failed.", idfPath.c_str());
107 |                     assert(false);
108 |                 }
109 |                 string line ;
110 |                 vector<string> buf;
111 |                 double idf = 0.0;
112 |                 double idfSum = 0.0;
113 |                 size_t lineno = 0;
114 |                 for(;getline(ifs, line); lineno++)
115 |                 {
116 |                     buf.clear();
117 |                     if(line.empty())
118 |                     {
119 |                         LogError("line[%d] empty. skipped.", lineno);
120 |                         continue;
121 |                     }
122 |                     if(!split(line, buf, " ") || buf.size() != 2)
123 |                     {
124 |                         LogError("line %d [%s] illegal. skipped.", lineno, line.c_str());
125 |                         continue;
126 |                     }
127 |                     idf = atof(buf[1].c_str());
128 |                     _idfMap[buf[0]] = idf;
129 |                     idfSum += idf;
130 | 
131 |                 } 
132 | 
133 |                 assert(lineno);
134 |                 _idfAverage = idfSum / lineno;
135 |                 assert(_idfAverage > 0.0);
136 |             }
137 |             void _loadStopWordDict(const string& filePath)
138 |             {
139 |                 ifstream ifs(filePath.c_str());
140 |                 if(!ifs)
141 |                 {
142 |                     LogError("open %s failed.", filePath.c_str());
143 |                     assert(false);
144 |                 }
145 |                 string line ;
146 |                 while(getline(ifs, line))
147 |                 {
148 |                     _stopWords.insert(line);
149 |                 }
150 |                 assert(_stopWords.size());
151 |             }
152 |         private:
153 |             bool _isSingleWord(const string& str) const
154 |             {
155 |                 Unicode unicode;
156 |                 TransCode::decode(str, unicode);
157 |                 if(unicode.size() == 1)
158 |                   return true;
159 |                 return false;
160 |             }
161 | 
162 |         private:
163 |             static bool _cmp(const pair<string, double>& lhs, const pair<string, double>& rhs)
164 |             {
165 |                 return lhs.second > rhs.second;
166 |             }
167 |             
168 |     };
169 | }
170 | 
171 | #endif
172 | 
173 | 
174 | 


--------------------------------------------------------------------------------
/src/CppJieba/Limonp/Config.hpp:
--------------------------------------------------------------------------------
  1 | /************************************
  2 |  * file enc : utf8
  3 |  * author   : wuyanyi09@gmail.com
  4 |  ************************************/
  5 | #ifndef LIMONP_CONFIG_H
  6 | #define LIMONP_CONFIG_H
  7 | 
  8 | 
  9 | #include <map>
 10 | #include <fstream>
 11 | #include <iostream>
 12 | #include <assert.h>
 13 | #include "StringUtil.hpp"
 14 | 
 15 | namespace Limonp
 16 | {
 17 |     using namespace std;
 18 |     class Config
 19 |     {
 20 |         public:
 21 |             explicit Config(const string& filePath)
 22 |             {
 23 |                 _loadFile(filePath);
 24 |             }
 25 |         public:
 26 |             operator bool ()
 27 |             {
 28 |                 return !_map.empty();
 29 |             }
 30 |         private:
 31 |             void _loadFile(const string& filePath)
 32 |             {
 33 |                 ifstream ifs(filePath.c_str());
 34 |                 assert(ifs);
 35 |                 string line;
 36 |                 vector<string> vecBuf;
 37 |                 size_t lineno = 0;
 38 |                 while(getline(ifs, line))
 39 |                 {
 40 |                     lineno ++;
 41 |                     trim(line);
 42 |                     if(line.empty() || startsWith(line, "#"))
 43 |                     {
 44 |                         continue;
 45 |                     }
 46 |                     vecBuf.clear();
 47 |                     if(!split(line, vecBuf, "=") || 2 != vecBuf.size())
 48 |                     {
 49 |                         fprintf(stderr, "line[%s] illegal.\n", line.c_str());
 50 |                         assert(false);
 51 |                         continue;
 52 |                     }
 53 |                     string& key = vecBuf[0];
 54 |                     string& value = vecBuf[1];
 55 |                     trim(key);
 56 |                     trim(value);
 57 |                     if(!_map.insert(make_pair(key, value)).second)
 58 |                     {
 59 |                         fprintf(stderr, "key[%s] already exits.\n", key.c_str());
 60 |                         assert(false);
 61 |                         continue;
 62 |                     }
 63 |                 }
 64 |                 ifs.close();
 65 |             }
 66 |         public:
 67 |             bool get(const string& key, string& value) const
 68 |             {
 69 |                 map<string, string>::const_iterator it = _map.find(key);
 70 |                 if(_map.end() != it)
 71 |                 {
 72 |                     value = it->second;
 73 |                     return true;
 74 |                 }
 75 |                 return false;
 76 |             }
 77 |             const char* operator [] (const char* key) const
 78 |             {
 79 |                 if(NULL == key)
 80 |                 {
 81 |                     return NULL;
 82 |                 }
 83 |                 map<string, string>::const_iterator it = _map.find(key);
 84 |                 if(_map.end() != it)
 85 |                 {
 86 |                     return it->second.c_str();
 87 |                 }
 88 |                 return NULL;
 89 |             }
 90 |         private:
 91 |             map<string, string> _map;
 92 |         private:
 93 |             friend ostream& operator << (ostream& os, const Config& config);
 94 |     };
 95 |     
 96 |     inline ostream& operator << (ostream& os, const Config& config)
 97 |     {
 98 |         return os << config._map;
 99 |     }
100 | }
101 | 
102 | #endif
103 | 


--------------------------------------------------------------------------------
/src/CppJieba/Limonp/HandyMacro.hpp:
--------------------------------------------------------------------------------
 1 | #ifndef LIMONP_MACRO_DEF_H
 2 | #define LIMONP_MACRO_DEF_H
 3 | 
#include <stdio.h>
#include <stdlib.h>
 5 | 
 6 | #define LIMONP_CHECK(exp) \
 7 |     if(!(exp)){fprintf(stderr, "File:%s, Line:%d Exp:[" #exp "] is true, abort.\n", __FILE__, __LINE__); abort();}
 8 | 
 9 | #define print(x) cout<< #x": " << x <<endl
10 | /*
11 | #define XX_GET_SET(varType, varName, funName)\
12 | private: varType varName;\
13 | public: inline varType get##funName(void) const {return varName;}\
14 | public: inline void set##funName(varType var) {varName = var;}
15 | 
16 | #define XX_GET(varType, varName, funName)\
17 | private: varType varName;\
18 | public: inline varType get##funName(void) const {return varName;}
19 | 
20 | #define XX_SET(varType, varName, funName)\
21 | private: varType varName;\
22 | public: inline void set##funName(varType var) {varName = var;}
23 | 
24 | #define XX_GET_SET_BY_REF(varType, varName, funName)\
25 | private: varType varName;\
26 | public: inline const varType& get##funName(void) const {return varName;}\
27 | public: inline void set##funName(const varType& var){varName = var;}
28 | */
29 | 
30 | #endif
31 | 


--------------------------------------------------------------------------------
/src/CppJieba/Limonp/InitOnOff.hpp:
--------------------------------------------------------------------------------
 1 | #ifndef LIMONP_INITONOFF_H
 2 | #define LIMONP_INITONOFF_H
 3 | 
 4 | namespace Limonp
 5 | {
 6 |     class InitOnOff
 7 |     {
 8 |         public:
 9 |             InitOnOff(){_setInitFlag(false);};
10 |             ~InitOnOff(){};
11 |         protected:
12 |             bool _isInited;
13 |             bool _getInitFlag()const{return _isInited;};
14 |             bool _setInitFlag(bool flag){return _isInited = flag;};
15 |         public:
16 |             operator bool() const {return _getInitFlag();};
17 | 
18 |     };
19 | }
20 | 
21 | #endif
22 | 


--------------------------------------------------------------------------------
/src/CppJieba/Limonp/LocalVector.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef LIMONP_LOCAL_VECTOR_HPP
  2 | #define LIMONP_LOCAL_VECTOR_HPP
  3 | 
  4 | #include <iostream>
  5 | #include <stdlib.h>
  6 | #include <assert.h>
  7 | #include <string.h>
  8 | 
  9 | namespace Limonp
 10 | {
 11 |     using namespace std;
 12 |     /*
 13 |      * LocalVector<T> : T must be primitive type (char , int, size_t), if T is struct or class, LocalVector<T> may be dangerous..
 14 |      * LocalVector<T> is simple and not well-tested. 
 15 |      */
 16 |     const size_t LOCAL_VECTOR_BUFFER_SIZE = 16;
 17 |     template <class T>
 18 |         class LocalVector
 19 |         {
 20 |             public:
 21 |                 typedef const T* const_iterator ;
 22 |                 typedef T value_type;
 23 |                 typedef size_t size_type;
 24 |             private:
 25 |                 T _buffer[LOCAL_VECTOR_BUFFER_SIZE];
 26 |                 T * _ptr;
 27 |                 size_t _size;
 28 |                 size_t _capacity;
 29 |             public:
 30 |                 LocalVector()
 31 |                 {
 32 |                     _init();
 33 |                 };
 34 |                 LocalVector(const LocalVector<T>& vec)
 35 |                 {
 36 |                     _init();
 37 |                     *this = vec;
 38 |                 }
 39 |                 LocalVector(const_iterator  begin, const_iterator end) // TODO: make it faster
 40 |                 {
 41 |                     _init();
 42 |                     while(begin != end)
 43 |                     {
 44 |                         push_back(*begin++);
 45 |                     }
 46 |                 }
 47 |                 LocalVector(size_t size, const T& t) // TODO: make it faster
 48 |                 {
 49 |                     _init();
 50 |                     while(size--)
 51 |                     {
 52 |                         push_back(t);
 53 |                     }
 54 |                 }
 55 |                 ~LocalVector()
 56 |                 {
 57 |                     if(_ptr != _buffer)
 58 |                     {
 59 |                         free(_ptr);
 60 |                     }
 61 |                 };
 62 |             public:
 63 |                 LocalVector<T>& operator = (const LocalVector<T>& vec)
 64 |                 {
 65 |                     clear();
 66 |                     _size = vec.size();
 67 |                     _capacity = vec.capacity();
 68 |                     if(vec._buffer == vec._ptr)
 69 |                     {
 70 |                         memcpy(_buffer, vec._buffer, sizeof(T) * _size);
 71 |                         _ptr = _buffer;
 72 |                     }
 73 |                     else
 74 |                     {
 75 |                         _ptr = (T*) malloc(vec.capacity() * sizeof(T));
 76 |                         assert(_ptr);
 77 |                         memcpy(_ptr, vec._ptr, vec.size() * sizeof(T));
 78 |                     }
 79 |                     return *this;
 80 |                 }
 81 |             private:
 82 |                 void _init()
 83 |                 {
 84 |                     _ptr = _buffer;
 85 |                     _size = 0;
 86 |                     _capacity = LOCAL_VECTOR_BUFFER_SIZE;
 87 |                 }
 88 |             public:
 89 |                 T& operator [] (size_t i) 
 90 |                 {
 91 |                     return _ptr[i];
 92 |                 }
 93 |                 const T& operator [] (size_t i) const
 94 |                 {
 95 |                     return _ptr[i];
 96 |                 }
 97 |                 void push_back(const T& t)
 98 |                 {
 99 |                     if(_size == _capacity)
100 |                     {
101 |                         assert(_capacity);
102 |                         reserve(_capacity * 2);
103 |                     }
104 |                     _ptr[_size ++ ] = t;
105 |                 }
106 |                 void reserve(size_t size) 
107 |                 {
108 |                     if(size <= _capacity)
109 |                     {
110 |                         return;
111 |                     }
112 |                     T * next =  (T*)malloc(sizeof(T) * size);
113 |                     assert(next);
114 |                     T * old = _ptr;
115 |                     _ptr = next;
116 |                     memcpy(_ptr, old, sizeof(T) * _capacity);
117 |                     _capacity = size;
118 |                     if(old != _buffer)
119 |                     {
120 |                         free(old);
121 |                     }
122 |                 }
123 |                 bool empty() const
124 |                 {
125 |                     return 0 == size();
126 |                 }
127 |                 size_t size() const
128 |                 {
129 |                     return _size;
130 |                 }
131 |                 size_t capacity() const
132 |                 {
133 |                     return _capacity;
134 |                 }
135 |                 const_iterator begin() const
136 |                 {
137 |                     return _ptr;
138 |                 }
139 |                 const_iterator end() const
140 |                 {
141 |                     return _ptr + _size;
142 |                 }
143 |                 void clear()
144 |                 {
145 |                     if(_ptr != _buffer)
146 |                     {
147 |                         free(_ptr);
148 |                     }
149 |                     _init();
150 |                 }
151 |         };
152 | 
153 |     template <class T>
154 |         ostream & operator << (ostream& os, const LocalVector<T>& vec)
155 |         {
156 |             if(vec.empty())
157 |             {
158 |                 return os << "[]";
159 |             }
160 |             os<<"[\""<<vec[0];
161 |             for(size_t i = 1; i < vec.size(); i++)
162 |             {
163 |                 os<<"\", \""<<vec[i];
164 |             }
165 |             os<<"\"]";
166 |             return os;
167 |         }
168 | 
169 | }
170 | 
171 | #endif
172 | 


--------------------------------------------------------------------------------
/src/CppJieba/Limonp/Logger.hpp:
--------------------------------------------------------------------------------
 1 | /************************************
 2 |  * file enc : utf8
 3 |  * author   : wuyanyi09@gmail.com
 4 |  ************************************/
 5 | #ifndef LIMONP_LOGGER_H
 6 | #define LIMONP_LOGGER_H
 7 | 
 8 | #include <vector>
 9 | #include <iostream>
10 | #include <fstream>
11 | #include <string>
12 | #include <cstring>
13 | #include <stdio.h>
14 | #include <cstdlib>
15 | #include <stdarg.h>
16 | #include <time.h>
17 | #include <cassert>
18 | 
// Name of the current source file without its directory part (text after the
// last '/', or the whole __FILE__ when it contains no '/').
// Fully parenthesised so that the conditional expression cannot be split by
// operators surrounding the macro at the point of use.
#define FILE_BASENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)

// printf-style logging shortcuts, one per severity. Each forwards the
// call site's file name and line number to Limonp::Logger::LoggingF.
#define LogDebug(fmt, ...) Limonp::Logger::LoggingF(Limonp::LL_DEBUG, FILE_BASENAME, __LINE__, fmt, ## __VA_ARGS__)
#define LogInfo(fmt, ...) Limonp::Logger::LoggingF(Limonp::LL_INFO, FILE_BASENAME, __LINE__, fmt, ## __VA_ARGS__)
#define LogWarn(fmt, ...) Limonp::Logger::LoggingF(Limonp::LL_WARN, FILE_BASENAME, __LINE__, fmt, ## __VA_ARGS__)
#define LogError(fmt, ...) Limonp::Logger::LoggingF(Limonp::LL_ERROR, FILE_BASENAME, __LINE__, fmt, ## __VA_ARGS__)
#define LogFatal(fmt, ...) Limonp::Logger::LoggingF(Limonp::LL_FATAL, FILE_BASENAME, __LINE__, fmt, ## __VA_ARGS__)
26 | 
27 | namespace Limonp
28 | {
29 |     using namespace std;
30 |     enum {LL_DEBUG = 0, LL_INFO = 1, LL_WARN = 2, LL_ERROR = 3, LL_FATAL = 4, LEVEL_ARRAY_SIZE = 5, CSTR_BUFFER_SIZE = 32};
31 |     static const char * LOG_LEVEL_ARRAY[LEVEL_ARRAY_SIZE]= {"DEBUG","INFO","WARN","ERROR","FATAL"};
32 |     static const char * LOG_FORMAT = "%s %s:%d %s %s\n";
33 |     static const char * LOG_TIME_FORMAT = "%Y-%m-%d %H:%M:%S";
34 | 
35 |     class Logger
36 |     {
37 |         public:
38 |             static void Logging(size_t level, const string& msg, const char* fileName, int lineno)
39 |             {
40 |                 assert(level <= LL_FATAL);
41 |                 char buf[CSTR_BUFFER_SIZE];
42 |                 time_t timeNow;
43 |                 time(&timeNow);
44 |                 strftime(buf, sizeof(buf), LOG_TIME_FORMAT, localtime(&timeNow));
45 |                 fprintf(stderr, LOG_FORMAT, buf, fileName, lineno,LOG_LEVEL_ARRAY[level], msg.c_str());
46 |             }
47 |             static void LoggingF(size_t level, const char* fileName, int lineno, const char* const fmt, ...)
48 |             {
49 | #ifdef LOGGER_LEVEL
50 |                 if(level < LOGGER_LEVEL) return;
51 | #endif
52 |                 int size = 256;
53 |                 string msg;
54 |                 va_list ap;
55 |                 while (1) {
56 |                     msg.resize(size);
57 |                     va_start(ap, fmt);
58 |                     int n = vsnprintf((char *)msg.c_str(), size, fmt, ap);
59 |                     va_end(ap);
60 |                     if (n > -1 && n < size) {
61 |                         msg.resize(n);
62 |                         break;
63 |                     }
64 |                     if (n > -1)
65 |                       size = n + 1;
66 |                     else
67 |                       size *= 2;
68 |                 }
69 |                 Logging(level, msg, fileName, lineno);
70 |             }
71 |     };
72 | }
73 | 
74 | #endif
75 | 


--------------------------------------------------------------------------------
/src/CppJieba/Limonp/MysqlClient.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef LIMONP_MYSQLCLIENT_H
  2 | #define LIMONP_MYSQLCLIENT_H
  3 | 
  4 | #include <mysql.h>
  5 | #include <iostream>
  6 | #include <vector>
  7 | #include <string>
  8 | #include "logger.hpp"
  9 | #include "InitOnOff.hpp"
 10 | 
 11 | namespace Limonp
 12 | {
 13 |     using namespace std;
 14 |     class MysqlClient: public InitOnOff
 15 |     {
 16 |         public:
 17 |             typedef vector< vector<string> > RowsType;
 18 |         private:
 19 |             const string _host;
 20 |             const size_t _port;
 21 |             const string _user;
 22 |             const string _passwd;
 23 |             const string _db;
 24 |             const string _charset;
 25 |         public:
 26 |             MysqlClient(const string& host, size_t port, const string& user, const string& passwd, const string& db, const string& charset = "utf8"): _host(host), _port(port), _user(user), _passwd(passwd), _db(db), _charset(charset), _conn(NULL)
 27 |             {
 28 |                 _setInitFlag(_init());
 29 |             }
 30 |             ~MysqlClient()
 31 |             {
 32 |                 if(_conn)
 33 |                 {
 34 |                     mysql_close(_conn);
 35 |                 }
 36 |             };
 37 |         private:
 38 |             bool _init()
 39 |             {
 40 |                 //cout<<mysql_get_client_info()<<endl;
 41 |                 if(NULL == (_conn = mysql_init(NULL)))
 42 |                 {
 43 |                     LogError("mysql_init faield. %s", mysql_error(_conn));
 44 |                     return false;
 45 |                 }
 46 | 
 47 |                 if (mysql_real_connect(_conn, _host.c_str(), _user.c_str(), _passwd.c_str(), _db.c_str(), _port, NULL, 0) == NULL)
 48 |                 {
 49 |                     LogError("mysql_real_connect failed. %s", mysql_error(_conn));
 50 |                     mysql_close(_conn);
 51 |                     _conn = NULL;
 52 |                     return false;
 53 |                 }  
 54 | 
 55 |                 if(mysql_set_character_set(_conn, _charset.c_str()))
 56 |                 {
 57 |                     LogError("mysql_set_character_set [%s] failed.", _charset.c_str());
 58 |                     return false;
 59 |                 }
 60 | 
 61 |                 //set reconenct
 62 |                 char value = 1;
 63 |                 mysql_options(_conn, MYSQL_OPT_RECONNECT, &value);
 64 | 
 65 |                 LogInfo("MysqlClient {host: %s, database:%s, charset:%s}", _host.c_str(), _db.c_str(), _charset.c_str());
 66 |                 return true;
 67 |             }
 68 |         public:
 69 |             bool executeSql(const string& sql)
 70 |             {
 71 |                 assert(_getInitFlag());
 72 |                 if(mysql_query(_conn, sql.c_str())) 
 73 |                 {
 74 |                     LogError("mysql_query failed.  %s", mysql_error(_conn));
 75 |                     return false;
 76 |                 }
 77 |                 return true;
 78 |             }
 79 |             size_t insert(const string& tableName, const string& keys, const vector<string>& vals)
 80 |             {
 81 |                 size_t retn = 0;
 82 |                 string sql;
 83 |                 for(size_t i = 0; i < vals.size(); i ++)
 84 |                 {
 85 |                     sql.clear();
 86 |                     string_format(sql, "insert into %s (%s) values %s", tableName.c_str(), keys.c_str(), vals[i].c_str());
 87 |                     retn += executeSql(sql.c_str());
 88 |                 }
 89 |                 return retn;
 90 |             }
 91 |             bool select(const string& sql, RowsType& rows)
 92 |             {
 93 |                 if(!executeSql(sql))
 94 |                 {
 95 |                     LogError("executeSql failed. [%s]", sql.c_str());
 96 |                     return false;
 97 |                 }
 98 |                 MYSQL_RES * result = mysql_store_result(_conn);
 99 |                 if(!result)
100 |                 {
101 |                     LogError("mysql_store_result failed.[%d]", mysql_error(_conn));
102 |                     return false;
103 |                 }
104 |                 size_t num_fields = mysql_num_fields(result);
105 |                 MYSQL_ROW row;
106 |                 while((row = mysql_fetch_row(result)))
107 |                 {
108 |                     vector<string> vec;
109 |                     for(size_t i = 0; i < num_fields; i ++)
110 |                     {
111 |                         row[i] ? vec.push_back(row[i]) : vec.push_back("NULL");
112 |                     }
113 |                     rows.push_back(vec);
114 |                 }
115 |                 mysql_free_result(result);
116 |                 return true;
117 |             }
118 | 
119 |         private:
120 |             MYSQL * _conn;
121 | 
122 |     };
123 | }
124 | 
125 | #endif
126 | 


--------------------------------------------------------------------------------
/src/CppJieba/Limonp/NonCopyable.hpp:
--------------------------------------------------------------------------------
 1 | /************************************
 2 |  ************************************/
 3 | #ifndef LIMONP_NONCOPYABLE_H
 4 | #define LIMONP_NONCOPYABLE_H
 5 | 
 6 | #include <iostream>
 7 | #include <string>
 8 | 
 9 | namespace Limonp
10 | {
11 |     class NonCopyable
12 |     {
13 |         protected:
14 |             NonCopyable(){};
15 |             ~NonCopyable(){};
16 |         private:
17 |             NonCopyable(const NonCopyable& );
18 |             const NonCopyable& operator=(const NonCopyable& );
19 |     };
20 | }
21 | 
22 | #endif
23 | 


--------------------------------------------------------------------------------
/src/CppJieba/Limonp/StdExtension.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef LIMONP_STD_EXTEMSION_HPP
  2 | #define LIMONP_STD_EXTEMSION_HPP
  3 | 
  4 | #include <map>
  5 | 
  6 | #if(__cplusplus == 201103L)
  7 | #include <unordered_map>
  8 | #include <unordered_set>
  9 | #else
 10 | #include <tr1/unordered_map>
 11 | #include <tr1/unordered_set>
 12 | namespace std
 13 | {
 14 |     using std::tr1::unordered_map;
 15 |     using std::tr1::unordered_set;
 16 | }
 17 | 
 18 | #endif
 19 | 
 20 | #include <set>
 21 | #include <vector>
 22 | #include <fstream>
 23 | #include <sstream>
 24 | 
 25 | 
 26 | namespace std
 27 | {
 28 |     template<typename T>
 29 |         ostream& operator << (ostream& os, const vector<T>& vec)
 30 |         {
 31 |             if(vec.empty())
 32 |             {
 33 |                 return os << "[]";
 34 |             }
 35 |             os<<"[\""<<vec[0];
 36 |             for(size_t i = 1; i < vec.size(); i++)
 37 |             {
 38 |                 os<<"\", \""<<vec[i];
 39 |             }
 40 |             os<<"\"]";
 41 |             return os;
 42 |         }
 43 |     template<class T1, class T2>
 44 |         ostream& operator << (ostream& os, const pair<T1, T2>& pr)
 45 |         {
 46 |             os << pr.first << ":" << pr.second ;
 47 |             return os;
 48 |         }
 49 | 
 50 | 
 51 |     template<class T>
 52 |         string& operator << (string& str, const T& obj)
 53 |         {
 54 |             stringstream ss;
 55 |             ss << obj; // call ostream& operator << (ostream& os,
 56 |             return str = ss.str();
 57 |         }
 58 | 
 59 |     template<class T1, class T2>
 60 |         ostream& operator << (ostream& os, const map<T1, T2>& mp)
 61 |         {
 62 |             if(mp.empty())
 63 |             {
 64 |                 os<<"{}";
 65 |                 return os;
 66 |             }
 67 |             os<<'{';
 68 |             typename map<T1, T2>::const_iterator it = mp.begin();
 69 |             os<<*it;
 70 |             it++;
 71 |             while(it != mp.end())
 72 |             {
 73 |                 os<<", "<<*it;
 74 |                 it++;
 75 |             }
 76 |             os<<'}';
 77 |             return os;
 78 |         }
 79 |     template<class T1, class T2>
 80 |         ostream& operator << (ostream& os, const std::unordered_map<T1, T2>& mp)
 81 |         {
 82 |             if(mp.empty())
 83 |             {
 84 |                 return os << "{}";
 85 |             }
 86 |             os<<'{';
 87 |             typename std::unordered_map<T1, T2>::const_iterator it = mp.begin();
 88 |             os<<*it;
 89 |             it++;
 90 |             while(it != mp.end())
 91 |             {
 92 |                 os<<", "<<*it++;
 93 |             }
 94 |             return os<<'}';
 95 |         }
 96 | 
 97 |     template<class T>
 98 |         ostream& operator << (ostream& os, const set<T>& st)
 99 |         {
100 |             if(st.empty())
101 |             {
102 |                 os << "{}";
103 |                 return os;
104 |             }
105 |             os<<'{';
106 |             typename set<T>::const_iterator it = st.begin();
107 |             os<<*it;
108 |             it++;
109 |             while(it != st.end())
110 |             {
111 |                 os<<", "<<*it;
112 |                 it++;
113 |             }
114 |             os<<'}';
115 |             return os;
116 |         }
117 | 
118 |     template<class KeyType, class ContainType>
119 |         bool isIn(const ContainType& contain, const KeyType& key)
120 |         {
121 |             return contain.end() != contain.find(key);
122 |         }
123 | 
124 |     template<class T>
125 |         basic_string<T> & operator << (basic_string<T> & s, ifstream & ifs)
126 |         {
127 |             return s.assign((istreambuf_iterator<T>(ifs)), istreambuf_iterator<T>());
128 |         }
129 | 
130 |     template<class T>
131 |         ofstream & operator << (ofstream & ofs, const basic_string<T>& s)
132 |         {
133 |             ostreambuf_iterator<T> itr (ofs);
134 |             copy(s.begin(), s.end(), itr);
135 |             return ofs;
136 |         }
137 | }
138 | 
139 | #endif
140 | 


--------------------------------------------------------------------------------
/src/CppJieba/Limonp/StringUtil.hpp:
--------------------------------------------------------------------------------
  1 | /************************************
  2 |  * file enc : ascii
  3 |  * author   : wuyanyi09@gmail.com
  4 |  ************************************/
  5 | #ifndef LIMONP_STR_FUNCTS_H
  6 | #define LIMONP_STR_FUNCTS_H
  7 | #include <fstream>
  8 | #include <iostream>
  9 | #include <string>
 10 | #include <vector>
 11 | #include <algorithm>
 12 | #include <cctype>
 13 | #include <map>
 14 | #include <stdint.h>
 15 | #include <stdio.h>
 16 | #include <stdarg.h>
 17 | #include <memory.h>
 18 | #include <functional> 
 19 | #include <locale>
 20 | #include <sstream>
 21 | #include <sys/types.h>
 22 | #include <iterator>
 23 | #include <algorithm>
 24 | #include "StdExtension.hpp"
 25 | 
 26 | namespace Limonp
 27 | {
 28 |     using namespace std;
 29 | 
 30 |     inline void string_format(string& res, const char* fmt, ...)
 31 |     {
 32 |         int size = 256;
 33 |         va_list ap;
 34 |         res.clear();
 35 |         while (1) {
 36 |             res.resize(size);
 37 |             va_start(ap, fmt);
 38 |             int n = vsnprintf((char *)res.c_str(), size, fmt, ap);
 39 |             va_end(ap);
 40 |             if (n > -1 && n < size) {
 41 |                 res.resize(n);
 42 |                 return;
 43 |             }
 44 |             if (n > -1)
 45 |               size = n + 1;
 46 |             else
 47 |               size *= 2;
 48 |         }
 49 |     }
 50 |     inline string string_format(const char* fmt, ...) 
 51 |     {
 52 |         int size = 256;
 53 |         std::string str;
 54 |         va_list ap;
 55 |         while (1) {
 56 |             str.resize(size);
 57 |             va_start(ap, fmt);
 58 |             int n = vsnprintf((char *)str.c_str(), size, fmt, ap);
 59 |             va_end(ap);
 60 |             if (n > -1 && n < size) {
 61 |                 str.resize(n);
 62 |                 return str;
 63 |             }
 64 |             if (n > -1)
 65 |               size = n + 1;
 66 |             else
 67 |               size *= 2;
 68 |         }
 69 |         return str;
 70 |     }
 71 | 
 72 |     template<class T>
 73 |         void join(T begin, T end, string& res, const string& connector)
 74 |         {
 75 |             if(begin == end)
 76 |             {
 77 |                 return;
 78 |             }
 79 |             stringstream ss;
 80 |             ss<<*begin;
 81 |             begin++;
 82 |             while(begin != end)
 83 |             {
 84 |                 ss << connector << *begin;
 85 |                 begin ++;
 86 |             }
 87 |             res = ss.str();
 88 |         }
 89 | 
 90 |     template<class T>
 91 |         string join(T begin, T end, const string& connector)
 92 |         {
 93 |             string res;
 94 |             join(begin ,end, res, connector);
 95 |             return res;
 96 |         }
 97 | 
 98 | 
 99 | 
100 |     inline bool split(const string& src, vector<string>& res, const string& pattern, size_t offset = 0, size_t len = string::npos)
101 |     {
102 |         if(src.empty())
103 |         {
104 |             return false;
105 |         }
106 |         res.clear();
107 | 
108 |         size_t start = 0;
109 |         size_t end = 0;
110 |         size_t cnt = 0;
111 |         while(start < src.size() && res.size() < len)
112 |         {
113 |             end = src.find_first_of(pattern, start);
114 |             if(string::npos == end)
115 |             {
116 |                 if(cnt >= offset)
117 |                 {
118 |                     res.push_back(src.substr(start));
119 |                 }
120 |                 return true;
121 |             }
122 |             //if(end == src.size() - 1)
123 |             //{
124 |             //    res.push_back("");
125 |             //    return true;
126 |             //}
127 |             if(cnt >= offset)
128 |             {
129 |                 res.push_back(src.substr(start, end - start));
130 |             }
131 |             cnt ++;
132 |             start = end + 1;
133 |         }
134 |         return true;
135 |     }
136 | 
137 |     inline string& upper(string& str)
138 |     {
139 |         transform(str.begin(), str.end(), str.begin(), (int (*)(int))toupper);
140 |         return str;
141 |     }
142 | 
143 |     inline string& lower(string& str)
144 |     {
145 |         transform(str.begin(), str.end(), str.begin(), (int (*)(int))tolower);
146 |         return str;
147 |     }
148 | 
149 |     inline std::string &ltrim(std::string &s) 
150 |     {
151 |         s.erase(s.begin(), std::find_if(s.begin(), s.end(), std::not1(std::ptr_fun<int, int>(std::isspace))));
152 |         return s;
153 |     }
154 | 
155 |     inline std::string &rtrim(std::string &s) 
156 |     {
157 |         s.erase(std::find_if(s.rbegin(), s.rend(), std::not1(std::ptr_fun<int, int>(std::isspace))).base(), s.end());
158 |         return s;
159 |     }
160 | 
161 |     inline std::string &trim(std::string &s) 
162 |     {
163 |         return ltrim(rtrim(s));
164 |     }
165 | 
166 |     inline std::string & ltrim(std::string & s, char x)
167 |     {
168 |         s.erase(s.begin(), std::find_if(s.begin(), s.end(), std::not1(std::bind2nd(std::equal_to<char>(), x))));
169 |         return s;
170 |     }
171 | 
172 |     inline std::string & rtrim(std::string & s, char x)
173 |     {
174 |         s.erase(std::find_if(s.rbegin(), s.rend(), std::not1(std::bind2nd(std::equal_to<char>(), x))).base(), s.end());
175 |         return s;
176 |     }
177 | 
178 |     inline std::string &trim(std::string &s, char x)
179 |     {
180 |         return ltrim(rtrim(s, x), x);
181 |     }
182 | 
183 |     inline bool startsWith(const string& str, const string& prefix)
184 |     {
185 |         if(prefix.length() > str.length())
186 |         {
187 |             return false;
188 |         }
189 |         return 0 == str.compare(0, prefix.length(), prefix);
190 |     }
191 | 
192 |     inline bool endsWith(const string& str, const string& suffix)
193 |     {
194 |         if(suffix.length() > str.length())
195 |         {
196 |             return false;
197 |         }
198 |         return 0 == str.compare(str.length() -  suffix.length(), suffix.length(), suffix);
199 |     }
200 | 
201 |     inline bool isInStr(const string& str, char ch)
202 |     {
203 |         return str.find(ch) != string::npos;
204 |     }
205 | 
206 |     inline uint16_t twocharToUint16(char high, char low)
207 |     {
208 |         return (((uint16_t(high) & 0x00ff ) << 8) | (uint16_t(low) & 0x00ff));
209 |     }
210 | 
211 |     template <class Uint16Container>
212 |     bool utf8ToUnicode(const char * const str, size_t len, Uint16Container& vec)
213 |     {
214 |         if(!str)
215 |         {
216 |             return false;
217 |         }
218 |         char ch1, ch2;
219 |         uint16_t tmp;
220 |         vec.clear();
221 |         for(size_t i = 0;i < len;)
222 |         {
223 |             if(!(str[i] & 0x80)) // 0xxxxxxx
224 |             {
225 |                 vec.push_back(str[i]);
226 |                 i++;
227 |             }
228 |             else if ((unsigned char)str[i] <= 0xdf && i + 1 < len) // 110xxxxxx
229 |             {
230 |                 ch1 = (str[i] >> 2) & 0x07;
231 |                 ch2 = (str[i+1] & 0x3f) | ((str[i] & 0x03) << 6 );
232 |                 tmp = (((uint16_t(ch1) & 0x00ff ) << 8) | (uint16_t(ch2) & 0x00ff));
233 |                 vec.push_back(tmp);
234 |                 i += 2;
235 |             }
236 |             else if((unsigned char)str[i] <= 0xef && i + 2 < len)
237 |             {
238 |                 ch1 = (str[i] << 4) | ((str[i+1] >> 2) & 0x0f );
239 |                 ch2 = ((str[i+1]<<6) & 0xc0) | (str[i+2] & 0x3f); 
240 |                 tmp = (((uint16_t(ch1) & 0x00ff ) << 8) | (uint16_t(ch2) & 0x00ff));
241 |                 vec.push_back(tmp);
242 |                 i += 3;
243 |             }
244 |             else
245 |             {
246 |                 return false;
247 |             }
248 |         }
249 |         return true;
250 |     }
251 |     template <class Uint16Container>
252 |     bool utf8ToUnicode(const string& str, Uint16Container& vec)
253 |     {
254 |         return utf8ToUnicode(str.c_str(), str.size(), vec);
255 |     }
256 | 
257 |     template <class Uint16ContainerConIter>
258 |     bool unicodeToUtf8(Uint16ContainerConIter begin, Uint16ContainerConIter end, string& res)
259 |     {
260 |         if(begin >= end)
261 |         {
262 |             return false;
263 |         }
264 |         res.clear();
265 |         uint16_t ui;
266 |         while(begin != end)
267 |         {
268 |             ui = *begin;
269 |             if(ui <= 0x7f)
270 |             {
271 |                 res += char(ui);
272 |             }
273 |             else if(ui <= 0x7ff)
274 |             {
275 |                 res += char(((ui>>6) & 0x1f) | 0xc0);
276 |                 res += char((ui & 0x3f) | 0x80);
277 |             }
278 |             else
279 |             {
280 |                 res += char(((ui >> 12) & 0x0f )| 0xe0);
281 |                 res += char(((ui>>6) & 0x3f )| 0x80 );
282 |                 res += char((ui & 0x3f) | 0x80);
283 |             }
284 |             begin ++;
285 |         }
286 |         return true;
287 |     }
288 | 
289 |     
290 |     template <class Uint16Container>
291 |     bool gbkTrans(const char* const str, size_t len, Uint16Container& vec)
292 |     {
293 |         vec.clear();
294 |         if(!str)
295 |         {
296 |             return false;
297 |         }
298 |         size_t i = 0;
299 |         while(i < len)
300 |         {
301 |             if(0 == (str[i] & 0x80))
302 |             {
303 |                 vec.push_back(uint16_t(str[i]));
304 |                 i++;
305 |             }
306 |             else
307 |             {
308 |                 if(i + 1 < len) //&& (str[i+1] & 0x80))
309 |                 {
310 |                     uint16_t tmp = (((uint16_t(str[i]) & 0x00ff ) << 8) | (uint16_t(str[i+1]) & 0x00ff));
311 |                     vec.push_back(tmp);
312 |                     i += 2;
313 |                 }
314 |                 else
315 |                 {
316 |                     return false;
317 |                 }
318 |             }
319 |         }
320 |         return true;
321 |     }
322 | 
323 |     template <class Uint16Container>
324 |     bool gbkTrans(const string& str, Uint16Container& vec)
325 |     {
326 |         return gbkTrans(str.c_str(), str.size(), vec);
327 |     }
328 | 
329 |     template <class Uint16ContainerConIter>
330 |     bool gbkTrans(Uint16ContainerConIter begin, Uint16ContainerConIter end, string& res)
331 |     {
332 |         if(begin >= end)
333 |         {
334 |             return false;
335 |         }
336 |         res.clear();
337 |         //pair<char, char> pa;
338 |         char first, second;
339 |         while(begin != end)
340 |         {
341 |             //pa = uint16ToChar2(*begin);
342 |             first = ((*begin)>>8) & 0x00ff;
343 |             second = (*begin) & 0x00ff;
344 |             if(first & 0x80)
345 |             {
346 |                 res += first;
347 |                 res += second;
348 |             }
349 |             else
350 |             {
351 |                 res += second;
352 |             }
353 |             begin++;
354 |         }
355 |         return true;
356 |     }
357 | 
358 |     /*
359 |      * format example: "%Y-%m-%d %H:%M:%S"
360 |      */
361 |     inline void getTime(const string& format, string&  timeStr)
362 |     {
363 |         time_t timeNow;
364 |         time(&timeNow);
365 |         timeStr.resize(64);
366 |         size_t len = strftime((char*)timeStr.c_str(), timeStr.size(), format.c_str(), localtime(&timeNow));
367 |         timeStr.resize(len);
368 |     }
369 | }
370 | #endif
371 | 


--------------------------------------------------------------------------------
/src/CppJieba/MPSegment.hpp:
--------------------------------------------------------------------------------
  1 | /************************************
  2 |  * file enc : ASCII
  3 |  * author   : wuyanyi09@gmail.com
  4 |  ************************************/
  5 | #ifndef CPPJIEBA_MPSEGMENT_H
  6 | #define CPPJIEBA_MPSEGMENT_H
  7 | 
  8 | #include <algorithm>
  9 | #include <set>
 10 | #include <cassert>
 11 | #include "Limonp/Logger.hpp"
 12 | #include "DictTrie.hpp"
 13 | #include "DictTrie.hpp"
 14 | #include "ISegment.hpp"
 15 | #include "SegmentBase.hpp"
 16 | 
 17 | namespace CppJieba
 18 | {
 19 | 
 20 |     struct SegmentChar 
 21 |     {
 22 |         uint16_t uniCh;
 23 |         DagType dag;
 24 |         const DictUnit * pInfo;
 25 |         double weight;
 26 |         size_t nextPos;
 27 |         SegmentChar():uniCh(0), pInfo(NULL), weight(0.0), nextPos(0)
 28 |         {}
 29 |     };
 30 | 
 31 |     class MPSegment: public SegmentBase
 32 |     {
 33 |         protected:
 34 |             DictTrie _dictTrie;
 35 | 
 36 |         public:
 37 |             MPSegment(){};
 38 |             MPSegment(const string& dictPath, const string& userDictPath = "")
 39 |             {
 40 |                 LIMONP_CHECK(init(dictPath, userDictPath));
 41 |             };
 42 |             virtual ~MPSegment(){};
 43 |         public:
 44 |             bool init(const string& dictPath, const string& userDictPath = "")
 45 |             {
 46 |                 LIMONP_CHECK(_dictTrie.init(dictPath, userDictPath));
 47 |                 LogInfo("MPSegment init(%s) ok", dictPath.c_str());
 48 |                 return true;
 49 |             }
 50 |             bool isUserDictSingleChineseWord(const Unicode::value_type & value) const
 51 |             {
 52 |                 return _dictTrie.isUserDictSingleChineseWord(value);
 53 |             }
 54 |         public:
 55 |             using SegmentBase::cut;
 56 |             virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const
 57 |             {
 58 |                 if(begin == end)
 59 |                 {
 60 |                     return false;
 61 |                 }
 62 | 
 63 |                 vector<Unicode> words;
 64 |                 words.reserve(end - begin);
 65 |                 if(!cut(begin, end, words))
 66 |                 {
 67 |                     return false;
 68 |                 }
 69 |                 size_t offset = res.size();
 70 |                 res.resize(res.size() + words.size());
 71 |                 for(size_t i = 0; i < words.size(); i++)
 72 |                 {
 73 |                     if(!TransCode::encode(words[i], res[i + offset]))
 74 |                     {
 75 |                         LogError("encode failed.");
 76 |                         res[i + offset].clear();
 77 |                     }
 78 |                 }
 79 |                 return true;
 80 |             }
 81 | 
 82 |             bool cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector<Unicode>& res) const
 83 |             {
 84 |                 if(end == begin)
 85 |                 {
 86 |                     return false;
 87 |                 }
 88 |                 vector<SegmentChar> segmentChars(end - begin);
 89 | 
 90 |                 //calc DAG
 91 |                 for(size_t i = 0; i < segmentChars.size(); i ++)
 92 |                 {
 93 |                     segmentChars[i].uniCh = *(begin + i);
 94 |                     segmentChars[i].dag.clear();
 95 |                     _dictTrie.find(begin + i, end, segmentChars[i].dag, i);
 96 |                     segmentChars[i].dag.insert(pair<DagType::key_type, DagType::mapped_type>(i, NULL));
 97 |                 }
 98 | 
 99 |                 _calcDP(segmentChars);
100 | 
101 |                 if(!_cut(segmentChars, res))
102 |                 {
103 |                     LogError("_cut failed.");
104 |                     return false;
105 |                 }
106 | 
107 |                 return true;
108 |             }
109 |             const DictTrie* getDictTrie() const 
110 |             {
111 |                 return &_dictTrie;
112 |             }
113 | 
114 |         private:
115 |             void _calcDP(vector<SegmentChar>& SegmentChars) const
116 |             {
117 |                 size_t nextPos;
118 |                 const DictUnit* p;
119 |                 double val;
120 | 
121 |                 for(int i = SegmentChars.size() - 1; i >= 0; i--)
122 |                 {
123 |                     SegmentChars[i].pInfo = NULL;
124 |                     SegmentChars[i].weight = MIN_DOUBLE;
125 |                     for(DagType::const_iterator it = SegmentChars[i].dag.begin(); it != SegmentChars[i].dag.end(); it++)
126 |                     {
127 |                         nextPos = it->first;
128 |                         p = it->second;
129 |                         val = 0.0;
130 |                         if(nextPos + 1 < SegmentChars.size())
131 |                         {
132 |                             val += SegmentChars[nextPos + 1].weight;
133 |                         }
134 | 
135 |                         if(p)
136 |                         {
137 |                             val += p->weight; 
138 |                         }
139 |                         else
140 |                         {
141 |                             val += _dictTrie.getMinWeight();
142 |                         }
143 |                         if(val > SegmentChars[i].weight)
144 |                         {
145 |                             SegmentChars[i].pInfo = p;
146 |                             SegmentChars[i].weight = val;
147 |                         }
148 |                     }
149 |                 }
150 |             }
151 |             bool _cut(const vector<SegmentChar>& segmentChars, vector<Unicode>& res)const
152 |             {
153 |                 size_t i = 0;
154 |                 while(i < segmentChars.size())
155 |                 {
156 |                     const DictUnit* p = segmentChars[i].pInfo;
157 |                     if(p)
158 |                     {
159 |                         res.push_back(p->word);
160 |                         i += p->word.size();
161 |                     }
162 |                     else//single chinese word
163 |                     {
164 |                         res.push_back(Unicode(1, segmentChars[i].uniCh));
165 |                         i++;
166 |                     }
167 |                 }
168 |                 return true;
169 |             }
170 | 
171 | 
172 |     };
173 | }
174 | 
175 | #endif
176 | 


--------------------------------------------------------------------------------
/src/CppJieba/MixSegment.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef CPPJIEBA_MIXSEGMENT_H
  2 | #define CPPJIEBA_MIXSEGMENT_H
  3 | 
  4 | #include <cassert>
  5 | #include "MPSegment.hpp"
  6 | #include "HMMSegment.hpp"
  7 | #include "Limonp/StringUtil.hpp"
  8 | 
  9 | namespace CppJieba
 10 | {
 11 |     class MixSegment: public SegmentBase
 12 |     {
 13 |         private:
 14 |             MPSegment _mpSeg;
 15 |             HMMSegment _hmmSeg;
 16 |         public:
 17 |             MixSegment(){};
 18 |             MixSegment(const string& mpSegDict, const string& hmmSegDict, const string& userDict = "")
 19 |             {
 20 |                 LIMONP_CHECK(init(mpSegDict, hmmSegDict, userDict));
 21 |             }
 22 |             virtual ~MixSegment(){}
 23 |         public:
 24 |             bool init(const string& mpSegDict, const string& hmmSegDict, const string& userDict = "")
 25 |             {
 26 |                 LIMONP_CHECK(_mpSeg.init(mpSegDict, userDict));
 27 |                 LIMONP_CHECK(_hmmSeg.init(hmmSegDict));
 28 |                 LogInfo("MixSegment init(%s, %s)", mpSegDict.c_str(), hmmSegDict.c_str());
 29 |                 return true;
 30 |             }
 31 |         public:
 32 |             using SegmentBase::cut;
 33 |         public:
 34 |             virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const
 35 |             {
 36 |                 vector<Unicode> words;
 37 |                 words.reserve(end - begin);
 38 |                 if(!_mpSeg.cut(begin, end, words))
 39 |                 {
 40 |                     LogError("mpSeg cutDAG failed.");
 41 |                     return false;
 42 |                 }
 43 | 
 44 |                 vector<Unicode> hmmRes;
 45 |                 hmmRes.reserve(end - begin);
 46 |                 Unicode piece;
 47 |                 piece.reserve(end - begin);
 48 |                 for (size_t i = 0, j = 0; i < words.size(); i++)
 49 |                 {
 50 |                     //if mp get a word, it's ok, put it into result
 51 |                     if (1 != words[i].size() || (words[i].size() == 1 && _mpSeg.isUserDictSingleChineseWord(words[i][0])))
 52 |                     {
 53 |                         res.push_back(words[i]);
 54 |                         continue;
 55 |                     }
 56 | 
 57 |                     // if mp get a single one and it is not in userdict, collect it in sequence
 58 |                     j = i;
 59 |                     while (j < words.size() && 1 == words[j].size() && !_mpSeg.isUserDictSingleChineseWord(words[j][0]))
 60 |                     {
 61 |                         piece.push_back(words[j][0]);
 62 |                         j++;
 63 |                     }
 64 | 
 65 |                     // cut the sequence with hmm
 66 |                     if (!_hmmSeg.cut(piece.begin(), piece.end(), hmmRes))
 67 |                     {
 68 |                         LogError("_hmmSeg cut failed.");
 69 |                         return false;
 70 |                     }
 71 | 
 72 |                     //put hmm result to result
 73 |                     for (size_t k = 0; k < hmmRes.size(); k++)
 74 |                     {
 75 |                         res.push_back(hmmRes[k]);
 76 |                     }
 77 | 
 78 |                     //clear tmp vars
 79 |                     piece.clear();
 80 |                     hmmRes.clear();
 81 | 
 82 |                     //let i jump over this piece
 83 |                     i = j - 1;
 84 |                 }
 85 |                 return true;
 86 |             }
 87 | 
 88 |             virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const
 89 |             {
 90 |                 if(begin == end)
 91 |                 {
 92 |                     return false;
 93 |                 }
 94 | 
 95 |                 vector<Unicode> uRes;
 96 |                 uRes.reserve(end - begin);
 97 |                 if (!cut(begin, end, uRes))
 98 |                 {
 99 |                     return false;
100 |                 }
101 | 
102 |                 size_t offset = res.size();
103 |                 res.resize(res.size() + uRes.size());
104 |                 for(size_t i = 0; i < uRes.size(); i ++, offset++)
105 |                 {
106 |                     if(!TransCode::encode(uRes[i], res[offset]))
107 |                     {
108 |                         LogError("encode failed.");
109 |                     }
110 |                 }
111 |                 return true;
112 |             }
113 | 
114 |             const DictTrie* getDictTrie() const 
115 |             {
116 |                 return _mpSeg.getDictTrie();
117 |             }
118 |     };
119 | }
120 | 
121 | #endif
122 | 


--------------------------------------------------------------------------------
/src/CppJieba/PosTagger.hpp:
--------------------------------------------------------------------------------
 1 | #ifndef CPPJIEBA_POS_TAGGING_H
 2 | #define CPPJIEBA_POS_TAGGING_H
 3 | 
 4 | #include "MixSegment.hpp"
 5 | #include "Limonp/StringUtil.hpp"
 6 | #include "DictTrie.hpp"
 7 | 
 8 | namespace CppJieba
 9 | {
10 |     using namespace Limonp;
11 | 
12 |     class PosTagger
13 |     {
14 |         private:
15 |             MixSegment _segment;
16 |             DictTrie _dictTrie;
17 | 
18 |         public:
19 |             PosTagger(){};
20 |             PosTagger(const string& dictPath, const string& hmmFilePath, const string& charStatus, const string& startProb, const string& emitProb, const string& endProb, const string& transProb)
21 |             {
22 |                 LIMONP_CHECK(init(dictPath, hmmFilePath, charStatus, startProb, emitProb, endProb, transProb));
23 |             };
24 |             ~PosTagger(){};
25 |         public:
26 |             bool init(const string& dictPath, const string& hmmFilePath, const string& charStatus, const string& startProb, const string& emitProb, const string& endProb, const string& transProb)
27 |             {
28 |                 LIMONP_CHECK(_dictTrie.init(dictPath));
29 |                 LIMONP_CHECK(_segment.init(dictPath, hmmFilePath));
30 |                 return true;
31 |             };
32 | 
33 |             bool tag(const string& src, vector<pair<string, string> >& res)
34 |             {
35 |                 vector<string> cutRes;
36 |                 if (!_segment.cut(src, cutRes))
37 |                 {
38 |                     LogError("_mixSegment cut failed");
39 |                     return false;
40 |                 }
41 | 
42 |                 const DictUnit *tmp = NULL;
43 |                 Unicode unico;
44 |                 for (vector<string>::iterator itr = cutRes.begin(); itr != cutRes.end(); ++itr)
45 |                 {
46 |                     if (!TransCode::decode(*itr, unico))
47 |                     {
48 |                         LogError("decode failed.");
49 |                         return false;
50 |                     }
51 |                     tmp = _dictTrie.find(unico.begin(), unico.end());
52 |                     res.push_back(make_pair(*itr, tmp == NULL ? "x" : tmp->tag));
53 |                 }
54 |                 tmp = NULL;
55 |                 return !res.empty();
56 |             }
57 |     };
58 | }
59 | 
60 | #endif
61 | 


--------------------------------------------------------------------------------
/src/CppJieba/QuerySegment.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef CPPJIEBA_QUERYSEGMENT_H
  2 | #define CPPJIEBA_QUERYSEGMENT_H
  3 | 
  4 | #include <algorithm>
  5 | #include <set>
  6 | #include <cassert>
  7 | #include "Limonp/Logger.hpp"
  8 | #include "DictTrie.hpp"
  9 | #include "ISegment.hpp"
 10 | #include "SegmentBase.hpp"
 11 | #include "FullSegment.hpp"
 12 | #include "MixSegment.hpp"
 13 | #include "TransCode.hpp"
 14 | #include "DictTrie.hpp"
 15 | 
 16 | namespace CppJieba
 17 | {
 18 |     class QuerySegment: public SegmentBase
 19 |     {
 20 |     private:
 21 |         MixSegment _mixSeg;
 22 |         FullSegment _fullSeg;
 23 |         size_t _maxWordLen;
 24 | 
 25 |     public:
 26 |         QuerySegment(){};
 27 |         QuerySegment(const string& dict, const string& model, size_t maxWordLen)
 28 |         {
 29 |             init(dict, model, maxWordLen);
 30 |         };
 31 |         virtual ~QuerySegment(){};
 32 |     public:
 33 |         bool init(const string& dict, const string& model, size_t maxWordLen)
 34 |         {
 35 |             LIMONP_CHECK(_mixSeg.init(dict, model));
 36 |             LIMONP_CHECK(_fullSeg.init(_mixSeg.getDictTrie()));
 37 |             assert(maxWordLen);
 38 |             _maxWordLen = maxWordLen;
 39 |             return true;
 40 |         }
 41 | 
 42 |     public:
 43 |         using SegmentBase::cut;
 44 | 
 45 |     public:
 46 |         bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const
 47 |         {
 48 |             if (begin >= end)
 49 |             {
 50 |                 LogError("begin >= end");
 51 |                 return false;
 52 |             }
 53 | 
 54 |             //use mix cut first
 55 |             vector<Unicode> mixRes;
 56 |             if (!_mixSeg.cut(begin, end, mixRes))
 57 |             {
 58 |                 LogError("_mixSeg cut failed.");
 59 |                 return false;
 60 |             }
 61 | 
 62 |             vector<Unicode> fullRes;
 63 |             for (vector<Unicode>::const_iterator mixResItr = mixRes.begin(); mixResItr != mixRes.end(); mixResItr++)
 64 |             {
 65 |                 
 66 |                 // if it's too long, cut with _fullSeg, put fullRes in res
 67 |                 if (mixResItr->size() > _maxWordLen)
 68 |                 {
 69 |                     if (_fullSeg.cut(mixResItr->begin(), mixResItr->end(), fullRes))
 70 |                     {
 71 |                        for (vector<Unicode>::const_iterator fullResItr = fullRes.begin(); fullResItr != fullRes.end(); fullResItr++)
 72 |                        {
 73 |                            res.push_back(*fullResItr);
 74 |                        }
 75 | 
 76 |                        //clear tmp res
 77 |                        fullRes.clear();
 78 |                     }
 79 |                 }
 80 |                 else // just use the mix result
 81 |                 {
 82 |                     res.push_back(*mixResItr);
 83 |                 }
 84 |             }
 85 | 
 86 |             return true;
 87 |         }
 88 | 
 89 | 
 90 |         bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res) const
 91 |         {
 92 |             // String-returning overload: segments [begin, end) through the Unicode overload and
 93 |             // appends every word, encoded with TransCode::encode, to res.
 94 |             if (begin >= end)
 95 |             {
 96 |                 LogError("begin >= end");
 97 |                 return false;
 98 |             }
 99 | 
100 |             vector<Unicode> unicodeWords;
101 |             if (!cut(begin, end, unicodeWords))
102 |             {
103 |                 LogError("get unicode cut result error.");
104 |                 return false;
105 |             }
106 | 
107 |             string encoded;
108 |             for (vector<Unicode>::const_iterator word = unicodeWords.begin(); word != unicodeWords.end(); ++word)
109 |             {
110 |                 if (!TransCode::encode(*word, encoded))
111 |                 {
112 |                     LogError("encode failed."); // the word is skipped; the call still succeeds
113 |                     continue;
114 |                 }
115 |                 res.push_back(encoded);
116 |             }
117 | 
118 |             return true;
119 |         }
120 |     };
121 | }
122 | 
123 | #endif
124 | 


--------------------------------------------------------------------------------
/src/CppJieba/SegmentBase.hpp:
--------------------------------------------------------------------------------
 1 | #ifndef CPPJIEBA_SEGMENTBASE_H
 2 | #define CPPJIEBA_SEGMENTBASE_H
 3 | 
 4 | #include "TransCode.hpp"
 5 | #include "Limonp/Logger.hpp"
 6 | #include "Limonp/NonCopyable.hpp"
 7 | #include "Limonp/HandyMacro.hpp"
 8 | #include "ISegment.hpp"
 9 | #include <cassert>
10 | 
11 | 
12 | namespace CppJieba
13 | {
14 |     using namespace Limonp;
15 | 
16 |     // Values that always split the input; SegmentBase::cut emits each one as an entry of its own.
17 | #ifndef CPPJIEBA_GBK
18 |     const UnicodeValueType SPECIAL_SYMBOL[] = {32u, 9u, 10u, 12290u, 65292u};  // space, tab, newline, U+3002 (ideographic full stop), U+FF0C (fullwidth comma)
19 | #else
20 |     const UnicodeValueType SPECIAL_SYMBOL[] = {32u, 9u, 10u};  // space, tab, newline
21 | #endif
22 | 
23 |     // Base class of the segmenters: splits decoded input at the SPECIAL_SYMBOL characters and
24 |     // hands every run in between to the iterator-based cut() implemented by subclasses.
25 |     class SegmentBase: public ISegment, public NonCopyable
26 |     {
27 |         public:
28 |             SegmentBase(){_loadSpecialSymbols();}
29 |             virtual ~SegmentBase(){}
30 |         private:
31 |             unordered_set<UnicodeValueType> _specialSymbols; // values that always split the input
32 |             // Fills _specialSymbols from the SPECIAL_SYMBOL table.
33 |             void _loadSpecialSymbols()
34 |             {
35 |                 const size_t size = sizeof(SPECIAL_SYMBOL)/sizeof(*SPECIAL_SYMBOL);
36 |                 _specialSymbols.insert(SPECIAL_SYMBOL, SPECIAL_SYMBOL + size);
37 |                 assert(_specialSymbols.size());
38 |             }
39 |         public:
40 |             virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res) const = 0;
41 |             // Segments str into res (cleared first); special symbols become entries of their own.
42 |             // Returns false when str cannot be decoded, true otherwise.
43 |             virtual bool cut(const string& str, vector<string>& res) const
44 |             {
45 |                 res.clear();
46 |                 Unicode unicode;
47 |                 unicode.reserve(str.size());
48 |                 if(!TransCode::decode(str, unicode))
49 |                 {
50 |                     // Fix: the decode result was ignored, so broken input was reported as success.
51 |                     LogError("decode failed.");
52 |                     return false;
53 |                 }
54 |                 Unicode::const_iterator left = unicode.begin();
55 |                 Unicode::const_iterator right = unicode.begin();
56 |                 for(; right != unicode.end(); right++)
57 |                 {
58 |                     if(isIn(_specialSymbols, *right))
59 |                     {
60 |                         if(left != right)
61 |                         {
62 |                             cut(left, right, res); // flush the run that precedes the symbol
63 |                         }
64 |                         res.resize(res.size() + 1);
65 |                         TransCode::encode(right, right + 1, res.back());
66 |                         left = right + 1;
67 |                     }
68 |                 }
69 |                 if(left != right)
70 |                 {
71 |                     cut(left, right, res); // trailing run after the last symbol
72 |                 }
73 |                 return true;
74 |             }
75 |     };
76 | }
77 | 
78 | #endif
79 | 


--------------------------------------------------------------------------------
/src/CppJieba/TransCode.hpp:
--------------------------------------------------------------------------------
 1 | /************************************
 2 |  * file enc : utf-8
 3 |  * author   : wuyanyi09@gmail.com
 4 |  ************************************/
 5 | #ifndef CPPJIEBA_TRANSCODE_H
 6 | #define CPPJIEBA_TRANSCODE_H
 7 | 
 8 | 
 9 | #include "Limonp/StringUtil.hpp"
10 | #include "Limonp/LocalVector.hpp"
11 | 
12 | namespace CppJieba
13 | {
14 | 
15 |     using namespace Limonp;
16 |     typedef uint16_t UnicodeValueType;
17 |     typedef Limonp::LocalVector<UnicodeValueType> Unicode;
18 |     namespace TransCode
19 |     {   // Thin codec wrappers: GBK when CPPJIEBA_GBK is defined, UTF-8 otherwise.
20 |         inline bool decode(const string& str, Unicode& res) // decodes str into res
21 |         {
22 | #ifndef CPPJIEBA_GBK
23 |             return utf8ToUnicode(str, res);
24 | #else
25 |             return gbkTrans(str, res);
26 | #endif
27 |         }
28 |         // Encodes the values in [begin, end) into res with the same codec selection.
29 |         inline bool encode(Unicode::const_iterator begin, Unicode::const_iterator end, string& res)
30 |         {
31 | #ifndef CPPJIEBA_GBK
32 |             return unicodeToUtf8(begin, end, res);
33 | #else
34 |             return gbkTrans(begin, end, res);
35 | #endif
36 |         }
37 |         // Convenience overload: encodes a whole Unicode container.
38 |         inline bool encode(const Unicode& uni, string& res)
39 |         {
40 |             return encode(uni.begin(), uni.end(), res);
41 |         }
42 |     }
43 | }
44 | 
45 | #endif
46 | 


--------------------------------------------------------------------------------
/src/CppJieba/Trie.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef CPPJIEBA_TRIE_HPP
  2 | #define CPPJIEBA_TRIE_HPP
  3 | 
  4 | #include "Limonp/StdExtension.hpp"
  5 | #include <vector>
  6 | 
  7 | namespace CppJieba
  8 | {
  9 |     using namespace std;
 10 |     // One node of the trie: child links keyed by KeyType plus an optional value pointer.
 11 |     template <class KeyType, class ValueType>
 12 |         class TrieNode
 13 |         {
 14 |             public:
 15 |                 typedef unordered_map<KeyType, TrieNode<KeyType, ValueType>* > KeyMapType;
 16 |                 KeyMapType * ptKeyMap;     // children; Trie leaves this NULL until a child is added
 17 |                 const ValueType * ptValue; // value of the key ending at this node; NULL if none
 18 |         };
 19 | 
 20 |     template <class KeyType, class ValueType, class KeyContainerType = vector<KeyType>, class KeysContainerType = vector<KeyContainerType>, class ValueContainerType = vector<const ValueType* > >
 21 |         class Trie
 22 |         {
 23 |             public:
 24 |                 typedef TrieNode<KeyType, ValueType> TrieNodeType;
 25 |             private:
 26 |                 TrieNodeType* _root; // owned; released recursively by ~Trie()
 27 |                 // Fix: copying would leave two tries deleting the same nodes, so it is disabled.
 28 |                 Trie(const Trie&);
 29 |                 Trie& operator=(const Trie&);
 30 |             public:
 31 |                 // Builds a trie mapping keys[i] to valuePointers[i]; only the pointers are stored,
 32 |                 // the values themselves are neither copied nor deleted by the trie.
 33 |                 Trie(const KeysContainerType& keys, const ValueContainerType& valuePointers)
 34 |                 {
 35 |                     _root = new TrieNodeType;
 36 |                     _root->ptKeyMap = NULL;
 37 |                     _root->ptValue = NULL;
 38 |                     _createTrie(keys, valuePointers);
 39 |                 }
 40 |                 ~Trie()
 41 |                 {
 42 |                     _deleteNode(_root); // a NULL root is a no-op in _deleteNode
 43 |                 }
 44 |                 // Exact lookup: value stored for the key [begin, end), or NULL if there is none.
 45 |                 const ValueType* find(typename KeyContainerType::const_iterator begin, typename KeyContainerType::const_iterator end) const
 46 |                 {
 47 |                     typename TrieNodeType::KeyMapType::const_iterator citer;
 48 |                     const TrieNodeType* ptNode = _root;
 49 |                     for(typename KeyContainerType::const_iterator it = begin; it != end; it++)
 50 |                     {
 51 |                         assert(ptNode);
 52 |                         if(NULL == ptNode->ptKeyMap || ptNode->ptKeyMap->end() == (citer = ptNode->ptKeyMap->find(*it)))
 53 |                         {
 54 |                             return NULL;
 55 |                         }
 56 |                         ptNode = citer->second;
 57 |                     }
 58 |                     return ptNode->ptValue;
 59 |                 }
 60 |                 // Prefix lookup: for every stored key that is a prefix of [begin, end), records
 61 |                 // (index of its last element + offset) -> value in ordererMap (cleared first).
 62 |                 // Returns true when at least one key matched.
 63 |                 bool find(typename KeyContainerType::const_iterator begin, typename KeyContainerType::const_iterator end, map<typename KeyContainerType::size_type, const ValueType* >& ordererMap, size_t offset = 0) const
 64 |                 {
 65 |                     const TrieNodeType * ptNode = _root;
 66 |                     typename TrieNodeType::KeyMapType::const_iterator citer;
 67 |                     ordererMap.clear();
 68 |                     for(typename KeyContainerType::const_iterator itr = begin; itr != end ; itr++)
 69 |                     {
 70 |                         assert(ptNode);
 71 |                         if(NULL == ptNode->ptKeyMap || ptNode->ptKeyMap->end() == (citer = ptNode->ptKeyMap->find(*itr)))
 72 |                         {
 73 |                             break;
 74 |                         }
 75 |                         ptNode = citer->second;
 76 |                         if(ptNode->ptValue)
 77 |                         {
 78 |                             ordererMap[itr - begin + offset] = ptNode->ptValue;
 79 |                         }
 80 |                     }
 81 |                     return !ordererMap.empty(); // explicit bool instead of converting size()
 82 |                 }
 83 |             private:
 84 |                 // Inserts every (keys[i], valuePointers[i]) pair; no-op when either container is empty.
 85 |                 void _createTrie(const KeysContainerType& keys, const ValueContainerType& valuePointers)
 86 |                 {
 87 |                     if(valuePointers.empty() || keys.empty())
 88 |                     {
 89 |                         return;
 90 |                     }
 91 |                     assert(keys.size() == valuePointers.size());
 92 |                     for(size_t i = 0; i < keys.size(); i++)
 93 |                     {
 94 |                         _insertNode(keys[i], valuePointers[i]);
 95 |                     }
 96 |                 }
 97 |                 // Creates the missing nodes along key and stores ptValue at the last one,
 98 |                 // replacing any value stored there before.
 99 |                 void _insertNode(const KeyContainerType& key, const ValueType* ptValue)
100 |                 {
101 |                     TrieNodeType* ptNode  = _root;
102 |                     for(typename KeyContainerType::const_iterator citer = key.begin(); citer != key.end(); citer++)
103 |                     {
104 |                         if(NULL == ptNode->ptKeyMap)
105 |                         {
106 |                             ptNode->ptKeyMap = new typename TrieNodeType::KeyMapType;
107 |                         }
108 |                         typename TrieNodeType::KeyMapType::const_iterator kmIter = ptNode->ptKeyMap->find(*citer);
109 |                         if(ptNode->ptKeyMap->end() != kmIter)
110 |                         {
111 |                             ptNode = kmIter->second;
112 |                             continue;
113 |                         }
114 |                         TrieNodeType * nextNode = new TrieNodeType;
115 |                         nextNode->ptKeyMap = NULL;
116 |                         nextNode->ptValue = NULL;
117 |                         (*ptNode->ptKeyMap)[*citer] = nextNode;
118 |                         ptNode = nextNode;
119 |                     }
120 |                     ptNode->ptValue = ptValue;
121 |                 }
122 |                 // Frees node together with its child map and all descendants; NULL is ignored.
123 |                 void _deleteNode(TrieNodeType* node)
124 |                 {
125 |                     if(!node)
126 |                     {
127 |                         return;
128 |                     }
129 |                     if(node->ptKeyMap)
130 |                     {
131 |                         typename TrieNodeType::KeyMapType::iterator it;
132 |                         for(it = node->ptKeyMap->begin(); it != node->ptKeyMap->end(); it++)
133 |                         {
134 |                             _deleteNode(it->second);
135 |                         }
136 |                         delete node->ptKeyMap;
137 |                     }
138 |                     delete node;
139 |                 }
140 |         };
141 | }
142 | 
143 | #endif
144 | 


--------------------------------------------------------------------------------
/src/config:
--------------------------------------------------------------------------------
1 | ngx_addon_name=ngx_http_cppjieba_module # addon name shown by nginx's ./configure
2 | HTTP_MODULES="$HTTP_MODULES ngx_http_cppjieba_module" # append this module to the HTTP module list
3 | NGX_ADDON_SRCS="$NGX_ADDON_SRCS $ngx_addon_dir/ngx_http_cppjieba_module.cpp" # source file built into nginx
4 | 


--------------------------------------------------------------------------------
/src/ngx_http_cppjieba_module.cpp:
--------------------------------------------------------------------------------
  1 | extern "C" {
  2 | #include <ngx_config.h>
  3 | #include <ngx_core.h>
  4 | #include <ngx_http.h>
  5 | }
  6 | 
  7 | #include "CppJieba/MixSegment.hpp"
  8 | 
  9 | using std::string;
 10 | using std::vector;
 11 | 
 12 | //static ngx_str_t g_cppjieba_conf_arg1;
 13 | //static ngx_str_t g_cppjieba_conf_arg2;
 14 | 
 15 | inline unsigned char fromHex(unsigned char x) // value of hex digit x (either case); 0 if x is not one
 16 | {
 17 |     return isdigit(x) ? x - '0' : isxdigit(x) ? toupper(x) - 'A' + 10 : 0; // fix: 'a'..'f' gave garbage
 18 | }
 19 | /*
 20 | inline unsigned char toHex(unsigned char x)
 21 | {
 22 |     return x > 9 ? x -10 + 'A': x + '0';
 23 | }
 24 | inline void URLEncode(const string &sIn, string& sOut)
 25 | {
 26 |     for( size_t ix = 0; ix < sIn.size(); ix++ )
 27 |     {      
 28 |         unsigned char buf[4];
 29 |         memset( buf, 0, 4 );
 30 |         if( isalnum( (unsigned char)sIn[ix] ) )
 31 |         {      
 32 |             buf[0] = sIn[ix];
 33 |         }
 34 |         else
 35 |         {
 36 |             buf[0] = '%';
 37 |             buf[1] = toHex( (unsigned char)sIn[ix] >> 4 );
 38 |             buf[2] = toHex( (unsigned char)sIn[ix] % 16);
 39 |         }
 40 |         sOut += (char *)buf;
 41 |     }
 42 | };
 43 | */
 44 | 
 45 | static void URLDecode(const string &sIn, string& sOut)
 46 | {
 47 |     // Appends the percent-decoded form of sIn to sOut; '+' becomes a space. Fix: a '%' that is not
 48 |     // followed by two hex digits is copied literally instead of reading past the end of sIn.
 49 |     for( size_t ix = 0; ix < sIn.size(); ix++ )
 50 |     {
 51 |         unsigned char ch = sIn[ix];
 52 |         if(ch == '%' && ix + 2 < sIn.size()
 53 |            && isxdigit((unsigned char)sIn[ix+1]) && isxdigit((unsigned char)sIn[ix+2]))
 54 |         {
 55 |             // toupper() lets lowercase escapes such as %e4 go through fromHex() correctly.
 56 |             ch = (fromHex(toupper((unsigned char)sIn[ix+1]))<<4);
 57 |             ch |= fromHex(toupper((unsigned char)sIn[ix+2]));
 58 |             ix += 2;
 59 |         }
 60 |         else if(ch == '+')
 61 |         {
 62 |             ch = ' ';
 63 |         }
 64 |         sOut += (char)ch;
 65 |     }
 66 | }
 67 | CppJieba::MixSegment * g_mix_segment;// global segmenter used by the request handlers; created in ngx_http_cppjieba_set_conf
 68 | // Per-location configuration of the module.
 69 | typedef struct {
 70 |     ngx_str_t output_words; // set through ngx_conf_set_str_slot; merged with default "Nginx"
 71 | } ngx_http_cppjieba_loc_conf_t;
 72 | 
 73 | // Handler of the "cppjieba" directive: installs the content handler and creates the segmenter
 74 | static char* ngx_http_cppjieba_set_conf(ngx_conf_t* cf, ngx_command_t* cmd, void* conf);
 75 | 
 76 | // Allocates the per-location configuration structure
 77 | static void* ngx_http_cppjieba_create_loc_conf(ngx_conf_t* cf);
 78 | 
 79 | // Merges a location configuration with the one of the enclosing level
 80 | static char* ngx_http_cppjieba_merge_loc_conf(ngx_conf_t* cf, void* parent, void* child);
 81 | 
 82 | //static ngx_int_t ngx_http_cppjieba_init(ngx_cycle_t *cf);
 83 | //static void ngx_http_cppjieba_finalize(ngx_cycle_t *cf);
 84 | // Copies the request body chain into data_buf; content_length is the number of bytes expected
 85 | static ngx_int_t get_post_content(ngx_http_request_t *r, char * data_buf, size_t content_length);
 86 | // Directives provided by the module: "cppjieba" with three arguments, valid in location blocks
 87 | static ngx_command_t ngx_http_cppjieba_commands[] = {
 88 |     {
 89 |         ngx_string("cppjieba"), // The command name
 90 |         NGX_HTTP_LOC_CONF | NGX_CONF_TAKE3, // location context, exactly three arguments
 91 |         ngx_http_cppjieba_set_conf, // The command handler
 92 |         NGX_HTTP_LOC_CONF_OFFSET, // the handler receives the location configuration
 93 |         offsetof(ngx_http_cppjieba_loc_conf_t, output_words), // field written by ngx_conf_set_str_slot
 94 |         NULL // no post-processing data
 95 |     },
 96 |     ngx_null_command // terminator
 97 | };
 98 | 
 99 | // HTTP module context: only the location-configuration callbacks are provided
100 | static ngx_http_module_t ngx_http_cppjieba_module_ctx = {
101 |     NULL, // preconfiguration
102 |     NULL, // postconfiguration
103 |     NULL, // create main configuration
104 |     NULL, // init main configuration
105 |     NULL, // create server configuration
106 |     NULL, // merge server configuration
107 |     ngx_http_cppjieba_create_loc_conf, // create location configuration
108 |     ngx_http_cppjieba_merge_loc_conf // merge location configuration
109 | };
110 | 
111 | // Module definition: ties the context and the directive table together
112 | ngx_module_t ngx_http_cppjieba_module = {
113 |     NGX_MODULE_V1,
114 |     &ngx_http_cppjieba_module_ctx, // module context
115 |     ngx_http_cppjieba_commands, // module directives
116 |     NGX_HTTP_MODULE, // module type
117 |     NULL, // init master
118 |     NULL, // init module
119 |     NULL, // init process (ngx_http_cppjieba_init is disabled)
120 |     NULL, // init thread
121 |     NULL, // exit thread
122 |     NULL, // exit process (ngx_http_cppjieba_finalize is disabled)
123 |     NULL, // exit master
124 |     NGX_MODULE_V1_PADDING
125 | };
126 | 
127 | static void ngx_http_cppjieba_post_handler(ngx_http_request_t* r);
128 | 
129 | // Defined further down; declared here so that the GET path can share it with the POST path.
130 | static ngx_int_t ngx_http_cppjieba_send_response(ngx_http_request_t * r, const char* type, const char* data_buf, size_t len);
131 | 
132 | // Content handler installed by the "cppjieba" directive.
133 | //   POST: the body is read asynchronously and segmented in ngx_http_cppjieba_post_handler.
134 | //   GET : the URL-decoded value of the "s" query argument is segmented and sent as text/plain.
135 | //   Other methods are answered with 405, a GET without "s" with 400.
136 | static ngx_int_t ngx_http_cppjieba_handler(ngx_http_request_t* r) {
137 |     if(r->method & NGX_HTTP_POST) {
138 |         ngx_int_t rc = ngx_http_read_client_request_body(r, ngx_http_cppjieba_post_handler);
139 |         if (rc >= NGX_HTTP_SPECIAL_RESPONSE) {
140 |             return rc;
141 |         }
142 |         return NGX_DONE;
143 |     }
144 | 
145 |     if(!(r->method & NGX_HTTP_GET)) {
146 |         return NGX_HTTP_NOT_ALLOWED;
147 |     }
148 | 
149 |     // Fix: a handler that does not consume the request body has to discard it explicitly.
150 |     ngx_int_t rc = ngx_http_discard_request_body(r);
151 |     if (rc != NGX_OK) {
152 |         return rc;
153 |     }
154 | 
155 |     // args is s=xxxxx
156 |     ngx_str_t value;
157 |     if (NGX_OK != ngx_http_arg(r, (u_char*)"s", 1, &value)) {
158 |         return NGX_HTTP_BAD_REQUEST;
159 |     }
160 | 
161 |     string sentence;
162 |     URLDecode(string((char*)value.data, value.len), sentence);
163 |     vector<string> words;
164 |     if (!g_mix_segment->cut(sentence, words)) {
165 |         // The result used to be ignored silently; still answer with what was produced, but log it.
166 |         ngx_log_error(NGX_LOG_ERR, r->connection->log, 0, "[ngx_http_cppjieba_handler] cut failed");
167 |     }
168 |     string response;
169 |     response << words;
170 | 
171 |     // Fix: reuse the helper of the POST path instead of a duplicated copy of the send logic.
172 |     return ngx_http_cppjieba_send_response(r, "text/plain", response.c_str(), response.size());
173 | }
187 | 
188 | // Allocates the per-location configuration of the cppjieba module.
189 | // Fix: returns NULL on allocation failure. nginx compares the result of create_loc_conf() with
190 | // NULL, so the NGX_CONF_ERROR returned before would have been used as a valid pointer.
191 | static void* ngx_http_cppjieba_create_loc_conf(ngx_conf_t* cf) {
192 |     void* conf = ngx_pcalloc(cf->pool, sizeof(ngx_http_cppjieba_loc_conf_t));
193 |     if (conf == NULL) {
194 |         return NULL;
195 |     }
196 |     // ngx_pcalloc() zero-fills the structure, so output_words already is { 0, NULL },
197 |     // which is the "unset" state the merge step looks for.
198 |     return conf;
199 | }
200 | 
201 | static char* ngx_http_cppjieba_merge_loc_conf(ngx_conf_t* cf, void* parent, void* child) {
202 |     ngx_http_cppjieba_loc_conf_t* outer = static_cast<ngx_http_cppjieba_loc_conf_t*>(parent);
203 |     ngx_http_cppjieba_loc_conf_t* inner = static_cast<ngx_http_cppjieba_loc_conf_t*>(child);
204 |     ngx_conf_merge_str_value(inner->output_words, outer->output_words, "Nginx"); // unset here: inherit the outer value, else "Nginx"
205 |     return NGX_CONF_OK;
206 | }
207 | 
208 | static char* ngx_http_cppjieba_set_conf(ngx_conf_t* cf, ngx_command_t* cmd, void* conf) {
209 |     // Directive handler: installs the content handler and (re)builds the global segmenter.
210 |     ngx_http_core_loc_conf_t* clcf = (ngx_http_core_loc_conf_t*)ngx_http_conf_get_module_loc_conf(cf, ngx_http_core_module);
211 |     clcf->handler = ngx_http_cppjieba_handler;
212 |     ngx_conf_set_str_slot(cf, cmd, conf);
213 |     if (cf->args->nelts != 4) {
214 |         ngx_log_error(NGX_LOG_ERR, cf->log, 0, " [the number of conf'a args is not 4] ");
215 |         return (char*)NGX_CONF_ERROR;
216 |     }
217 |     ngx_str_t * value = (ngx_str_t *)cf->args->elts;
218 |     CppJieba::MixSegment * fresh = new CppJieba::MixSegment(
219 |                 string((const char *)value[1].data, value[1].len), string((const char *)value[2].data, value[2].len),
220 |                 string((const char *)value[3].data, value[3].len));
221 |     delete g_mix_segment; // fix: a segmenter from an earlier directive or reload used to leak
222 |     g_mix_segment = fresh; // swapped in only after the new one was built successfully
223 |     return NGX_CONF_OK;
224 | }
225 | 
226 | //static ngx_int_t ngx_http_cppjieba_init(ngx_cycle_t *cf) 
227 | //{
228 | //    g_mix_segment = new CppJieba::MixSegment(
229 | //                string((const char *)g_cppjieba_conf_arg1.data, g_cppjieba_conf_arg1.len), 
230 | //                string((const char *)g_cppjieba_conf_arg2.data, g_cppjieba_conf_arg2.len));
231 | //    return NGX_OK;
232 | //}
233 | 
234 | //static void ngx_http_cppjieba_finalize(ngx_cycle_t *cf)
235 | //{
236 | //    delete g_mix_segment;
237 | //    g_mix_segment = NULL;
238 | //}
239 | 
240 | // Copies the request body chain into data_buf (at least content_length bytes large).
241 | // Returns NGX_OK when exactly content_length bytes were copied, NGX_ERROR otherwise.
242 | static ngx_int_t get_post_content(ngx_http_request_t *r, char * data_buf, size_t content_length) {
243 |     ngx_log_error(NGX_LOG_NOTICE, r->connection->log, 0, "[get_post_content] [content_length:%uz]", content_length); // fix: "%uz" is nginx's size_t conversion, "%d" expects an int
244 |     if(r->request_body == NULL) {
245 |         ngx_log_error(NGX_LOG_ERR, r->connection->log, 0, "reqeust_body:null");
246 |         return NGX_ERROR;
247 |     }
248 |     size_t body_length = 0;
249 |     for(ngx_chain_t* link = r->request_body->bufs; link != NULL && body_length < content_length; link = link->next) {
250 |         ngx_buf_t* buf = link->buf;
251 |         if(buf->in_file && !ngx_buf_in_memory(buf)) { // fix: a body spooled to a file has no pos/last data to copy
252 |             ngx_log_error(NGX_LOG_ERR, r->connection->log, 0, "get_post_content: request body is not in memory");
253 |             return NGX_ERROR;
254 |         }
255 |         size_t chunk = buf->last - buf->pos;
256 |         if(chunk > content_length - body_length) {
257 |             chunk = content_length - body_length; // never write past the end of data_buf
258 |         }
259 |         memcpy(data_buf + body_length, buf->pos, chunk);
260 |         body_length += chunk;
261 |     }
262 |     if(body_length != content_length) {
263 |         ngx_log_error(NGX_LOG_ERR, r->connection->log, 0, "get_post_content's body_length != content_length in headers");
264 |         return NGX_ERROR;
265 |     }
266 |     return NGX_OK;
267 | }
268 | 
269 | 
270 | // Sends the len bytes at data_buf as a complete "200 OK" response with the given content type.
271 | // The payload is copied into a buffer from the request pool; type is stored by pointer only.
272 | static ngx_int_t ngx_http_cppjieba_send_response(ngx_http_request_t * r, const char* type, const char* data_buf, size_t len) {
273 |     ngx_buf_t* body = ngx_create_temp_buf(r->pool, len);
274 |     if (body == NULL) {
275 |         return NGX_HTTP_INTERNAL_SERVER_ERROR;
276 |     }
277 | 
278 |     ngx_memcpy(body->pos, data_buf, len);
279 |     body->last = body->pos + len;
280 |     body->last_buf = 1; // the only, and therefore last, buffer of the response
281 | 
282 |     ngx_chain_t chain;
283 |     chain.buf = body;
284 |     chain.next = NULL;
285 | 
286 |     r->headers_out.status = NGX_HTTP_OK;
287 |     r->headers_out.content_length_n = len;
288 |     r->headers_out.content_type.len = strlen(type);
289 |     r->headers_out.content_type.data = (u_char*) type;
290 | 
291 |     // Header failures and header-only requests end here; the body is not sent.
292 |     const ngx_int_t header_rc = ngx_http_send_header(r);
293 |     if (header_rc == NGX_ERROR || header_rc > NGX_OK || r->header_only) {
294 |         return header_rc;
295 |     }
296 | 
297 |     return ngx_http_output_filter(r, &chain);
298 | }
299 | 
300 | // Body-read callback for POST requests: URL-decodes the body, segments it and replies with the
301 | // words as text/plain. Every path finalizes the request.
302 | static void ngx_http_cppjieba_post_handler(ngx_http_request_t* r) {
303 |     // Fix: content_length_n may be negative (-1 when no Content-Length was parsed); testing only
304 |     // for 0 let that through, giving a zero-byte allocation and a huge copy length below.
305 |     if(r->headers_in.content_length_n <= 0) {
306 |         ngx_log_error(NGX_LOG_ERR, r->connection->log, 0, "r->headers_in.content_length_n <= 0");
307 |         ngx_http_finalize_request(r, NGX_ERROR);
308 |         return;
309 |     }
310 | 
311 |     const size_t body_len = (size_t) r->headers_in.content_length_n;
312 |     char * data_buf = (char*) ngx_pcalloc(r->pool, body_len + 1);
313 |     if (data_buf == NULL) {
314 |         ngx_http_finalize_request(r, NGX_ERROR);
315 |         return;
316 |     }
317 | 
318 |     if (NGX_ERROR == get_post_content(r, data_buf, body_len)) {
319 |         ngx_http_finalize_request(r, NGX_ERROR);
320 |         return;
321 |     }
322 | 
323 |     string sentence;
324 |     URLDecode(string(data_buf, body_len), sentence); // explicit length: a NUL byte no longer truncates the body
325 |     vector<string> words;
326 |     g_mix_segment->cut(sentence, words);
327 |     string response;
328 |     response << words;
329 | 
330 |     ngx_int_t rc = ngx_http_cppjieba_send_response(r, "text/plain", response.c_str(), response.size());
331 | 
332 |     // "%uz" is nginx's conversion for size_t; the former "%d" expected an int.
333 |     ngx_log_error(NGX_LOG_INFO, r->connection->log, 0, "[ngx_http_cppjieba_send_response] [response size:%uz]", response.size());
334 |     ngx_http_finalize_request(r, rc);
335 | }
336 | 


--------------------------------------------------------------------------------