├── .gitignore
├── README.md
├── doc
    └── qs_workflow.png
├── include
    ├── buildEngine.hpp
    ├── correctionEngine.h
    ├── dictionary.hpp
    ├── pinyinConvert.h
    ├── pinyinEngine.h
    ├── segment
    │   ├── darts.h
    │   ├── hash_table.hpp
    │   ├── kstring.hpp
    │   ├── line_reader.h
    │   ├── normalize.h
    │   ├── segment.h
    │   ├── segment_dict.h
    │   └── trd2simp.h
    ├── segmentWrapper.h
    ├── suggestion.hpp
    └── util
    │   ├── darts.h
    │   ├── mtrie.h
    │   ├── normalize.h
    │   ├── py_types.h
    │   ├── types.h
    │   ├── utf8.h
    │   └── utf8
    │       ├── checked.h
    │       ├── core.h
    │       └── unchecked.h
└── test
    ├── Makefile
    ├── t_build_unit.cc
    ├── t_dictionary_unit.cc
    ├── t_normalize_unit.cc
    ├── t_segment_unit.cc
    ├── t_suggestion_unit.cc
    └── unit_test.h


/.gitignore:
--------------------------------------------------------------------------------
1 | include/*.swp
2 | /resource/
3 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | ## Query Suggestion
 2 | ### 1. Introduction
 3 | This is a query auto-completion system that can be used in any search scenario. I found query suggestion to be a useful application in my projects, but there was 
 4 | little documentation or code available, so I want to rebuild this project, make it lighter and easier to use, and support different interfaces.
 5 | 
 6 | ### 2. System Framework
 7 | Query Suggestion here is a very simple version; I have not adopted many algorithms. Below is the system workflow.
 8 | ![SystemFramework](./doc/qs_workflow.png)
 9 | 
10 | As shown in the figure, the main processing of this project happens offline, where we generate prefix (pinyin, shengmu, Chinese word) maps. Online,
11 | we use the input query (maybe one word or one letter, not a complete query) as the key to look up the maps and recall its keyword list; finally we rank the candidate 
12 | keywords and return them. The advantages of the project are that it is **efficient and convenient**: you only need to prepare the corpus, which may be the query logs of a
13 | search engine, titles of products or news, or any other corpus. But the disadvantages are also obvious: it is not real-time and its coverage is incomplete, because if you cannot supply
14 | a big enough corpus, the suggestions you want may not be recommended. Anyway, we already use it in our E-commerce search and got better results.
15 | ### 2. Algorithms and Model
16 | 
17 | ### 3. Usage
18 | 
19 | ### 4. Roadmap
20 | - ~~1. Combine normalization module in include/segment/normalize.h and include/util/normalize.h~~
21 | - ~~2. Add more unit test for normalization module~~
22 | - ~~3. Design the old algorithm system framework.~~
23 | - 4 . Supply Python interface for old algorithm.
24 | - 5 . Collect algorithm papers and documents.
25 | - 6 . Choose one more efficient algorithm.
26 | 
27 | ### 5. Contact
28 | If you have any questions, suggestions or ideas, you can contact me at *jerryshi0110@gmail.com*
29 | 


--------------------------------------------------------------------------------
/doc/qs_workflow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/syw2014/query-suggestion/235ef793333c8b44911f1bc3e86a09d277e8441b/doc/qs_workflow.png


--------------------------------------------------------------------------------
/include/buildEngine.hpp:
--------------------------------------------------------------------------------
  1 | /*************************************************************************
  2 |  @ File Name: buildEngine.h
  3 |  @ Method: 
  4 |  @ Author: Jerry Shi
  5 |  @ Mail: jerryshi0110@gmail.com
  6 |  @ Created Time: Mon 11 Jul 2016 01:42:29 PM CST
  7 |  ************************************************************************/
  8 | // Data building module: data processing, tokenization and prefix generation
  9 | //
 10 | 
 11 | #ifndef MODULE_BUILD_ENGINE_HPP
 12 | #define MODULE_BUILD_ENGINE_HPP
 13 | 
 14 | #include <set>
 15 | #include <memory>
 16 | #include <string>
 17 | #include <fstream>
 18 | #include <sstream>
 19 | #include <iostream>
 20 | #include <algorithm>
 21 | 
 22 | #include <boost/filesystem.hpp>
 23 | #include <boost/lexical_cast.hpp>
 24 | #include <boost/concept_check.hpp>
 25 | #include <boost/unordered_map.hpp>
 26 | #include <boost/algorithm/string.hpp>
 27 | #include <boost/date_time/posix_time/posix_time.hpp>
 28 | 
 29 | // segment
 30 | #include "segmentWrapper.h"
 31 | #include "dictionary.hpp"
 32 | 
 33 | typedef std::vector<std::pair<uint32_t, double> > TermIDTFType;
 34 | typedef boost::unordered_map<std::string, TermIDTFType> KeyInfoType;
 35 | typedef boost::unordered_map<std::string, std::vector<uint32_t> > KeyTermIDsType;
 36 | typedef std::pair<double, uint32_t> TFResPairType;  // term frequency and reserve , now the reserve used as searching result number 
 37 | typedef std::vector<std::pair<std::string, TFResPairType> > TermInfoType;
 38 | 
 39 | // Build engine for data module build
 40 | // Provide  interfaces for data module building.
 41 | class BuildEngine {
 42 |     private:
 43 |         static const double prefix_w_ = 10000.0;       // different weight for different prefix type
 44 |         static const double pinyin_prefix_w_ = 1000.0; 
 45 |         static const double shengmu_w_ = 100.0; 
 46 |         static const double word_infix_w_ = 10.0;
 47 |         static const double word_suffix_w_ = 1.0;
 48 | 
 49 |         static const uint32_t topK_ = 15;     // default number of candidate words
 50 | 
 51 |         //static std::string res_dir_;           // resource directory, pinyin and token resource
 52 | 
 53 |         // main data structure
 54 |         std::auto_ptr<SegmentWrapper> segWrapper_;  // chinese string tokenizer
 55 |         std::auto_ptr<Dictionary> pySegDict_;       // pinyin tokenizer
 56 |         
 57 |         TermInfoType termsInfo_;          // total term set and it's corresponding infos(tf, numbers) generated from corpus
 58 |         std::vector<double> tf_;          // term frequency , it was corresponded to words in terms_
 59 |         KeyTermIDsType key_termIds_;      // key(prefix) to term ids map
 60 | 
 61 |         std::set<std::string> shm_ ;      // total sheng mu set
 62 |         public:
 63 | 
 64 |         static std::string res_dir_;           // resource directory, pinyin and token resource
 65 |         private:
 66 | 
 67 |         // extract sheng mu from pinyin
 68 |         // @pinyin: pinyin string vector
 69 |         // @shm: sheng mu extract form pinyin
 70 |         // eg: 行 hang, xing shengmu: h,x
 71 |         void GetShengMuByPinYin_(const std::vector<std::string>& pinyin
 72 |                                 ,std::vector<std::string>& shm) {
 73 |             shm.clear();
 74 |             if (pinyin.empty())
 75 |                 return;
 76 |             
 77 |             std::set<std::string> head;
 78 |             for (uint32_t i = 0; i < pinyin.size(); ++i) {
 79 |                 const std::string& py = pinyin[i];
 80 |                 if (py.empty())
 81 |                     continue;
 82 |                 else {
 83 |                     std::string ch;
 84 |                     ch += py[0]; // first char
 85 |                     //TODO, To be confirm this logistic
 86 |                     if ((char)py[0] >= 0) {
 87 |                         if (head.insert(ch).second)
 88 |                             shm.push_back(ch);
 89 |                     } else {
 90 |                         if (head.insert(py).second)
 91 |                             shm.push_back(py);
 92 |                     }
 93 |                 }
 94 |             }
 95 |         }
 96 | 
 97 |         // store elements and it's weight in vector into map
 98 |         // @keys: 
 99 |         // @key_info: key information map
100 |         // @termid: the index id of string which generated keys
101 |         // @weight: keys weight
102 |         void StoreInKeyMap(const std::vector<std::string>& keys
103 |                            ,KeyInfoType& key_info
104 |                            ,const uint32_t termid
105 |                            ,const double weight) {
106 |             if (keys.empty())
107 |                 return;
108 |             for (uint32_t idx = 0; idx < keys.size(); ++idx) {
109 |                 key_info[keys[idx]].push_back(std::make_pair(termid, weight));
110 |             }
111 |         }
112 | 
113 |     public:
114 |         BuildEngine() {
115 |             // resource directory check
116 |             if (!boost::filesystem::exists(res_dir_)) {
117 |                 std::cout << "resouce directory: " << res_dir_ << "is not exist!" << std::endl;
118 |                 std::cout << "Tips: resource directory may like: \"../resource/\"" << std::endl;
119 |                 return;
120 |             }
121 | 
122 |             // construct tokenizer
123 |             segWrapper_.reset(new SegmentWrapper(res_dir_+"dict"));
124 |             pySegDict_.reset(new Dictionary(res_dir_+"cn"));
125 | 
126 |             // load shengmu from files.
127 |             // b,p,m,f,d,t,n,l,g,k,j,q,x,zh,ch,sh,r,z,c,s,y,w
128 |             std::ifstream ifs((res_dir_+"cn/ShengMu.txt").c_str());
129 |             if (!ifs) {
130 |                 std::cout << "Open file " << (res_dir_+"cn/ShengMu.txt") << "failed!" << std::endl;
131 |                 return;
132 |             }
133 |             std::string line;
134 |             while (getline(ifs, line)) {
135 |                 boost::algorithm::trim(line);
136 |                 if (line.empty())
137 |                     continue;
138 |                 shm_.insert(line);
139 |             }
140 |             ifs.close();
141 |         }
142 | 
143 |         ~BuildEngine() {
144 |             termsInfo_.clear();
145 |             tf_.clear();
146 |             key_termIds_.clear();
147 |         }
148 | 
149 |         // get data building results
150 |         void GetDataModule(TermInfoType& termsInfo
151 |                            ,KeyTermIDsType& key_termids) {
152 |             termsInfo.clear();
153 |             key_termids.clear();
154 | 
155 |             termsInfo.swap(termsInfo_);
156 |             key_termids.swap(key_termIds_);
157 |         }
158 | 
159 |         // build data module from file
160 |         // Notes: 
161 |         // data structure in file must be like:
162 |         // term \t freq \t result_num
163 |         // the separator is tab
164 |         bool Build(const std::string& nm) {
165 |             termsInfo_.clear();
166 |             tf_.clear();
167 |             key_termIds_.clear();
168 |             
169 |             std::ifstream ifs(nm.c_str());
170 |             if(!ifs) {
171 |                 std::cout << "File " << nm << "open failed!" << std::endl;
172 |                 return false;
173 |             }
174 | 
175 |             std::cout << "Start building...\n";
176 |             std::string line;
177 |             std::map<std::string, TFResPairType> t_freqRes;  // term , tf, result_num
178 |             // extract term, it's freq and result number
179 |             while (getline(ifs, line)) {
180 |                 if (line.empty())
181 |                     continue;
182 |                 boost::algorithm::trim(line);
183 |                 boost::to_lower(line);
184 |                 std::size_t pos = line.find("\t");
185 |                 if (pos == std::string::npos)
186 |                     continue;
187 |                 std::vector<std::string> vec;
188 |                 std::string term;
189 |                 boost::algorithm::split(vec, line, boost::algorithm::is_any_of("\t"));
190 |                 // the data must contians term and it's frequency
191 |                 if (vec.size() < 2)
192 |                     continue;
193 |                 term = vec[0];
194 |                 double freq = 0.0;
195 |                 uint32_t result_num = 0;
196 |                 // result num or other meanings
197 |                 if (vec.size() != 3)
198 |                     result_num = 0;
199 |                 else {
200 |                     try {
201 |                         result_num = boost::lexical_cast<int>(vec[2]);
202 |                     } catch(...) {
203 |                          result_num = 0;
204 |                     }
205 |                 }
206 |                 // term frequency
207 |                 try {
208 |                     freq = boost::lexical_cast<double>(vec[1]);
209 |                 } catch(...) {
210 |                    // freq = 1.0;
211 |                     std::cout << "bad line in:" << nm << ":" << line << std::endl;
212 |                     continue;
213 |                 }
214 |                 t_freqRes[term] = std::make_pair(freq, result_num);
215 |             }
216 |             ifs.close();
217 |             //std::cout << "T: " << t_freq.size() << std::endl;
218 |             // step2, store term and it's freq
219 |             std::map<std::string, TFResPairType>::iterator it = t_freqRes.begin();
220 |             uint32_t size = t_freqRes.size();
221 |             termsInfo_.resize(size);
222 |             tf_.resize(size);
223 |             for (uint32_t idx = 0; it != t_freqRes.end() && idx < size; ++it, ++idx) {
224 |                 termsInfo_[idx] = std::make_pair(it->first, it->second); // store term info
225 |                 tf_[idx] = it->second.first; // tf
226 |             }
227 |             t_freqRes.clear();
228 |             //std::cout << "TT: " << terms_.size() << "\t " << tf_.size() << std::endl;
229 |             // step3, generate keys
230 |             KeyInfoType key_info;
231 |             for (uint32_t idx = 0; idx < termsInfo_.size(); ++idx) {
232 |                 Generate(termsInfo_[idx].first, idx, key_info);
233 |             }
234 | 
235 |             // step4 , compuate score for every key
236 |             KeyInfoType::iterator iter;
237 |             for (iter = key_info.begin(); iter != key_info.end(); ++iter) {
238 |                 std::vector<std::pair<uint32_t, double> >& info = iter->second;
239 |                 // score = tf * weight;
240 |                 for (uint32_t i = 0; i < info.size(); ++i) {
241 |                     if (info[i].first > tf_.size()) {
242 |                         info[i].second = 1.1;
243 |                         continue;
244 |                     }
245 |                     info[i].second = tf_[info[i].first] * info[i].second;
246 |                 }
247 |                 sort(info.begin(), info.end(), SORT<uint32_t, double>::sortDescendBySecond);
248 |                 // result deduplication
249 |                 std::vector<uint32_t> termsid;
250 |                 std::set<uint32_t> uniq_ids;
251 |                 for (uint32_t i = 0; i < info.size() && uniq_ids.size() <= 15; ++i) {
252 |                     if (uniq_ids.insert(info[i].first).second) {
253 |                         termsid.push_back(info[i].first);
254 |                     }
255 |                 }
256 |                 key_termIds_[iter->first].swap(termsid);
257 |                 uniq_ids.clear();
258 |             }
259 | 
260 |             std::cout << "key_termids size: " << key_termIds_.size() << std::endl;
261 |             std::cout << "Building completed!\n";
262 |         }
263 | 
264 |         // build from vector
265 |         bool Build(const std::vector<std::string>& termVec) {
266 |             return true;
267 |         }
268 | 
269 |         // store results to files
270 |         // @termFile: file to store all terms
271 |         // @keyFile: store all keys and term ids
272 |         bool Flush(const std::string& termFile, const std::string& keyFile) {
273 |             std::ofstream ofs_term(termFile.c_str());
274 |             if (!ofs_term) {
275 |                 std::cout << "open file " << termFile << "failed!\n";
276 |                 return false;
277 |             }
278 |             std::ofstream ofs_key(keyFile.c_str());
279 |             if (!ofs_key) {
280 |                 std::cout << "open file " << keyFile << "failed!\n";
281 |                 return false;
282 |             }
283 |             // store terms
284 |             for (uint32_t i = 0; i < termsInfo_.size(); ++i) {
285 |                 ofs_term << i << "\t" << termsInfo_[i].first << "\t"
286 |                   << termsInfo_[i].second.first << "\t" 
287 |                   << termsInfo_[i].second.second << "\n";
288 |             }
289 |             ofs_term.close();
290 | 
291 |             // store keys
292 |             KeyTermIDsType::iterator iter;
293 |             for (iter = key_termIds_.begin(); iter != key_termIds_.end(); ++iter) {
294 |                 std::vector<uint32_t>& ids = iter->second;
295 |                 // candidate is itself
296 |                 // do not suggestion itself
297 |                 if (ids.size() == 1 && ids[0] < termsInfo_.size() && termsInfo_[ids[0]].first == iter->first)
298 |                     continue;
299 |                 ofs_key << iter->first; // key
300 |                 for (uint32_t i = 0; i < ids.size(); ++i) {
301 |                     // make sure id is in the range of term vector
302 |                     if (ids[i] > termsInfo_.size())
303 |                         continue;
304 |                     // do not suggest itself
305 |                     if (iter->first == termsInfo_[ids[i]].first)
306 |                         continue;
307 |                     ofs_key << "\t" << termsInfo_[ids[i]].first;
308 |                 }
309 |                 ofs_key << "\n";
310 |             }
311 |             ofs_key.close();
312 | 
313 |             return true;
314 |         }
315 | 
316 |         // parse string into chars and words
317 |         // @chars: single unicode 
318 |         // @words: string segmentation results
319 |         bool Parse(const std::string& str
320 |                     ,std::vector<std::string>& chars
321 |                     ,std::vector<std::string>& words) {
322 |             if (str.empty())
323 |                 return false;
324 | 
325 |             // extract chars
326 |             chars.clear();
327 |             std::vector<UnicodeType> unicodes;
328 |             if (Normalize::ToUnicode(str, unicodes)) {
329 |                 //if(unicodes.empty())
330 |                 //    return false;
331 |                 chars.resize(unicodes.size());
332 |                 for (uint32_t i = 0; i < unicodes.size(); ++i) {
333 |                     std::string unicode;
334 |                     Normalize::UnicodeToUTF8Str(unicodes[i], unicode);
335 |                     chars[i] = unicode;
336 |                 }
337 |             } 
338 | 
339 |             // extract words
340 |             words.clear();
341 |             segWrapper_->segment(str, words, false);
342 | 
343 |             return true;
344 |         }
345 | 
346 |         // generate prefix
347 |         // @str: segment tokens
348 |         // @termid: token index
349 |         // @key_info: index key, it's corresponding term id and tf
350 |         // @num: number of keys
351 |         bool Generate(const std::string& str
352 |                       ,const uint32_t termid
353 |                       ,KeyInfoType& key_info
354 |                       ,const uint32_t num = 10) {
355 |             if (str.empty())
356 |                 return false;
357 |             
358 |             std::vector<std::string> chars;
359 |             std::vector<std::string> words;
360 | 
361 |             Parse(str, chars, words);
362 | 
363 |             std::vector<std::string> keys;
364 | 
365 |             // Start generation
366 |             GenerateByPrefix(chars, keys, num);
367 |             StoreInKeyMap(keys, key_info, termid, prefix_w_);
368 |             
369 |             GenerateByPinYinPrefix(str, keys, num);
370 |             StoreInKeyMap(keys, key_info, termid, pinyin_prefix_w_);
371 | 
372 |             GenerateByShengMuPrefix(str, keys, num);
373 |             StoreInKeyMap(keys, key_info, termid, shengmu_w_);
374 | 
375 |             GenerateByWordInfix(words, keys, num);
376 |             StoreInKeyMap(keys, key_info, termid, word_infix_w_);
377 | 
378 |             GenerateByWordSuffix(words, keys, num);
379 |             StoreInKeyMap(keys, key_info, termid, word_suffix_w_);
380 | 
381 |             return true;
382 |         }
383 | 
384 |         // generate key by prefix
385 |         // @chars: unicodes for input string
386 |         // @keys: index key generated based on every unicode
387 |         // @num: the number of unicodes chosen to be stored in keys 
388 |         bool GenerateByPrefix(const std::vector<std::string>& chars
389 |                               ,std::vector<std::string>& keys
390 |                               ,const uint32_t num ) {
391 |             keys.clear();
392 |             std::string prefix("");
393 |             for (uint32_t i = 0; i < chars.size() && i < num; ++i) {
394 |                 prefix += chars[i];
395 |                 keys.push_back(prefix);
396 |             }
397 | 
398 |             return true;
399 |         }
400 | 
401 |         // generate key by word infix,
402 |         // @words: segment tokens
403 |         // @keys: index key generated based on infix in token terms, if the token number < 3
404 |         //        then return directly, as there are only two chinese characters in string, no infix.
405 |         // @num: the number of words chosen to be processed.
406 |         bool GenerateByWordInfix(const std::vector<std::string>& words
407 |                                  ,std::vector<std::string>& keys
408 |                                  ,const uint32_t num) {
409 |             keys.clear();
410 |             uint32_t size = words.size();
411 |             if (size < 3)
412 |                 return false;
413 | 
414 |             for (uint32_t i = 0; i < size - 1 && i < num; ++i )
415 |                 keys.push_back(words[i]);
416 | 
417 |             return true;
418 |         }
419 | 
420 |         // generate key by last term(suffix)
421 |         // @workds: segment tokens
422 |         // @keys: index keys 
423 |         // @num: the number of words chosen to be processed
424 |         bool GenerateByWordSuffix(const std::vector<std::string>& words
425 |                                   ,std::vector<std::string>& keys
426 |                                   ,const uint32_t num) {
427 |             keys.clear();
428 |             if (words.size() > 1)
429 |                 keys.push_back(words.back());
430 |             else
431 |                 return false;
432 | 
433 |             return true;
434 |         }
435 | 
436 |         // generate key by pinyin prefix
437 |         // @str: input string, can be any combination
438 |         // @keys: index key generated based on pinyin prefix which generated by Dictionary::GetPinYin()
439 |         // @num: the length of prefix 
440 |         bool GenerateByPinYinPrefix(const std::string& str
441 |                                     ,std::vector<std::string>& keys
442 |                                     ,const uint32_t num) {
443 |             keys.clear();
444 |             if (str.empty())
445 |                 return false;
446 |             // get pinyin from pinyin module
447 |             std::vector<std::string> pinyin;
448 |             pySegDict_->GetPinYin(str, pinyin);
449 |             
450 |             // extract prefixes from pinyin string
451 |             std::set<std::string> prefix;
452 |             for (uint32_t i = 0; i < pinyin.size(); ++i) {
453 |                 std::string pre("");
454 |                 // process every pinyin
455 |                 for (uint32_t j = 0; j < pinyin[i].size(); ++j) {
456 |                     pre += pinyin[i][j]; // each letter
457 |                     if (prefix.insert(pre).second && pre.length() < num)
458 |                         keys.push_back(pre);
459 |                 }
460 |             }
461 |             
462 |             return true;
463 |         }
464 | 
465 |         // generate key by sheng mu prefix
466 |         // @str: input string
467 |         // @keys: index key generated based on sheng mu prefix of pinyin
468 |         // @len:
469 |         // eg: 银行          ShengMu
470 |         //     yin       ->   y
471 |         //     hang xing ->  y x(polyphone)
472 |         //     result: y, yx
473 |         bool GenerateByShengMuPrefix(const std::string& str
474 |                                      ,std::vector<std::string>& keys
475 |                                      ,const uint32_t len) {
476 |             keys.clear();
477 |             if (str.empty())
478 |                 return false;
479 | 
480 |             std::vector<std::vector<std::string> > shm_list;
481 |             // convert string to unicodes
482 |             std::vector<UCS2Char> unicodes;
483 |             Normalize::ToUnicode(str, unicodes);
484 |             if (unicodes.empty())
485 |                 return false;
486 |             for (uint32_t i = 0; i < unicodes.size(); ++i) {
487 |                 // get one letter
488 |                 UCS2Char uchar = unicodes[i];
489 |                 std::string ustr;
490 |                 Normalize::UnicodeToUTF8Str(uchar, ustr);
491 |                 // type inditification
492 |                 // space
493 |                 if (ustr == " ") {
494 |                     shm_list.push_back(std::vector<std::string>(1, " ")); // insert a space
495 |                 } else {
496 |                     // is chinese char, 
497 |                     if (Normalize::IsChinese(uchar)) {
498 |                         std::vector<std::string> shm;
499 |                         std::vector<std::string> pinyin;
500 |                        // std::cout < "T -> pinyin: " << ustr << std::endl;
501 |                         pySegDict_->GetPinYin(ustr, pinyin); // extract pinyin
502 |                         GetShengMuByPinYin_(pinyin, shm);    // extract sheng mu
503 |                         shm_list.push_back(std::vector<std::string>());
504 |                         shm_list.back().swap(shm);
505 |                     } else {
506 |                         // is alphabet / digital
507 |                         shm_list.push_back(std::vector<std::string>(1,ustr));
508 |                     }
509 |                 }
510 |             }
511 |             // get all shengmu combination includes polyphone
512 |             if (shm_list.size() < 1)
513 |                 return false;
514 |             std::vector<std::string> shengmus = shm_list[0]; // pinyin of the first word
515 |             for (uint32_t idx = 1; idx < shm_list.size(); ++idx) {
516 |                 std::vector<std::string>& current_shm = shm_list[idx];
517 |                 if (current_shm.size() == 1) { // no polyphone
518 |                     for (uint32_t id = 0; id < shengmus.size(); ++id ) {
519 |                         shengmus[id] += current_shm[0];
520 |                     }
521 |                 } else { // with polyphone
522 |                     std::vector<std::string> pre_shm;
523 |                     pre_shm.swap(shengmus);
524 |                     // combine the previous and current
525 |                     for (uint32_t pre_idx = 0; pre_idx < pre_shm.size(); ++pre_idx) {
526 |                         for (uint32_t cur_idx = 0; cur_idx < current_shm.size(); ++cur_idx) {
527 |                             shengmus.push_back( pre_shm[pre_idx] + current_shm[cur_idx] );
528 |                         }
529 |                     }
530 |                 }
531 |             }
532 | 
533 |             // get all shengmu prefix remove the duplication
534 |             std::set<std::string> prefixes;
535 |             for (uint32_t idx = 0; idx < shengmus.size(); ++idx) {
536 |                 std::string prefix;
537 |                 for (uint32_t py_id = 0; py_id < shengmus[idx].size(); ++py_id) {
538 |                     prefix += shengmus[idx][py_id];
539 |                     if (prefixes.insert(prefix).second && prefix.length() < len)
540 |                         keys.push_back(prefix);
541 |                 }
542 |             }
543 | 
544 |             return true;
545 |         }
546 | };
547 | 
548 | #endif // buildEngine.hpp
549 | 


--------------------------------------------------------------------------------
/include/correctionEngine.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  @ File Name: correctionEngine.h
 3 |  @ Method: 
 4 |  @ Author: Jerry Shi
 5 |  @ Mail: jerryshi0110@gmail.com
 6 |  @ Created Time: Wed 13 Jul 2016 05:04:48 PM CST
 7 |  ************************************************************************/
 8 | // Correction engine contains pinyin segmentation, pinyin and Chinese
 9 | // character interconversion, pinyin correction, English word correction,
10 | // Chinese word correction, etc.
11 | 
12 | #ifndef CORRECTION_ENGINE_H
13 | #define CORRECTION_ENGINE_H
14 | 
15 | #include <iostream>
16 | #include ""
17 | 
18 | 
19 | 
20 | #endif // correctionEngine.h
21 | 


--------------------------------------------------------------------------------
/include/dictionary.hpp:
--------------------------------------------------------------------------------
  1 | /*************************************************************************
  2 |  @ File Name: dictionary.hpp
  3 |  @ Method: 
  4 |  @ Author: Jerry Shi
  5 |  @ Mail: jerryshi0110@gmail.com
  6 |  @ Created Time: Wed 13 Jul 2016 05:41:24 PM CST
  7 |  ************************************************************************/
  8 | // Trie struct for quick find, pinyin tokenizer, pinyin -> chinese character
  9 | // chinese -> pinyin, fuzzy pinyin map, pinyin filter map.
 10 | 
 11 | #ifndef DICTIONARY_HPP
 12 | #define DICTIONARY_HPP
 13 | 
 14 | #include <iostream>
 15 | #include <fstream>
 16 | 
 17 | #include "util/normalize.h"
 18 | #include "util/darts.h"
 19 | 
 20 | #include <boost/lexical_cast.hpp>
 21 | #include <boost/unordered_map.hpp>
 22 | #include <boost/algorithm/string.hpp>
 23 | #include <boost/algorithm/string/trim.hpp>
 24 | #include <boost/algorithm/string/split.hpp>
 25 | #include <boost/shared_ptr.hpp>
 26 | 
 27 | typedef Darts::DoubleArray Trie;
 28 | typedef uint16_t UCS2Char;
 29 | typedef boost::unordered_map<UCS2Char, std::vector<std::string> > Cn2PinYinType; // chinese word to pinyin list
 30 | typedef boost::unordered_map<std::string, std::vector<UCS2Char> > PinYin2CnType; // pinyin to word list
 31 | 
 32 | 
 33 | // Class Dictionary is built on a trie. Usage of this class:
 34 | // Dictionary::Segment(): given a string, segment it into pinyin tokens, e.g. "yinhang" -> tokens 'yin', 'hang';
 35 | // Dictionary::GetChar(): given a pinyin, return its Chinese character list
 36 | // Dictionary::GetPinYin(): given a Chinese char, return its pinyin list; polyphones are not distinguished.
 37 | class Dictionary
 38 | {
 39 |     private:
 40 |         Trie trie_; // trie tree
 41 |         Cn2PinYinType cn2pinyin_;         // chinese character -> pinyin list
 42 |         PinYin2CnType pinyin2cn_;         // pinyin -> chinese character list
 43 |         std::vector<std::string> pinyin_; // all single pinyin        
 44 |         boost::unordered_map<std::string, bool> filter_pinyin_; // filter pinyin
 45 |         
 46 |         // load pinyin and chinese character from file
 47 |         // @dir: the resource path
 48 |         void Load_(const std::string& dir) {
 49 |             // Reads "<dir>/pinyin.txt"; a usable line is "<character> <pinyin>" split by
 50 |             // exactly one space. Fills the pinyin/character maps and appends to pinyin_.
 51 |             // TODO: quick load
 52 |             /* int32_t flag = 1; // to check if the bin file opened successfully.
 53 |             try {
 54 |                 if ((flag = trie_.open((dir+"/pinyin.bin").c_str())) == 0) {
 55 |                 }
 56 |             } catch(...) {  // do not throw exception
 57 |             } */
 58 |             const std::string path = dir + "/pinyin.txt";
 59 |             std::ifstream ifs(path.c_str());
 60 |             if (!ifs.is_open()) {
 61 |                 std::cout << "Open " << path << " failed!\n";
 62 |                 return;
 63 |             }
 64 |             std::string line;
 65 |             std::set<std::string> pinyinSet; // de-duplicates the pinyin seen in the file
 66 |             while (std::getline(ifs, line)) {
 67 |                 boost::algorithm::trim(line);
 68 |                 std::vector<std::string> vec;
 69 |                 boost::algorithm::split(vec, line, boost::is_any_of(" "));
 70 |                 if (vec.size() != 2)
 71 |                     continue; // not exactly two fields: skip the line
 72 |                 std::string cnChar = vec[0];
 73 |                 std::string pinyin = vec[1].substr(0, vec[1].length()-1); // drops the last char (presumably a tone mark — confirm)
 74 |                 Normalize::ToUTF8(cnChar);
 75 |                 // filter pinyin: filtered syllables stay out of the maps but are still recorded in pinyinSet
 76 |                 if (filter_pinyin_.find(pinyin) == filter_pinyin_.end())
 77 |                     AddPinYinCnMap(pinyin, cnChar);
 78 |                 pinyinSet.insert(pinyin);
 79 |             }
 80 |             pinyin_.insert(pinyin_.end(), pinyinSet.begin(), pinyinSet.end());
 81 |             std::cout << "Resouces loaded pinyin size: " << pinyin_.size() << std::endl; 
 82 |         }
 83 | 
 84 |         // Merge 
 85 |         void Merge_( std::vector<std::string>& tks) {
 86 |             // Glue runs of adjacent "fragment" tokens back into one token, in place.
 87 |             // A token is a fragment when it contains a digit or is a single
 88 |             // alphabetic byte, as judged by Normalize::IsDigit / IsAlpha.
 89 |             if (tks.empty())
 90 |                 return; // nothing to merge; also keeps the size()-based indices below valid
 91 | 
 92 |             const std::size_t count = tks.size();
 93 |             std::vector<uint32_t> flags(count, 0);
 94 |             for (std::size_t i = 0; i < count; ++i) {
 95 |                 const std::string& tk = tks[i];
 96 |                 for (std::size_t j = 0; j < tk.length(); ++j) {
 97 |                     if (Normalize::IsDigit(tk[j])
 98 |                         || (Normalize::IsAlpha(tk[j]) && tk.length() < 2)) {
 99 |                         flags[i] = 1;
100 |                         break;
101 |                     }
102 |                 }
103 |             }
104 | 
105 |             // Rule1, eg: a, b, cd, e
106 |             // a token sitting between two fragments is treated as a fragment too,
107 |             // so "abcde" ends up as one word. The update is in place, so a flag
108 |             // set here is visible when the next index is examined.
109 |             for (std::size_t i = 1; i + 1 < count; ++i) {
110 |                 if (flags[i-1] == 1 && flags[i+1] == 1)
111 |                     flags[i] = 1;
112 |             }
113 | 
114 |             // Rule2, eg: a,p,p,le
115 |             // if the penultimate token is a fragment, the last token joins it.
116 |             if (count >= 2 && flags[count-2] == 1)
117 |                 flags[count-1] = 1;
118 | 
119 |             // merge: every maximal run of flagged tokens becomes a single token
120 |             std::vector<std::string> merged;
121 |             merged.reserve(count);
122 |             bool in_run = false; // true while the previous token was flagged
123 |             for (std::size_t i = 0; i < count; ++i) {
124 |                 if (flags[i] && in_run)
125 |                     merged.back() += tks[i];
126 |                 else
127 |                     merged.push_back(tks[i]);
128 |                 in_run = (flags[i] != 0);
129 |             }
130 |             tks.swap(merged);
131 |         }
132 | 
133 |         // Clean, remove punctuation string
134 |         void Clean_(std::vector<std::string>& tks) {
135 |             typedef std::vector<std::string>::iterator TokenIter;
136 |             for (TokenIter tk = tks.begin(); tk != tks.end(); ++tk) {
137 |                 boost::algorithm::trim(*tk);
138 |                 // a token whose first byte is punctuation is blanked; its slot stays in the vector
139 |                 if (!tk->empty() && Normalize::IsPunctuation((*tk)[0]))
140 |                     tk->clear();
141 |             }
142 |         }
143 |     public:
144 |         Dictionary(const std::string& dir) {
145 |             // Containers start empty; Init() runs first because Load_ consults filter_pinyin_.
146 |             filter_pinyin_.clear();
147 |             pinyin_.clear();
148 |             pinyin2cn_.clear();
149 |             cn2pinyin_.clear();
150 |             Init();
151 |             LoadResource(dir);
152 |         }
153 |         ~Dictionary() { // no explicit cleanup: every member is a value-type container
154 |         }
155 | 
156 |         void Init() {
157 |             // Registers the syllables that Load_ keeps out of the pinyin<->character maps.
158 |             // add other pinyin (currently disabled)
159 |   /*          pinyin_.push_back("chon");
160 |             pinyin_.push_back("con");
161 |             pinyin_.push_back("don");
162 |             pinyin_.push_back("gon");
163 |             pinyin_.push_back("hon");
164 |             pinyin_.push_back("jion");
165 |             pinyin_.push_back("kon");
166 |             pinyin_.push_back("lon");
167 |             pinyin_.push_back("non");
168 |             pinyin_.push_back("qion");
169 |             pinyin_.push_back("ron");
170 |             pinyin_.push_back("son");
171 |             pinyin_.push_back("ton");
172 |             pinyin_.push_back("xion");
173 |             pinyin_.push_back("yon");
174 |             pinyin_.push_back("zhon");
175 |             pinyin_.push_back("zon");
176 | */
177 |             // add filter pinyin
178 |             static const char* const kFiltered[] = { "n", "ng", "m", "o" };
179 |             const std::size_t total = sizeof(kFiltered) / sizeof(kFiltered[0]);
180 |             for (std::size_t i = 0; i < total; ++i)
181 |                 filter_pinyin_.insert(std::make_pair(std::string(kFiltered[i]), true));
182 |         }
183 |         // load resource from file
184 |         void LoadResource(const std::string& dir) {
185 |             if (dir.empty()) {
186 |                 std::cout << "directory is not exists!\n";
187 |                 return;
188 |             }
189 |             // load pinyin from file; stop when nothing was read, because
190 |             // &keys[0] below is invalid on an empty vector
191 |             Load_(dir);
192 |             if (pinyin_.empty())
193 |                 return;
194 | 
195 |             // build trie: key i is pinyin_[i] and the value stored for it is the index i
196 |             typedef Darts::DoubleArray::value_type value_type;
197 |             const std::size_t SIZE = pinyin_.size();
198 |             std::vector<const char*> keys(SIZE);
199 |             std::vector<std::size_t> lengths(SIZE);
200 |             std::vector<value_type> states(SIZE);
201 |             for (std::size_t i = 0; i < SIZE; ++i) {
202 |                 keys[i] = pinyin_[i].c_str();
203 |                 lengths[i] = pinyin_[i].length();
204 |                 states[i] = static_cast<value_type>(i);
205 |             }
206 |             trie_.build(keys.size(), &keys[0], &lengths[0], &states[0]);
207 |             // TODO: save bin file
208 |         }
209 | 
210 |         // Add pinyin and chinese char into map
211 |         void AddPinYinCnMap(const std::string& pinyin, const std::string& cnChar) {
212 |             // Records the pair in both directions: cn2pinyin_[character] gains pinyin
213 |             // and pinyin2cn_[pinyin] gains the character. Lists keep insertion order
214 |             // and never hold duplicates.
215 |             // @pinyin: pinyin string, stored as-is
216 |             // @cnChar: UTF-8 text; only its first UTF-16 code unit is used
217 |             // NOTE(review): utf8::utf8to16 looks like the checked utf8-cpp variant,
218 |             // which presumably throws on malformed UTF-8 — confirm callers expect that.
219 | 
220 |             // decode to UTF-16 code units
221 |             std::vector<UCS2Char> cnChars;
222 |             utf8::utf8to16(cnChar.begin(), cnChar.end(), std::back_inserter(cnChars));
223 |             if (cnChars.empty())
224 |                 return; // empty input: there is no cnChars[0] to index
225 |             const UCS2Char cn = cnChars[0];
226 | 
227 |             // Add cnChar
228 |             // operator[] creates an empty list the first time a key is seen
229 |             std::vector<std::string>& pinyin_list = cn2pinyin_[cn];
230 |             // skip the append when this pinyin is already listed
231 |             if (std::find(pinyin_list.begin(), pinyin_list.end(), pinyin)
232 |                     == pinyin_list.end()) {
233 |                 pinyin_list.push_back(pinyin);
234 |             }
235 | 
236 |             // Add pinyin
237 |             std::vector<UCS2Char>& cnChar_list = pinyin2cn_[pinyin];
238 |             // skip the append when this character is already listed
239 |             if (std::find(cnChar_list.begin(), cnChar_list.end(), cn)
240 |                     == cnChar_list.end()) {
241 |                 cnChar_list.push_back(cn);
242 |             }
243 |         }
244 | 
245 |         // pinyin tokenizer
246 |         void Segment(const std::string& pinyin, std::vector<std::string>& result) {
247 |             result.clear();        // empty input yields an empty result rather than stale tokens
248 |             if (pinyin.empty()) return;
249 |             Fmm(pinyin, result);   // split by longest trie match
250 |             Merge_(result);        // glue fragment tokens back together
251 |             Clean_(result);        // blank tokens that start with punctuation
252 |         }
253 | 
254 |         // get chinese character based on pinyin string
255 |         bool GetChar(const std::string& pinyin, std::vector<std::string>& result) {
256 |             // @pinyin: exact key into pinyin2cn_
257 |             // @result: receives one UTF-8 string per character; empty on a miss
258 |             // returns true when the pinyin is known, false otherwise
259 |             result.clear();
260 |             PinYin2CnType::const_iterator cnIter = pinyin2cn_.find(pinyin);
261 |             if (cnIter == pinyin2cn_.end())
262 |                 return false;
263 | 
264 |             // read the stored list through a reference instead of copying it
265 |             const std::vector<UCS2Char>& cnChars = cnIter->second;
266 |             result.resize(cnChars.size());
267 |             for (std::size_t i = 0; i < cnChars.size(); ++i) {
268 |                 // each slot is a fresh empty string for the converter to fill
269 |                 Normalize::UnicodeToUTF8Str(cnChars[i], result[i]);
270 |             }
271 |             return true;
272 |         }
273 | 
274 |         // get pinyin list based on chinese character string
275 |         bool GetPinYin(const std::string& cnChar, std::vector<std::string>& result) {
276 |             result.clear();
277 |             const bool has_input = !cnChar.empty();
278 |             if (has_input) {
279 |                 // decode into code units, then expand every pinyin combination
280 |                 std::vector<UCS2Char> cnChars;
281 |                 Normalize::ToUnicode(cnChar, cnChars);
282 |                 GetPinYin_(cnChars, "", result);
283 |             }
284 |             return has_input; // true for any non-empty input, false otherwise
285 |         }
286 | 
287 |         // input chinese character and pinyin combination and get the pinyin
288 |         // recursive function
289 |         void GetPinYin_(const std::vector<UCS2Char>& cnChars,const std::string& mid_result
290 |                         ,std::vector<std::string>& result_list) {
291 |             // @cnChars: code units still to convert
292 |             // @mid_result: text produced for the units consumed so far
293 |             // @result_list: finished strings are appended; nothing is added once it holds 1024
294 |             if (result_list.size() >= 1024)
295 |                 return;
296 | 
297 |             // nothing left: the accumulated text is a finished result
298 |             if (cnChars.empty()) {
299 |                 result_list.push_back(mid_result);
300 |                 return;
301 |             }
302 | 
303 |             const UCS2Char head = cnChars[0];
304 |             if (!Normalize::IsChinese(head)) {
305 |                 // non-Chinese unit: copy it through unchanged and keep going
306 |                 std::vector<UCS2Char> remain(cnChars.begin()+1, cnChars.end());
307 |                 std::string tmp("");
308 |                 Normalize::UnicodeToUTF8Str(head, tmp);
309 |                 GetPinYin_(remain, mid_result + tmp, result_list);
310 |                 return;
311 |             }
312 | 
313 |             std::vector<std::string> pinyin_term_list;
314 |             if (!GetPinYinTerm(head, pinyin_term_list)) {
315 |                 // Chinese unit without a pinyin entry: emit what we have; the units
316 |                 // after it are not processed
317 |                 result_list.push_back(mid_result);
318 |                 return;
319 |             }
320 |             // branch once per pinyin of this character
321 |             std::vector<UCS2Char> remain(cnChars.begin()+1, cnChars.end());
322 |             for (std::size_t i = 0; i < pinyin_term_list.size(); ++i)
323 |                 GetPinYin_(remain, mid_result + pinyin_term_list[i], result_list);
324 |         }
325 |         
326 |         // get pinyin list from chinese character map
327 |         bool GetPinYinTerm(const UCS2Char& cnChar, std::vector<std::string>& result) {
328 |             // result is overwritten on a hit and left untouched on a miss
329 |             const Cn2PinYinType::iterator hit = cn2pinyin_.find(cnChar);
330 |             const bool found = (hit != cn2pinyin_.end());
331 |             if (found) {
332 |                 result = hit->second;
333 |             }
334 |             return found;
335 |         }
336 | 
337 |         // Reload
338 |         // get pinyin list from chinese character map
339 |         bool GetPinYinTerm(const std::string& cnChar, std::vector<std::string>& result) {
340 |             // @cnChar: UTF-8 text; only its first UTF-16 code unit is looked up
341 |             // @result: overwritten with the pinyin list on a hit, untouched otherwise
342 |             std::vector<UCS2Char> cnChars;
343 |             utf8::utf8to16(cnChar.begin(), cnChar.end(), std::back_inserter(cnChars));
344 |             if (cnChars.empty()) {
345 |                 return false; // empty input: there is no cnChars[0] to look up
346 |             }
347 |             // same map lookup as the UCS2Char overload
348 |             return GetPinYinTerm(cnChars[0], result);
349 |         }
350 | 
351 |         // maximum match
352 |         void Fmm(const std::string& line, std::vector<std::string>& r) {
353 |             r.clear();
354 |             std::string uline(line); // working copy; the caller's string is not modified
355 |             std::vector<uint32_t> lens, cumu_lens; // per-code-point byte length / running byte total
356 | 
357 |             // remove invalid encoding, then record the byte length of every code point
358 |             Normalize::RemoveInvalidUTF8(uline);
359 |             std::string::iterator it = uline.begin();
360 |             while (it != uline.end()) {
361 |                 uint32_t code = utf8::next(it, uline.end());
362 |                 std::string _str; // re-encoded code point, used only for its byte length
363 |                 utf8::append(code, std::back_inserter(_str));
364 |                 lens.push_back(_str.length());
365 |                 if (cumu_lens.size() > 0) {
366 |                     cumu_lens.push_back(_str.length()+cumu_lens.back());
367 |                 } else {
368 |                     cumu_lens.push_back(_str.length());
369 |                 }
370 |             }
371 | 
372 |             // start maximum match: at each position keep the longest span the trie reports as a key
373 |             std::size_t key_pos = 0;
374 |             for (std::size_t j = 0; j < lens.size(); ++j) {
375 |                 std::size_t last_j = j, jj = j; // jj: first code point of this token; last_j: last one matched
376 |                 Trie::value_type last_state = -1; // value of the longest key matched so far; stays -1 if none
377 |                 Trie::value_type state;
378 |                 std::size_t node_pos = 0;
379 | 
380 |                 // extend one code point at a time; -2 ends the walk (presumably Darts' "no such path" — confirm)
381 |                 while (j < lens.size()
382 |                     && (state=trie_.traverse(uline.c_str(), node_pos, key_pos, cumu_lens[j])) != -2) {
383 |                     j++;
384 |                     if (state < 0)
385 |                         continue; // negative but not -2: keep walking without recording a match
386 |                     last_state = state;
387 |                     last_j = j -1;
388 |                 }
389 | 
390 |                 // found: emit the bytes from the start of code point jj through the end of code point last_j
391 |                 if (last_state >=0) {
392 |                     std::string py;
393 |                     if ((uint32_t)last_state < pinyin_.size()) {
394 |                         py = std::string(uline.c_str()+cumu_lens[jj]-lens[jj],uline.c_str()+cumu_lens[last_j]);
395 |                         r.push_back(py);
396 |                     }
397 |                 } else { // no key matched here: emit the single code point jj as its own token
398 |                     std::string py;
399 |                     py = std::string(uline.c_str()+cumu_lens[jj]-lens[jj], uline.c_str()+cumu_lens[jj]);
400 |                     r.push_back(py);
401 |                 }
402 |                 j = last_j; // resume after the emitted span (the for loop adds 1)
403 |                 key_pos = cumu_lens[j]; // byte offset where the next traversal starts
404 |             }
405 |         }
406 | 
407 | };
408 | 
409 | #endif // dictionary.hpp
410 | 


--------------------------------------------------------------------------------
/include/pinyinConvert.h:
--------------------------------------------------------------------------------
  1 | /*************************************************************************
  2 |  @ File Name: pinyinConvert.h
  3 |  @ Method: 
  4 |  @ Author: Jerry Shi
  5 |  @ Mail: jerryshi0110@gmail.com
  6 |  @ Created Time: Wed 09 Mar 2016 10:30:45 AM CST
  7 |  ************************************************************************/
  8 | #ifndef PINYINCONVERT_H
  9 | #define PINYINCONVERT_H
 10 | 
 11 | #include <iostream>
 12 | #include <boost/unordered_map.hpp>
 13 | #include <boost/tuple/tuple.hpp>
 14 | #include "mtrie.h"
 15 | #include "py_types.h"
 16 | 
 17 | class PinYinConvert
 18 | {
 19 |    // Pinyin <-> Chinese conversion interface; this header holds declarations only.
 20 |    typedef MTrie<std::string, uint32_t> PinyinDictType;
 21 |    typedef MTrie<std::string, uint32_t, std::string> FuzzyDictType;
 22 |    typedef boost::unordered_map<izenelib::util::UCS2Char, std::vector<std::string> > Cn2PinyinType;
 23 |    typedef boost::unordered_map<std::string, std::vector<izenelib::util::UCS2Char> > Pinyin2CnType;
 24 | 
 25 |    typedef boost::tuple<uint32_t, uint32_t, izenelib::util::UString> QueryLogType;
 26 |    typedef std::pair<uint32_t, izenelib::util::UString> PropertyLabelType;
 27 |    struct TransProbType
 28 |    {
 29 |        boost::unordered_map<Unigram, double> u_trans_prob_;
 30 |        boost::unordered_map<Bigram, double> b_trans_prob_;
 31 |        boost::unordered_map<Trigram, double> t_trans_prob_;
 32 |        void clear()
 33 |        {
 34 |            u_trans_prob_.clear();
 35 |            b_trans_prob_.clear();
 36 |            t_trans_prob_.clear();
 37 |        }
 38 |        bool empty() const
 39 |        {
 40 |            return u_trans_prob_.empty() && b_trans_prob_.empty() && t_trans_prob_.empty();
 41 |        }
 42 |    };
 43 | 
 44 | public:
 45 |    explicit PinYinConvert(const std::string& collection_dir = "");
 46 |    
 47 | 
 48 |    // load resource
 49 |    bool load();
 50 |    
 51 |    bool getResult(const izenelib::util::UString& input,
 52 |         std::vector<std::pair<double, std::string> >& pinyin_list,
 53 |         std::vector<izenelib::util::UString>& output);
 54 | 
 55 |    void AddPinyinMap(const std::string& pinyin, const izenelib::util::UCS2Char& cn_char);
 56 | 
 57 |    void getPinyin(const izenelib::util::UString& cn_chars, std::vector<std::string>& result_list);
 58 | 
 59 |    void getChar(const std::string& pinyin, std::vector<std::string>& result_list);
 60 | 
 61 |    void getRelativeList(const izenelib::util::UString& hanzi, 
 62 |             std::vector<std::pair<izenelib::util::UString,uint32_t> >& ResultList);
 63 | 
 64 |    static std::string res_dir_;
 65 | 
 66 | private:
 67 |    void loadRawTextTransProb_(TransProbType& trans_prob, const std::string file);
 68 | 
 69 |    void flushRawTextTransProb_(const std::string& file, const TransProbType& trans_prob);
 70 | 
 71 |    void transProb_(const izenelib::util::UCS2Char& from, const izenelib::util::UCS2Char& to);
 72 | 
 73 |    void updateItem_(TransProbType& trans_prob, const uint32_t df, const izenelib::util::UString& text);
 74 | 
 75 |    int getInputType_(const izenelib::util::UString& input);
 76 | 
 77 |    bool getResultWithScore_(const izenelib::util::UString& input,
 78 |         int type,
 79 |         std::vector<std::pair<double, std::string> >& pinyin_list,
 80 |         std::vector<CandidateResult>& output);
 81 | 
 82 |    void getResultByPinyin_(const std::string& pinyin, double pinyin_score,
 83 |         std::vector<CandidateResult>& output);
 84 | 
 85 |    // trigram 
 86 |    void getResultByPinyinT_(const std::string& pinyin, double pinyin_score,
 87 |         std::vector<CandidateResult>& output);
 88 | 
 89 |    void getResultByPinyinTRecur_(const std::string& pinyin,double base_score,
 90 |         std::pair<double, izenelib::util::UString>& mid_result,
 91 |         std::vector<CandidateResult>& output);
 92 | 
 93 |    double getScore_(const izenelib::util::UString& text,double ori_score,
 94 |             double pinyin_score);
 95 | 
 96 |    bool isCandidate_(const izenelib::util::UString& text, double ori_score,
 97 |         double pinyin_score, double& score);
 98 | 
 99 |    bool isCandidateResult_(const izenelib::util::UString& text, double ori_score,
100 |         double pinyin_score, double& score);
101 | 
102 |    // transition-probability tables: one shared by all instances, one per instance
103 |    static TransProbType global_trans_prob_;
104 |    TransProbType collection_trans_prob_;
105 | 
106 |    std::string collection_dir_;
107 |    double threshold_;
108 |    double mid_threshold_;
109 |    uint16_t max_pinyin_term_;
110 |    
111 |    PinyinDictType pinyin_dict_;
112 |    Pinyin2CnType pinyin2cn_;
113 |    Cn2PinyinType cn2pinyin_;
114 |    boost::unordered_map<std::string, bool> filter_pinyin_;
115 | 
116 |    boost::mutex mutex_;
117 | };
118 | 
119 | 
120 | 
121 | #endif
122 | 


--------------------------------------------------------------------------------
/include/pinyinEngine.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  @ File Name: pinyinEngine.h
 3 |  @ Method: 
 4 |  @ Author: Jerry Shi
 5 |  @ Mail: jerryshi0110@gmail.com
 6 |  @ Created Time: Mon 11 Jul 2016 01:50:06 PM CST
 7 |  ************************************************************************/
 8 | #ifndef PINYIN_ENGINE_H
 9 | #define PINYIN_ENGINE_H
10 | 
11 | 
12 | #include <iostream>
13 | #include "util/mtrie.h"
14 | #include "util/py_types.h"
15 | 
16 | 
17 | class PinYinEngine {
18 |     private:
19 |         typedef MTrie<Ngram, uint32_t, double> TrieType;
20 |         
21 |         struct TransProbType {
22 |             // unigram transition probability
23 |             boost::unordered_map<Unigram, double> u_trans_prob_; 
24 |             boost::unordered_map<Bigram, double> b_trans_prob_;  // bigram
25 |             boost::unordered_map<Trigram, double> t_trans_prob_; // trigram
26 | 
27 |             // clear all data
28 |             void clear() {
29 |                 u_trans_prob_.clear();
30 |                 b_trans_prob_.clear();
31 |                 t_trans_prob_.clear();
32 |             }
33 | 
34 |             // true when all three tables are empty
35 |             bool empty() const {
36 |                 return u_trans_prob_.empty() && b_trans_prob_.empty() && t_trans_prob_.empty();
37 |             }
38 |         };
39 | 
40 |     public:
41 |         // load resource
42 |         // Not implemented yet; returns false so callers do not assume data was loaded.
43 |         bool Load() {
44 |             return false;
45 |         }
46 | 
47 |         // get chinese character
48 |         void GetCnChar() {
49 |         }
50 | 
51 |         // get pinyin from chinese character
52 |         void GetPinYin() {
53 |         }
54 | 
55 |         void GetPinYinWithScore() {
56 |         }
57 | 
58 |         // pinyin tokenizer
59 |         void PySegment() {
60 |         }
61 | 
62 |         // fuzzy segmentation
63 |         void FuzzyPySegment() {
64 |         }
65 | 
66 |     private:
67 | 
68 | };
69 | 
70 | #endif // pinyinEngine.h
71 | 


--------------------------------------------------------------------------------
/include/segment/hash_table.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef HASH_TABLE_HPP_
  2 | #define HASH_TABLE_HPP_
  3 | #include <sys/types.h>
  4 | #include <cstdio>
  5 | #include <string>
  6 | #include "kstring.hpp"
  7 | 
  8 | 
  9 | template<class KEY_T, class VALUE_T >
 10 | class KIntegerHashTable
 11 | {
 12 |     typedef struct _NODE_ // one table slot: key, value and the index of the next slot in its chain
 13 |     {
 14 |         union{ // the key, overlaid with a byte array of the same size
 15 |             uint8_t key_[sizeof(KEY_T)];
 16 |             KEY_T key_type_value_;
 17 |         };
 18 |         union{ // the value, overlaid with a byte array of the same size
 19 |             uint8_t value_[sizeof(VALUE_T)];
 20 |             VALUE_T value_type_value_;
 21 |         };
 22 |         union{ // index of the next slot, overlaid with a byte array of the same size
 23 |             uint8_t next_[sizeof(uint32_t)];
 24 |             uint32_t next_type_value_;
 25 |         };
 26 |         // full constructor; ne defaults to (uint32_t)-1, which the table compares against as "no next slot"
 27 |         _NODE_(const KEY_T& k, const VALUE_T& v, uint32_t ne = -1)
 28 |         {
 29 |             key_type_value_ = k;
 30 |             value_type_value_ = v;
 31 |             next_type_value_ = ne;
 32 |         }
 33 |         // default constructor: only next is set; key and value are not assigned here
 34 |         _NODE_()
 35 |         {
 36 |             next_type_value_ = -1;
 37 |         }
 38 | 
 39 |         KEY_T key()const
 40 |         {
 41 |             return key_type_value_;
 42 |         }
 43 | 
 44 |         VALUE_T value()const
 45 |         {
 46 |             return value_type_value_;
 47 |         }
 48 |         // mutable accessors: return references so callers can assign through them
 49 |         KEY_T& key()
 50 |         {
 51 |             return key_type_value_;
 52 |         }
 53 | 
 54 |         VALUE_T& value()
 55 |         {
 56 |             return value_type_value_;
 57 |         }
 58 | 
 59 |         uint32_t& next()
 60 |         {
 61 |             return next_type_value_;
 62 |         }
 63 | 
 64 |         uint32_t next()const
 65 |         {
 66 |             return next_type_value_;
 67 |         }
 68 |         // equality compares keys only; value and next are ignored
 69 |         bool operator == (const struct _NODE_& o)const
 70 |         {
 71 |             return key() == o.key();
 72 |         }
 73 |         // assignment copies key, value and next
 74 |         _NODE_& operator = (const struct _NODE_& o)
 75 |         {
 76 |             key() = o.key();
 77 |             value() = o.value();
 78 |             next() = o.next();
 79 |             return *this;
 80 |         }
 81 |     } node_t;
 82 | 
 83 |     node_t* nodes_;
 84 |     uint32_t nodes_num_;
 85 |     uint32_t* entry_;
 86 |     uint32_t entry_size_;
 87 |     uint32_t avai_i_;
 88 |     uint32_t size_;
 89 | 
 90 |     uint32_t expansion_(uint32_t len)
 91 |     {
 92 |         // the growth factor shrinks as the pool gets larger: x2, then x1.5, then x1.1
 93 |         const double factor = (len < 1000) ? 2.0 : (len < 10000) ? 1.5 : 1.1;
 94 |         return factor * len;
 95 |     }
 96 | 
 97 |     uint32_t available_()
 98 |     {
 99 |         if (size_+2 >= nodes_num_) {   // at most two free slots left: grow the pool first
100 |             const uint32_t grown = expansion_(nodes_num_);
101 |             node_t* pool = new node_t[grown];
102 |             memcpy(pool, nodes_, nodes_num_*sizeof(node_t));
103 |             for (uint32_t slot = nodes_num_-1; slot < grown-1; ++slot)
104 |                 pool[slot].next() = slot+1;   // chain the old last slot and every new slot
105 |             delete[] nodes_;
106 |             nodes_ = pool;
107 |             nodes_num_ = grown;
108 |         }
109 |         return avai_i_;   // index of the first free slot
110 |     }
111 | 
112 | public:
113 |     KIntegerHashTable(uint32_t ent_size = 100000, uint32_t element_num = 50000)
114 |         : nodes_(new node_t[element_num]), nodes_num_(element_num)
115 |         , entry_(new uint32_t[ent_size]), entry_size_(ent_size)
116 |         , avai_i_(0), size_(0)
117 |     {
118 |         // every bucket starts empty ((uint32_t)-1 marks "no node")
119 |         for (uint32_t b = 0; b < entry_size_; ++b)
120 |             entry_[b] = -1;
121 |         // thread all slots into one free list: i -> i+1; the last slot keeps the
122 |         // next of -1 given to it by node_t's default constructor
123 |         for (uint32_t i = 0; i < nodes_num_-1; ++i)
124 |             nodes_[i].next() = i+1;
125 |     }
126 | 
127 |     ~KIntegerHashTable()
128 |     {
129 |         delete[] entry_;   // bucket heads
130 |         delete[] nodes_;   // node pool
131 |     }
132 | 
133 |     void reserve(uint32_t ent_size, uint32_t element_num)
134 |     {
135 |         if (ent_size > entry_size_) {   // bucket = key % entry_size_, so re-thread every node
136 |             uint32_t* e = new uint32_t[ent_size];
137 |             for (uint32_t b = 0; b < ent_size; ++b) e[b] = (uint32_t)-1;   // all buckets empty
138 |             for (uint32_t b = 0; b < entry_size_; ++b)
139 |                 for (uint32_t i = entry_[b], nx; i != (uint32_t)-1; i = nx)
140 |                 {
141 |                     nx = nodes_[i].next();   // remember the old chain before relinking
142 |                     nodes_[i].next() = e[nodes_[i].key() % ent_size];
143 |                     e[nodes_[i].key() % ent_size] = i;   // node becomes head of its new bucket
144 |                 }
145 |             delete[] entry_; entry_ = e; entry_size_ = ent_size;
146 |         }
147 |         if (element_num > nodes_num_) {   // copy the pool; old last slot and new slots join the free list
148 |             node_t* n = new node_t[element_num];
149 |             memcpy(n, nodes_, nodes_num_*sizeof(node_t));
150 |             for (uint32_t i = nodes_num_-1; i < element_num-1; ++i) n[i].next() = i+1;
151 |             delete[] nodes_; nodes_ = n; nodes_num_ = element_num;
152 |         }
153 |     }
154 | 
155 |     void insert(const KEY_T& k, const VALUE_T& v)
156 |     {
157 |         // Stores v under k; an existing entry for k is overwritten in place.
158 |         // The bucket is k % entry_size_ and colliding keys are chained via next().
159 |         const uint32_t bucket = k%entry_size_;
160 |         // fetched up front, so the pool may grow even if k turns out to exist
161 |         const uint32_t fresh = available_();
162 | 
163 |         // walk the chain through a pointer to the link that leads to each node;
164 |         // when the loop ends, *link is the empty link the new node hangs from
165 |         uint32_t* link = &entry_[bucket];
166 |         while (*link != (uint32_t)-1)
167 |         {
168 |             assert(*link < nodes_num_);
169 |             node_t& cur = nodes_[*link];
170 |             if (cur.key() == k)
171 |             {
172 |                 cur.value() = v;
173 |                 return;
174 |             }
175 |             link = &cur.next();
176 |         }
177 |         *link = fresh;
178 | 
179 |         // pop the slot off the free list, then fill it (its next becomes -1)
180 |         assert(fresh < nodes_num_);
181 |         avai_i_ = nodes_[fresh].next();
182 |         nodes_[fresh] = node_t(k, v);
183 |         size_ ++;
184 |     }
185 | 
186 |     VALUE_T* find(const KEY_T& k)
187 |     {
188 |         // Returns a pointer to the value stored under k, or NULL when k is absent.
189 |         // The pointer refers into the node pool, which is reallocated when it grows.
190 |         const uint32_t bucket = k%entry_size_;
191 | 
192 |         // follow the bucket's chain until the key matches or the chain ends
193 |         for (uint32_t at = entry_[bucket];
194 |              at != (uint32_t)-1;
195 |              at = nodes_[at].next())
196 |         {
197 |             assert(at < nodes_num_);
198 |             if (nodes_[at].key() == k)
199 |             {
200 |                 return &nodes_[at].value();
201 |             }
202 |         }
203 | 
204 |         return NULL;
205 |     }
206 | 
207 |     bool erase(const KEY_T& k)
208 |     {
209 |         // Removes the entry for k.
210 |         // returns true when an entry was removed, false when k was not present
211 |         const uint32_t bucket = k%entry_size_;
212 | 
213 |         // link points at whichever field currently leads to the node under
214 |         // inspection: first the bucket head, afterwards the previous node's next
215 |         uint32_t* link = &entry_[bucket];
216 |         while (*link != (uint32_t)-1)
217 |         {
218 |             const uint32_t at = *link;
219 |             assert(at < nodes_num_);
220 | 
221 |             if (!(nodes_[at].key() == k))
222 |             {
223 |                 link = &nodes_[at].next();
224 |                 continue;
225 |             }
226 | 
227 |             // unlink the node from its chain ...
228 |             *link = nodes_[at].next();
229 | 
230 |             // ... and push its slot onto the head of the free list
231 |             nodes_[at].next() = avai_i_;
232 |             avai_i_ = at;
233 |             --size_;
234 |             return true;
235 |         }
236 | 
237 |         return false;
238 | 
239 |     }
240 | 
241 |     void persistence(const std::string& nm)const
242 |     {
243 |         // Layout: nodes_num_, entry_size_, avai_i_, size_, the bucket array,
244 |         // then the raw node array. Throws std::runtime_error on any failure.
245 |         FILE* f = fopen(nm.c_str(), "wb"); // binary: the payload is raw memory
246 |         if (!f)
247 |             throw std::runtime_error("can't open file.");
248 |         const bool ok = fwrite(&nodes_num_, sizeof(nodes_num_), 1, f) == 1
249 |             && fwrite(&entry_size_, sizeof(entry_size_), 1, f) == 1
250 |             && fwrite(&avai_i_, sizeof(avai_i_), 1, f) == 1
251 |             && fwrite(&size_, sizeof(size_), 1, f) == 1
252 |             && fwrite(entry_, sizeof(uint32_t)*entry_size_, 1, f) == 1
253 |             && fwrite(nodes_, sizeof(node_t)*nodes_num_, 1, f) == 1;
254 |         if (fclose(f) != 0 || !ok) throw std::runtime_error("File write error."); // fclose always runs
255 |     }
256 | 
257 |     void load(const std::string& nm)
258 |     {
259 |         FILE* f = fopen(nm.c_str(), "rb"); // binary: the file holds raw memory images
260 |         if (!f)
261 |             throw std::runtime_error(std::string("can't open file:")+nm);
262 |         // read into temporaries so a short or corrupt file leaves *this untouched
263 |         uint32_t head[4] = {0, 0, 0, 0}; // nodes_num_, entry_size_, avai_i_, size_ (file order)
264 |         uint32_t* e = NULL; node_t* n = NULL;
265 |         bool ok = fread(head, sizeof(head), 1, f) == 1;
266 |         if (ok)
267 |         {
268 |             e = new uint32_t[head[1]];
269 |             n = new node_t[head[0]];
270 |             ok = fread(e, sizeof(uint32_t)*head[1], 1, f) == 1
271 |                 && fread(n, sizeof(node_t)*head[0], 1, f) == 1;
272 |         }
273 |         fclose(f); // closed on the failure path too
274 |         if (!ok) { delete[] e; delete[] n; throw std::runtime_error("File read error."); }
275 |         delete[] entry_; delete[] nodes_; entry_ = e; nodes_ = n; // commit only after every read succeeded
276 |         nodes_num_ = head[0]; entry_size_ = head[1]; avai_i_ = head[2]; size_ = head[3];
277 |     }
278 | 
279 |     uint32_t size()const
280 |     {
281 |         return size_;
282 |     }
283 | 
284 |     class iterator
285 |     {
286 |         KIntegerHashTable<KEY_T, VALUE_T>* ptr_;
287 |         uint32_t idx_;
288 |         uint32_t ei_;
289 | 
290 |     public:
291 |         iterator(KIntegerHashTable<KEY_T, VALUE_T>* ptr = NULL, uint32_t idx = -1, uint32_t ei = 0)
292 |             :ptr_(ptr),idx_(idx),ei_(ei)
293 |         {}
294 | 
295 |         iterator& operator ++(int)
296 |         {
297 |             if (!ptr_ || idx_ == (uint32_t)-1)
298 |                 return *this;
299 | 
300 |             idx_ = ptr_->nodes_[idx_].next();
301 |             if(idx_ != (uint32_t)-1)
302 |                 return *this;
303 | 
304 |             ei_++;
305 |             while(ei_ < ptr_->entry_size_ &&  ptr_->entry_[ei_] == (uint32_t)-1)
306 |                 ei_++;
307 | 
308 |             if (ei_ >= ptr_->entry_size_)
309 |                 return *this;
310 | 
311 |             idx_ = ptr_->entry_[ei_];
312 |             return *this;
313 |         }
314 | 
315 |         iterator& operator ++()
316 |         {
317 |             return (*this)++;
318 |         }
319 | 
320 |         KEY_T* key()
321 |         {
322 |             if (idx_ == (uint32_t)-1)
323 |                 return NULL;
324 |             return &(ptr_->nodes_[idx_].key());
325 |         }
326 | 
327 |         VALUE_T* value()
328 |         {
329 |             if (idx_ == (uint32_t)-1)
330 |                 return NULL;
331 |             return &(ptr_->nodes_[idx_].value());
332 |         }
333 | 
334 |         bool operator == (const iterator& o)const
335 |         {
336 |             return ptr_ == o.ptr_ && idx_ == o.idx_;
337 |         }
338 | 
339 |         bool operator != (const iterator& o)const
340 |         {
341 |             return ptr_ != o.ptr_ || idx_ != o.idx_;
342 |         }
343 |     };
344 | 
345 |     iterator begin()
346 |     {
347 |         uint32_t ei = 0;
348 |         while(ei < entry_size_ &&  entry_[ei] == (uint32_t)-1)
349 |             ei++;
350 |         if (ei >= entry_size_)
351 |             return end();
352 |         return iterator(this, entry_[ei], ei);
353 |     }
354 | 
355 |     iterator end()
356 |     {
357 |         return iterator(this, -1);
358 |     }
359 | };
360 | 
361 | template<
362 | class KEY_T,
363 |       class VALUE_T
364 |       >
365 | class KStringHashTable
366 | {
367 |     KIntegerHashTable<uint64_t, VALUE_T> table_;
368 | 
369 |     uint64_t hash_(const std::string& str)const
370 |     {
371 |        // return HashFunction<std::string>::generateHash64(str);
372 |     }
373 | 
374 |     uint64_t hash_(const KString& kstr)const
375 |     {
376 |         std::string str = kstr.get_bytes("utf-8");
377 |         return hash_(str);
378 |         //return izenelib::util::HashFunction<std::string>::generateHash64((char*)kstr.get_bytes(), kstr.length()*sizeof(uint16_t));
379 |     }
380 | 
381 | public:
382 |     KStringHashTable(uint32_t ent_size = 100000, uint32_t element_num = 50000)
383 |         :table_(ent_size, element_num)
384 |     {}
385 | 
386 |     void reserve(uint32_t ent_size, uint32_t element_num)
387 |     {
388 |         table_.reserve(ent_size, element_num);
389 |     }
390 | 
391 |     void insert(const KEY_T& k, const VALUE_T& v)
392 |     {
393 |         uint64_t h = hash_(k);
394 |         table_.insert(h, v);
395 |     }
396 | 
397 |     void insert(const uint64_t& k, const VALUE_T& v)
398 |     {
399 |         table_.insert(k, v);
400 |     }
401 | 
402 |     VALUE_T* find(const KEY_T& k)
403 |     {
404 |         uint64_t h = hash_(k);
405 |         return table_.find(h);
406 |     }
407 |     
408 |     VALUE_T* find(const uint64_t& k)
409 |     {
410 |         return table_.find(k);
411 |     }
412 | 
413 |     bool erase(const KEY_T& k)
414 |     {
415 |         uint64_t h = hash_(k);
416 |         return table_.erase(h);
417 |     }
418 | 
419 |     void persistence(const std::string& nm)const
420 |     {
421 |         table_.persistence(nm);
422 |     }
423 | 
424 |     void load(const std::string& nm)
425 |     {
426 |         table_.load(nm);
427 |     }
428 | 
429 |     uint32_t size()const
430 |     {
431 |         return table_.size();
432 |     }
433 | 
434 |     typedef typename KIntegerHashTable<uint64_t, VALUE_T>::iterator iterator;
435 | 
436 |     iterator begin()
437 |     {
438 |         return table_.begin();
439 |     }
440 | 
441 |     iterator end()
442 |     {
443 |         return table_.end();
444 |     }
445 | 
446 | };
447 | #endif
448 | 
449 | 
450 | 


--------------------------------------------------------------------------------
/include/segment/kstring.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef KSTRING__HPP
  2 | #define KSTRING__HPP
  3 | 
  4 | #include "sys/types.h"
  5 | #include "/usr/include/iconv.h"
  6 | #include <cerrno>
  7 | #include <string>
  8 | #include <string.h>
  9 | #include <stdint.h>
 10 | #include <stdexcept>
 11 | #include <cassert>
 12 | #include <cstdio>
 13 | #include <cstdlib>
 14 | #include <iostream>
 15 | #include <vector>
 16 | 
 17 | #if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 2)
 18 | # include <ext/atomicity.h>
 19 | #else
 20 | # include <bits/atomicity.h>
 21 | #endif
 22 | 
 23 | /**
 24 |  * @brief Unicode string.
 * The structure of the memory block is:
 26 |  * { 
 27 |  *      uint32_t reference_count;
 28 |  *      uint32_t the capacity of buffer in sizeof(uint16_t);
 29 |  *      uint32_t # of chars of string;
 30 |  *      uint16_t* unicode array of string;
 31 |  * }
 32 |  */
 33 | class KString
 34 | {
 35 |     uint32_t* mem_;
 36 | 
 37 | #define CHECK_NULL(mem) {if(!mem)return;}
 38 | 
 39 |     uint32_t& reference_count_()
 40 |     {
 41 |         assert(mem_ != NULL);
 42 |         return (*(uint32_t*)mem_);
 43 |     }
 44 | 
 45 |     uint32_t& char_num_()
 46 |     {
 47 |         assert(mem_ != NULL);
 48 |         return (*(uint32_t*)(mem_+2));
 49 |     }
 50 | 
 51 |     uint32_t& capacity_()
 52 |     {
 53 |        assert(mem_ != NULL);
 54 |         return (*(uint32_t*)(mem_+1));
 55 |     }
 56 | 
 57 |     uint32_t capacity_()const
 58 |     {
 59 |         if (!mem_)return 0;
 60 |         return (*(uint32_t*)(mem_+1));
 61 |     }
 62 | 
 63 |     uint32_t total_bytes_(uint32_t char_num = 0)const
 64 |     {
 65 |         if (char_num == 0 && mem_)
 66 |           return sizeof(uint32_t)*3  +sizeof(uint16_t)*capacity_();
 67 |         return sizeof(uint32_t)*3  +sizeof(uint16_t)*char_num;
 68 |     }
 69 | 
 70 |     uint16_t* unicodes_()
 71 |     {
 72 |         assert(mem_ != NULL);
 73 |         return (uint16_t*)(mem_+3);
 74 |     }
 75 | 
 76 |     uint16_t* unicodes_()const
 77 |     {
 78 |         assert(mem_ != NULL);
 79 |         return (uint16_t*)(mem_+3);
 80 |     }
 81 | 
 82 |     void refer_()
 83 |     {
 84 |         CHECK_NULL(mem_);
 85 |          __gnu_cxx::__atomic_add(( _Atomic_word*)(uint32_t*)mem_, 1);
 86 |     }
 87 | 
 88 |     void defer_()
 89 |     {
 90 |         CHECK_NULL(mem_);
 91 |         if (__sync_add_and_fetch(( _Atomic_word*)(uint32_t*)mem_, -1) <= 0)
 92 |         {
 93 |             free(mem_);
 94 |             mem_ = NULL;
 95 |         }
 96 |     }
 97 | 
 98 |     uint32_t string_len_()const
 99 |     {
100 |         if (!mem_)return 0;
101 |         return (*(uint32_t*)(mem_+2));
102 |     }
103 | 
104 |     uint32_t expansion_(uint32_t t)
105 |     {
106 |         if (t + string_len_() < 100)return 2*(t + string_len_());
107 |         if (t + string_len_() < 1000)return 1.5*(t + string_len_());
108 |         return 1.1*(t + string_len_());
109 |     }
110 | 
111 |     void copy_on_write_()
112 |     {
113 |         CHECK_NULL(mem_);
114 |         assert(reference_count_() > 0);
115 |         if (reference_count_() == 1)
116 |           return;
117 |         uint32_t s = total_bytes_();
118 |         uint32_t* m =  (uint32_t*)malloc(s);
119 |         memcpy(m, mem_, s);
120 |         defer_();
121 |         mem_ = m;
122 |         reference_count_() = 1;
123 |     }
124 | 
125 |     std::string encoding_name_(const std::string encode)
126 |     {
127 |         if (strcmp(encode.c_str(), "utf8") == 0)
128 |           return "utf-8";
129 |         return encode;
130 |     }
131 |     public:
132 |     explicit KString(const std::string& str, const std::string& encode="utf-8//IGNORE")
133 |         :mem_(NULL)
134 |     {
135 |         if (str.length() == 0)
136 |           return;
137 |         reserve(str.length());
138 |         std::size_t inlen = str.length();
139 |         std::size_t outlen = capacity_()*2;
140 |         char* out = (char*)unicodes_();
141 |         char* in = const_cast <char *> (str.c_str());
142 | 
143 |         iconv_t hdl = iconv_open("ucs-2", encode.c_str());//encoding_name_(encode).c_str()) ;
144 |         if (hdl == (iconv_t)-1)
145 |           throw std::runtime_error("Can't initiate iconv handler");
146 |         std::size_t ret;
147 |         while(1)
148 |         {
149 |             ret = iconv(hdl, &in, &inlen, &out, &outlen);
150 |             if (inlen == 0)
151 |               break;
152 |             if (ret == (std::size_t)-1)
153 |             {
154 |                 iconv_close(hdl);
155 |                 if(errno == E2BIG)
156 |                     throw std::runtime_error("Not enough output buffer for conversion.");
157 |                 if (errno == EINVAL)
158 |                     throw std::runtime_error("Incomplete multibyte sequence.");
159 |                 if (errno == EILSEQ)//std::cerr<<"Invalid multibyte sequence.\n";
160 |                     throw std::runtime_error("Invalid multibyte sequence.");
161 |                 throw std::runtime_error("iconv error");
162 |             }
163 |         }
164 |         iconv_close(hdl);
165 |         char_num_() = (capacity_()*2 - outlen)/2;
166 |         reference_count_() = 1;
167 |     }
168 |      ~KString()
169 |      {
170 |          defer_();
171 |      }
172 | 
173 |     explicit KString()
174 |         :mem_(NULL)
175 |     {}
176 | 
177 |     /**
178 |      * @brief 
179 |      *
180 |      * @param s
181 |      * @param e exclusivly
182 |      */
183 |     explicit KString(uint16_t* s, uint16_t* e)
184 |     {
185 |         uint32_t len = (e - s);
186 |         uint32_t b = total_bytes_(len);
187 |         mem_ = (uint32_t*)malloc(b);
188 |         memset(mem_, 0, b);
189 |         memcpy(unicodes_(), s, len*sizeof(uint16_t));
190 |         capacity_() = len;
191 |         char_num_() = len;
192 |         reference_count_() = 1;
193 |     }
194 | 
195 |     KString(const KString& o)
196 |     {
197 |         mem_ = o.mem_;
198 |         refer_();
199 |     }
200 | 
201 |     KString& operator = (const KString& o)
202 |     {
203 |         defer_();
204 |         mem_ = o.mem_;
205 |         refer_();
206 |         return *this;
207 |     }
208 | 
209 |     uint32_t length()const
210 |     {
211 |         return string_len_();
212 |     }
213 | 
214 |     uint16_t char_at(uint32_t i)const
215 |     {
216 |         assert(i < string_len_());
217 |         return unicodes_()[i];
218 |     }
219 | 
220 |     uint16_t& operator [] (uint32_t i)
221 |     {
222 |         assert(i < string_len_());
223 |         copy_on_write_();
224 |         return unicodes_()[i];
225 |     }
226 | 
227 |     uint16_t operator [] (uint32_t i)const
228 |     {
229 |          return char_at(i);
230 |     }
231 | 
232 |     int32_t compare_to(const KString& o)const
233 |     {
234 |         uint32_t i=0;
235 |         for ( ; i<length() && i<o.length(); ++i)
236 |           if (char_at(i) < o.char_at(i))
237 |             return -1;
238 |           else if (char_at(i) > o.char_at(i))
239 |             return 1;
240 | 
241 |         if (length() > i)
242 |           return 1;
243 |         else if (o.length() > i)
244 |           return -1;
245 |         return 0;
246 |     }
247 | 
248 |     void reserve(uint32_t len)
249 |     {
250 |         if (mem_ && len < capacity_())
251 |           return;
252 | 
253 |         uint32_t s = total_bytes_(len);
254 |         assert(s > sizeof(uint32_t)*3);
255 |         uint32_t* m = (uint32_t*)malloc(s);
256 |         memset(m, 0, s);
257 |         if (mem_)
258 |         {
259 |             assert(total_bytes_() > sizeof(uint32_t)*3);
260 |             assert(total_bytes_() < s);
261 |             memcpy(m, mem_, total_bytes_());
262 |             defer_();
263 |         }
264 |         mem_ = m;
265 |         capacity_() = len;
266 |         reference_count_() = 1;
267 |         assert(capacity_() >= char_num_());
268 |     }
269 | 
270 |     void concat(const KString& o)
271 |     {
272 |         if (o.length() == 0)return;
273 |         if (length() == 0)
274 |         {
275 |             *this = o;
276 |             return;
277 |         }
278 |         if (mem_ && length() + o.length() < capacity_())
279 |         {
280 |             copy_on_write_();
281 |             uint32_t l = char_num_();
282 |             char_num_() += o.length();
283 |             for ( uint32_t i=0; i<o.length(); ++i)
284 |               unicodes_()[l+i] = o[i];
285 |             return;
286 |         }
287 |         reserve(expansion_(o.length()));
288 |         copy_on_write_();
289 |         uint32_t l = char_num_();
290 |         char_num_() += o.length();
291 |         for ( uint32_t i=0; i<o.length(); ++i)
292 |            unicodes_()[l+i] = o[i];
293 |     }
294 | 
295 |     void concat(uint16_t c)
296 |     {
297 |         if (mem_ && length() + 1 < capacity_())
298 |         {
299 |             copy_on_write_();
300 |             char_num_() ++;
301 |             unicodes_()[char_num_()-1] = c;
302 |             //std::cout<<(char)c<<":::"<<char_num_()<<std::endl;
303 |             //std::cout<<(*this)<<std::endl;
304 |             return;
305 |         }
306 |         reserve(expansion_(1));
307 |         copy_on_write_();
308 |         char_num_()++;
309 |         unicodes_()[char_num_()-1] = c;
310 |         //std::cout<<(char)c<<"::"<<char_num_()<<std::endl;
311 |         //  std::cout<<(*this)<<std::endl;
312 |     }   
313 | 
314 |     KString& operator += (const KString& o)
315 |     {
316 |         concat(o);
317 |         return *this;
318 |     }
319 | 
320 |     KString& operator += (uint16_t c)
321 |     {
322 |         concat(c);
323 |         return *this;
324 |     }
325 | 
326 |     KString& operator + (const std::string& utf8str)
327 |     {
328 |         concat(KString(utf8str));
329 |         return *this;
330 |     }
331 | 
332 |     KString& operator + (uint16_t c)
333 |     {
334 |         concat(c);
335 |         return *this;
336 |     }
337 | 
338 |     bool end_with(const KString& o)const
339 |     {
340 |         if (length() == 0 || o.length() == 0 || length() < o.length())
341 |           return false;
342 |         for ( int32_t i = o.length() -1, j=length()-1; i >=0 && j>=0; --i,--j)
343 |           if (o[i] != char_at(j))
344 |             return false;
345 |         return true;
346 |     }
347 | 
348 |     bool end_with(const std::string& utf8str)const
349 |     {
350 |         return end_with(KString(utf8str));
351 |     }
352 | 
353 |     bool equals(const KString& o)const
354 |     {
355 |         return compare_to(o) == 0;
356 |     }
357 | 
358 |     bool equals(const std::string& utf8str)const
359 |     {
360 |         return compare_to(KString(utf8str)) == 0;
361 |     }
362 | 
363 |     bool operator == (const KString& o)const
364 |     {
365 |         return equals(o);
366 |     }
367 | 
368 |     bool operator == (const std::string& utf8str)const
369 |     {
370 |         return equals(KString(utf8str));
371 |     }
372 | 
373 |     uint16_t* get_bytes()const
374 |     {
375 |         return unicodes_();
376 |     }
377 | 
378 |     std::string get_bytes(const std::string& encode)const
379 |     {
380 |         if (length() == 0)return "";
381 | 
382 |         char* out = new char[length()*3];
383 |         char* outbuf = out;
384 |         std::size_t inlen = length() *2 ;
385 |         std::size_t outlen = length() *3;
386 |         char* inbuf = (char*)unicodes_();
387 |         std::size_t ret = 0;
388 |         iconv_t hdl = iconv_open(encode.c_str(), "ucs-2") ;
389 |         if (hdl == (iconv_t)-1)
390 |         {
391 |             delete[] out;
392 |             throw std::runtime_error("Can't initiate iconv handler");
393 |         }
394 | 
395 |         while(1)
396 |         {
397 |             ret = iconv(hdl, &inbuf, &inlen, &outbuf, &outlen);
398 |             if (ret == 0)
399 |               break;
400 |             if (ret == (std::size_t)-1 && errno == E2BIG)
401 |             {
402 |                 iconv_close(hdl);
403 |                 delete[] out;
404 |                 throw std::runtime_error("encoding convert error");
405 |             }
406 |             inbuf++;
407 |             inlen--;
408 |         }
409 |         
410 |         iconv_close(hdl);
411 |         if (outlen == (std::size_t)-1){
412 |             delete out;
413 |             throw std::runtime_error("Not malloc enough memory.");
414 |         }
415 | 
416 |         std::string re(out, length()*3-outlen);
417 |         delete[] out;
418 |         return re;
419 |     }
420 | 
421 |     friend std::ostream& operator << (std::ostream& os, const KString& o)
422 |     {
423 |         os << o.get_bytes("utf-8");
424 |         return os;
425 |     }
426 |     
427 |     uint32_t index_of(uint16_t c, uint32_t start_from=0)const
428 |     {
429 |         for ( uint32_t i=start_from; i<length(); ++i)
430 |           if (char_at(i) == c)
431 |             return i;
432 |         return -1;
433 |     }
434 | 
435 |     uint32_t find(const std::string& o)const
436 |     {
437 |         return find(KString(o));
438 |     }
439 | 
440 |     uint32_t find(const KString& o)const
441 |     {
442 |         if (length() == 0 || o.length() == 0 || o.length() > length())
443 |           return -1;
444 |         for ( uint32_t i=0; i<=length()-o.length(); ++i)
445 |           if (char_at(i) == o[0])
446 |           {
447 |               uint32_t j=0;
448 |               for ( ; j<o.length(); ++j)
449 |                 if (char_at(i+j) != o[j])
450 |                   break;
451 |               if (j == o.length())
452 |                 return i;
453 |           }
454 |         return -1;
455 |     }
456 | 
457 |     uint32_t find(int o)const
458 |     {
459 |         if (length() == 0)
460 |           return -1;
461 |         for ( uint32_t i=0; i<length();i++)
462 |             if (char_at(i) == o)
463 |                 return i;
464 |         return -1;
465 |     }
466 | 
467 |     /**
468 |      * @brief Replace all 'a' with 'b'
469 |      *
470 |      * @param a
471 |      * @param b
472 |      */
473 |     void replace(uint16_t a, uint16_t b)
474 |     {
475 |         if (a == b)
476 |           return;
477 |         copy_on_write_();
478 |         for ( uint32_t i=0; i<length(); ++i)
479 |           if (char_at(i) == a)
480 |             unicodes_()[i] = b;
481 |     }
482 | 
483 |     /**
484 |      * @brief Replace all string "a" with "b"
485 |      *
486 |      * @param a
487 |      * @param b
488 |      */
489 |     void replace(const KString& a, const KString& b)
490 |     {
491 |         uint32_t p = find(a);
492 |         if (p == (uint32_t)-1)
493 |           return;
494 |         if (a.length() < b.length())
495 |           reserve(expansion_(length() +  b.length() - a.length()));
496 |         copy_on_write_();
497 |         for ( uint32_t i = length()-1, j=1; i>=p+a.length(); --i,++j)
498 |           unicodes_()[length() +  b.length() - a.length()-j] =  unicodes_()[i];
499 |         for ( uint32_t i=0; i<b.length(); ++i)
500 |           unicodes_()[p+i] = b[i];
501 | 
502 |         char_num_() = length() - a.length() + b.length();
503 |     }
504 | 
505 |     void  replace(const std::string& utf8a, const std::string& utf8b)
506 |     {
507 |         replace(KString(utf8a), KString(utf8b));
508 |     }
509 | 
510 |     /**
511 |      * @brief Get the substr, start from index 's' with the length 'len'.
512 |      *
513 |      * @param s
514 |      * @param len if it's larger than length(), it will be assigned to length()
515 |      *
516 |      * @return the substring.
517 |      */
518 |     KString substr(uint32_t s, uint32_t len = -1)const
519 |     {
520 |         assert(s < length());
521 |         if (length() == 0 || len == 0)return KString();
522 |         if (len == (uint32_t)-1) len = length() - s;
523 |         assert(len + s <= length());
524 |         return KString(unicodes_()+s, unicodes_()+s+len);
525 |     }
526 |     /**
527 |      * @brief Split the string by char c into a vector of string
528 |      *
529 |      * @param c the delimitor
530 |      *
531 |      * @return the vector of string
532 |      */
533 |     std::vector<KString>
534 |         split(uint16_t c)
535 |         {
536 |             std::vector<KString> v;
537 |             uint32_t s = 0;
538 |             uint32_t f = index_of(c);
539 |             while (f != (uint32_t)-1)
540 |             {
541 |                 v.push_back(substr(s, f- s));
542 |                 s = f+1;
543 |                 f = index_of(c, s);
544 |             }
545 |             if (s < length())
546 |               v.push_back(substr(s));
547 |             return v;
548 |         }
549 | 
550 |     void to_lower_case()
551 |     {
552 |         bool f = true;
553 |         for ( uint32_t i=0; i<length(); ++i)
554 |           if (char_at(i) >= 'A' && char_at(i)<='Z')
555 |           {
556 |               f = false;
557 |               break;
558 |           }
559 |         if (f)return;
560 |         copy_on_write_();
561 |         for ( uint32_t i=0; i<length(); ++i)
562 |           if (char_at(i) >= 'A' && char_at(i)<='Z')
563 |             unicodes_()[i] = 'a' + char_at(i) - 'A';
564 |         return;
565 |     }
566 |     
567 |     void to_upper_case()
568 |     {
569 |         bool f = true;
570 |         for ( uint32_t i=0; i<length(); ++i)
571 |           if (char_at(i) >= 'a' && char_at(i)<='z')
572 |           {
573 |               f = false;
574 |               break;
575 |           }
576 |         if (f)return;
577 |         copy_on_write_();
578 |         for ( uint32_t i=0; i<length(); ++i)
579 |           if (char_at(i) >= 'a' && char_at(i)<='z')
580 |             unicodes_()[i] = 'A' + char_at(i) - 'a';
581 |         return;
582 |     }
583 | 
584 |     void to_dbc()
585 |     {
586 |         bool f = true;
587 |         for ( uint32_t i=0; i<length(); ++i)
588 |           if (char_at(i) == 1228 || (char_at(i) > 65280 && char_at(i) < 65375))
589 |           {
590 |               f = false;
591 |               break;
592 |           }
593 |         if (f)return;
594 |         copy_on_write_();
595 |         for ( uint32_t i=0; i<length(); ++i)
596 |           if (char_at(i) == 1228)
597 |             unicodes_()[i] = 32;
598 |           else if (char_at(i) > 65280 && char_at(i) < 65375)
599 |             unicodes_()[i] -= 65248;
600 |     }
601 | 
602 |     void trim(uint16_t space = ' ')
603 |     {
604 |         uint32_t p = index_of(space);
605 |         if (p == (uint32_t)-1)
606 |           return;
607 | 
608 |         copy_on_write_();
609 |         uint32_t r = p +1;
610 |         uint32_t t = 1;
611 |         for(;r<length();++p,++r)
612 |         {
613 |             while(r<length()&&char_at(r) == space)r++,t++;
614 |             unicodes_()[p] = char_at(r);
615 |         }
616 |         char_num_() -= t;
617 |     }
618 | 
619 |     void trim_head_tail(uint16_t space = ' ')
620 |     {
621 |         if (length() == 0 || (length()==1&&char_at(0)!=space))
622 |             return;
623 | 
624 |         uint32_t p = 0;
625 |         while(p<length() && char_at(p) == space)
626 |             p++;
627 |         if (p >= length())
628 |         {
629 |             copy_on_write_(), char_num_() = 0;
630 |             return;
631 |         }
632 |         if (p != 0)
633 |         {
634 |             copy_on_write_();
635 |             for (uint32_t i=0;i+p<length();++i)
636 |                 unicodes_()[i] = char_at(i+p);
637 |         }
638 |         uint32_t t = length()-p;
639 |         for (int32_t i=t-1; i>=0 && char_at(i)==space; --i,--t);
640 | 
641 |         if ( p == 0 && length()!= t)
642 |             copy_on_write_();
643 |         char_num_() = t;
644 |     }
645 | 
646 |     void trim_into_1(uint16_t space = ' ')
647 |     {
648 |         uint32_t f = 0, t = 0, s = -2;
649 |         bool chng = false;
650 |         while (f < length())
651 |         {
652 |             if (char_at(f) == space)
653 |             {
654 |                 if (f - s == 1){f++,s++;continue;}
655 |                 s = f;
656 |             }
657 | 
658 |             if (t != f && !chng)
659 |             {
660 |                 copy_on_write_();
661 |                 chng = true;
662 |             }
663 | 
664 |             if (t != f)
665 |                 unicodes_()[t] = char_at(f);
666 |             t++, f++;
667 |         }
668 |         if (chng) char_num_() = t;
669 |     }
670 | 
671 |     static KString value_of(uint32_t v)
672 |     {
673 |         char buf[125];
674 |         memset(buf, 0, sizeof(buf));
675 |         sprintf(buf, "%d", v);
676 |         return KString(std::string(buf));
677 |     }
678 |     
679 |     static KString value_of(int v)
680 |     {
681 |         char buf[125];
682 |         memset(buf, 0, sizeof(buf));
683 |         sprintf(buf, "%d", v);
684 |         return KString(std::string(buf));
685 |     }
686 |     
687 |     static KString value_of(double v)
688 |     {
689 |         char buf[125];
690 |         memset(buf, 0, sizeof(buf));
691 |         sprintf(buf, "%f", v);
692 |         return KString(std::string(buf));
693 |     }
694 | 
695 |     bool operator < (const KString& o)const
696 |     {
697 |         for ( uint32_t i=0; i<length()&&i<o.length(); ++i)
698 |           if (char_at(i) < o[i])
699 |             return true;
700 |           else if (char_at(i) > o[i])
701 |             return false;
702 | 
703 |         if (length() < o.length())
704 |           return true;
705 |         return false;
706 |     }
707 | 
708 |     static bool is_korean(uint16_t ucs2char)
709 |     {   
710 |         if ((ucs2char>=0x1100 && ucs2char<=0x11FF)
711 |           ||(ucs2char>=0x3130 && ucs2char<=0x318F)
712 |           ||(ucs2char>=0xAC00 && ucs2char<=0xD7AF)
713 |           )return true;
714 |         return false;
715 |     }   
716 | 
717 |     static bool is_chinese(uint16_t ucs2char)
718 | 	{
719 | 		if (((ucs2char>=0x2E80 && ucs2char<=0x2EF3)
720 | 					|| (ucs2char>=0x2F00 && ucs2char<=0x2FD5)
721 | 					|| (ucs2char>=0x3400 && ucs2char<=0x4DB5)
722 | 					|| (ucs2char>=0x4E00 && ucs2char<=0x9FC3)
723 | 					|| (ucs2char>=0xF900 && ucs2char<=0xFAD9))
724 |           && ucs2char!=12289 
725 |           && ucs2char!=12298 
726 |           && ucs2char!=12290 
727 |           && ucs2char!=12299 
728 |           && ucs2char!=65292 
729 |           && ucs2char!=65311 
730 |           && ucs2char!=65281 
731 |           && ucs2char!=65306 
732 |           && ucs2char!=65307 
733 |           && ucs2char!=8220 
734 |           && ucs2char!=8221 
735 |           && ucs2char!=12304 
736 |           && ucs2char!=12305 
737 |           && ucs2char!=65509 
738 |           && ucs2char!=8230 
739 |           && ucs2char!=65288 
740 |           && ucs2char!=65289 
741 |           && ucs2char!=8212
742 |           && ucs2char!=20022) 
743 | 		  return true;
744 | 
745 | 		return false;
746 | 	}
747 | 
748 |     static bool is_chn_numeric(uint16_t ucs2char)
749 |     {
750 |         if (ucs2char == 38646//零
751 |             || ucs2char == 19968//一
752 |             || ucs2char == 20108//二
753 |             || ucs2char == 19977
754 |             || ucs2char == 22235
755 |             || ucs2char == 20116
756 |             || ucs2char == 20845
757 |             || ucs2char == 19971
758 |             || ucs2char == 20843
759 |             || ucs2char == 20061
760 |             || ucs2char == 21313//十
761 |             )
762 |             return true;
763 |         return false;
764 |     }
765 | 
766 | 	static bool is_numeric(uint16_t ucs2char)
767 | 	{
768 | 		static const uint16_t zero('0'), nine('9');
769 | 		    if ( zero <= ucs2char && ucs2char <= nine )
770 | 			          return true;
771 | 			    return false;
772 | 	}	
773 | 
774 | 	static bool is_english(uint16_t ucs2char)
775 | 	{
776 | 		static const uint16_t a('a'), z('z'), A('A'), Z('Z');
777 | 		    if ( ( a <= ucs2char && ucs2char <= z ) || ( A <= ucs2char && ucs2char <= Z ) )
778 | 			          return true;
779 | 			    return false;
780 | 	}
781 | 
782 | };
783 | #endif
784 | 


--------------------------------------------------------------------------------
/include/segment/line_reader.h:
--------------------------------------------------------------------------------
 1 | 
 2 | #ifndef LINE_READER_H_
 3 | #define LINE_READER_H_
 4 | 
#include <sys/types.h>
#include <stdint.h>

#include <cassert>
#include <cstdio>
#include <cstring>
#include <iostream>
#include <stdexcept>
#include <string>
#include <vector>
11 | 
12 | namespace util{
13 | 
/**
 * @brief Reads a text file line by line through a fixed-size block buffer.
 *
 * The file is read in blocks of at most buf_size bytes. Inside a block
 * every '\n' is replaced by a NUL, so each line is a C string that lives
 * in the buffer. A line cut off at the end of a block is not returned from
 * that block; the file position is moved back so that the next block
 * starts with it.
 *
 * Usage: `for (char* l = r.line(); l; l = r.line(l)) ...`. A returned
 * pointer is only valid until the next block is loaded. Blank lines are
 * returned as empty strings.
 */
class LineReader
{
    FILE* f_;          // source file, opened by the constructor
    char* mem_;        // block buffer of bytes_ + 1 chars; the last one is always NUL
    uint64_t bytes_;   // usable size of the block buffer
    uint64_t valid_;   // number of leading bytes of mem_ that hold line data

    // The object owns f_ and mem_; copying it would release both twice.
    LineReader(const LineReader&);
    LineReader& operator=(const LineReader&);

    /**
     * @brief Load the next block from the current file position.
     * @return false when no more data is available.
     * @throws std::runtime_error if a single line does not fit into the
     *         buffer or the file position cannot be restored.
     */
    bool next_block_()
    {
        valid_ = 0;
        if (::feof(f_))
          return false;

        const long start = ftell(f_);
        memset(mem_, 0, bytes_ + 1);
        // Element size 1 so that the return value is the byte count.
        const uint64_t got = fread(mem_, 1, bytes_, f_);
        if (got == 0)
          return false;

        // Turn every newline into a string terminator and remember the last one.
        char* la_n = NULL;
        for (char* m = mem_; m < mem_ + got; ++m)
          if (*m == '\n')
            *m = 0, la_n = m;

        // Short read or end of file: everything read belongs to this block.
        if (got < bytes_ || ::feof(f_))
        {
            valid_ = got;
            return true;
        }

        // Full buffer: whatever follows the last newline may be an
        // incomplete line, so drop it and re-read it with the next block.
        if (la_n == NULL)
          throw std::runtime_error("line is longer than the read buffer.");
        const uint64_t keep = (la_n - mem_) + 1;
        if (start < 0 || fseek(f_, start + (long)keep, SEEK_SET) != 0)
          throw std::runtime_error("can't seek in file.");
        memset(la_n + 1, 0, bytes_ - keep);
        valid_ = keep;
        return true;
    }

    public:
        /**
         * @param nm       path of the file to read.
         * @param buf_size block size in bytes; no line may be longer.
         * @throws std::runtime_error if the file cannot be opened.
         */
        LineReader(const std::string& nm, uint64_t buf_size = 1000000)
            : f_(fopen(nm.c_str(), "r")), mem_(NULL), bytes_(buf_size), valid_(0)
        {
            if (!f_)
              throw std::runtime_error("can't open file.");

            try
            {
                // One extra byte: a permanent terminator behind the data.
                mem_ = new char[bytes_ + 1];
            }
            catch (...)
            {
                fclose(f_);
                throw;
            }
            memset(mem_, 0, bytes_ + 1);
        }

        ~LineReader()
        {
            if (f_)
              fclose(f_);
            delete [] mem_;
        }

        /**
         * @brief Get the first line (prev_line == NULL, rewinds the file)
         *        or the line that follows prev_line.
         *
         * @param prev_line NULL, or the pointer returned by the previous call.
         * @return pointer to the NUL-terminated line inside the internal
         *         buffer, or NULL when the file is exhausted.
         */
        char* line(char* prev_line = NULL)
        {
            if (!prev_line)
            {
                fseek(f_, 0, SEEK_SET);
                return next_block_() ? mem_ : NULL;
            }

            assert(prev_line >= mem_);
            assert(prev_line < mem_+bytes_);

            // Step over the previous line and its terminator.
            prev_line += strlen(prev_line) + 1;

            // The end of the block is decided by the amount of valid data,
            // not by hitting an empty string: a blank line is valid data.
            if (prev_line >= mem_ + valid_)
                return next_block_() ? mem_ : NULL;
            return prev_line;
        }
};
91 | 
92 | }
93 | 
94 | #endif
95 | 


--------------------------------------------------------------------------------
/include/segment/normalize.h:
--------------------------------------------------------------------------------
 1 | #ifndef NORMALIZE_H_
 2 | #define NORMALIZE_H_
 3 | 
 4 | #include "trd2simp.h"
 5 | #include "kstring.hpp"
 6 | 
 7 | #include <string>
 8 | 
 9 | 
10 | class KNormalize   // Static helpers: in-place text normalization and KString -> UTF-8 encoding.
11 | {
12 |     //static Trad2Simp trd2smp_;
13 | public:
14 |      static void normalize(KString& kstr)   // Normalizes kstr in place; any exception is swallowed.
15 |     {
16 | 		static Trad2Simp trd2smp_;   // function-local static: constructed on the first call only
17 |         try{
18 |         kstr.to_dbc();
19 |         kstr.to_lower_case();
20 |         trd2smp_.transform(kstr);
21 |         kstr.trim_into_1();
22 |         kstr.trim_head_tail();
23 |         }catch(...){}
24 |     }
25 | 
26 |      static void normalize(std::string& str)   // Same as above for a std::string; str is left untouched if anything throws.
27 |     {
28 |         try{
29 |         KString kstr(str);
30 |         normalize(kstr);
31 |         str = unicode_to_utf8(kstr);   // write the normalized text back as UTF-8
32 |         }
33 |         catch(...){}
34 |     }
35 | 
36 |      static std::string unicode_to_utf8(const KString& kstr)   // Encodes every element of kstr as a UTF-8 sequence.
37 |     {
38 |         std::string s;
39 |         s.reserve(kstr.length() << 2);   // room for 4 bytes per element
40 |         for(size_t i = 0; i < kstr.length(); ++i)
41 |         {
42 |             uint16_t unic = kstr[i];   // 16-bit value: only the branches up to 0xFFFF below can be taken
43 |             if ( unic <= 0x0000007F )
44 |             {
45 |                 // * U-00000000 - U-0000007F:  0xxxxxxx
46 |                 s.append(1, unic & 0x7F);
47 |             }
48 |             else if ( unic >= 0x00000080 && unic <= 0x000007FF )
49 |             {
50 |                 // * U-00000080 - U-000007FF:  110xxxxx 10xxxxxx
51 |                 s.append(1, (((unic >> 6) & 0x1F) | 0xC0));
52 |                 s.append(1, ((unic & 0x3F) | 0x80));
53 |             }
54 |             else if ( unic >= 0x00000800 && unic <= 0x0000FFFF )
55 |             {
56 |                 // * U-00000800 - U-0000FFFF:  1110xxxx 10xxxxxx 10xxxxxx
57 |                 s.append(1, (((unic >> 12) & 0x0F) | 0xE0));
58 |                 s.append(1, (((unic >>  6) & 0x3F) | 0x80));
59 |                 s.append(1, ((unic & 0x3F) | 0x80));
60 |             }
61 |             else if ( unic >= 0x00010000 && unic <= 0x001FFFFF )   // unreachable while unic is uint16_t
62 |             {
63 |                 // * U-00010000 - U-001FFFFF:  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
64 |                 s.append(1, (((unic >> 18) & 0x07) | 0xF0));
65 |                 s.append(1, (((unic >> 12) & 0x3F) | 0x80));
66 |                 s.append(1, (((unic >>  6) & 0x3F) | 0x80));
67 |                 s.append(1, ((unic & 0x3F) | 0x80));
68 |             }
69 |             else if ( unic >= 0x00200000 && unic <= 0x03FFFFFF )   // unreachable while unic is uint16_t
70 |             {
71 |                 // * U-00200000 - U-03FFFFFF:  111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
72 |                 s.append(1, (((unic >> 24) & 0x03) | 0xF8));
73 |                 s.append(1, (((unic >> 18) & 0x3F) | 0x80));
74 |                 s.append(1, (((unic >> 12) & 0x3F) | 0x80));
75 |                 s.append(1, (((unic >>  6) & 0x3F) | 0x80));
76 |                 s.append(1, ((unic & 0x3F) | 0x80));
77 |             }
78 |             else if ( unic >= 0x04000000 && unic <= 0x7FFFFFFF )   // unreachable while unic is uint16_t
79 |             {
80 |                 // * U-04000000 - U-7FFFFFFF:  1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
81 |                 s.append(1, (((unic >> 30) & 0x01) | 0xFC));
82 |                 s.append(1, (((unic >> 24) & 0x3F) | 0x80));
83 |                 s.append(1, (((unic >> 18) & 0x3F) | 0x80));
84 |                 s.append(1, (((unic >> 12) & 0x3F) | 0x80));
85 |                 s.append(1, (((unic >>  6) & 0x3F) | 0x80));
86 |                 s.append(1, ((unic & 0x3F) | 0x80));
87 |             }
88 |         }
89 |         return s;
90 |     }
91 | 
92 | 
93 | };
94 | 
95 | #endif
96 | 


--------------------------------------------------------------------------------
/include/segment/segment.h:
--------------------------------------------------------------------------------
  1 | #ifndef NLP_SEGMENT_H_
  2 | #define NLP_SEGMENT_H_
  3 | 
  4 | #include <string>
  5 | #include <vector>
  6 | #include <list>
  7 | #include <fstream>
  8 | #include <cstdlib>
  9 | #include <algorithm>
 10 | #include <istream>
 11 | #include <ostream>
 12 | #include <cctype>
 13 | #include <math.h>
 14 | #include <boost/algorithm/string.hpp>
 15 | 
 16 | #include "kstring.hpp"
 17 | #include "normalize.h"
 18 | #include "segment_dict.h"
 19 | #include "line_reader.h"
 20 | 
 21 | namespace knlp
 22 | {
 23 | class HorseTokenize{
 24 |     KDictionary<float> tk_dict_;
 25 |     KDictionary<const char*> rewrite_dict_;
 26 | 
 27 |     bool is_digit_(char c)const
 28 |     {
 29 |         if ('0' <= c && c <= '9') return true;   // plain ASCII digit
 30 |         switch (c) {   // symbols this tokenizer also treats as digit-like
 31 |             case '.': case '-': case '+': case '/': case '=': case '*':
 32 |             case '%': case ',': case '$': case '&': case '_': return true;
 33 |             default: return false;
 34 |         }
 35 |     }
 36 | 
 37 |     bool is_punct_(char c)const
 38 |     {
 39 |         static const char kPunct[] = ".-+/=*%,$&_";   // the same symbol set is_digit_() accepts
 40 |         for (const char* p = kPunct; *p != '\0'; ++p)
 41 |             if (*p == c) return true;
 42 |         return false;
 43 |     }
 44 | 
 45 |     void merge_(std::vector<std::pair<std::string, float> >& tks)const   // Fuses each run of adjacent flagged tokens into a single token.
 46 |     {
 47 |         std::vector<int32_t> flags(tks.size(), 0);   // flags[i] == 1 marks tks[i] as mergeable
 48 |         for (uint32_t i=0;i<tks.size();i++)
 49 |         {
 50 |             uint32_t j=0;
 51 |             for (;j<tks[i].first.length();j++)
 52 |                 if (is_digit_(tks[i].first[j])
 53 |                   ||(tks[i].first[j]>='a' && tks[i].first[j]<='z' && tks[i].first.length()<=2))
 54 |                     break;   // digit-like byte, or a lowercase letter in a token of at most 2 bytes
 55 |             if (j < tks[i].first.length())flags[i] = 1;
 56 |         }
 57 | 
 58 |         for (uint32_t i=0;i<tks.size();i++)if(flags[i])
 59 |         {
 60 |             uint32_t t = i;   // head of the run; absorbs the flagged tokens that follow it
 61 |             i++;
 62 |             while(i<tks.size() && flags[i])
 63 |             {
 64 |                 tks[t].first += tks[i].first;
 65 |                 //tks[t].second += tks[i].second;
 66 |                 tks[t].second = tk_dict_.min();   // a merged token gets the dictionary's minimum weight
 67 |                 tks.erase(tks.begin()+i);   // i now indexes the next candidate, so it is not advanced
 68 |                 flags.erase(flags.begin()+i);
 69 |             }
 70 |         }
 71 |     }
 72 | 
 73 |     void rewrite_(std::vector<std::pair<std::string, float> >& tks)const   // Applies rewrite_dict_, drops " " tokens, trims trailing punctuation.
 74 |     {
 75 |         for (uint32_t i=0;i<tks.size();i++)
 76 |         {
 77 |             const char* p = NULL;
 78 |             if (rewrite_dict_.value(tks[i].first, p, false)!=0)
 79 |                 continue;   // no rewrite stored for this token
 80 |             assert(p != NULL);
 81 |             tks[i].first = std::string(p);
 82 |             tks[i].second = token_weight(tks[i].first);   // re-weight the rewritten text
 83 |         }
 84 |         for (uint32_t i=0;i<tks.size();)
 85 |         {
 86 |             if (tks[i].first == " ")
 87 |             {
 88 |                 // After erase() index i already names the next token: re-test it instead of
 89 |                 // trimming a stale (or, for i == 0, out-of-range) element.
 90 |                 tks.erase(tks.begin()+i); continue;
 91 |             }
 92 |             std::string& text = tks[i].first;
 93 |             std::size_t keep = text.length();
 94 |             while(keep>1 && is_punct_(text[keep-1]))keep--;   // strip trailing punctuation, always keep the first byte
 95 |             text.resize(keep); i++;
 96 |         }
 97 |     }
 98 | 
 99 | public:
100 |     HorseTokenize(const std::string& dir)   // Loads both dictionaries from the directory dir.
101 |       :tk_dict_(dir+"/token.dict")   // token -> weight
102 |        ,rewrite_dict_(dir + "/rewrite.dict")   // token -> replacement text
103 |     {
104 |     }
105 | 
106 |     void tokenize(const std::string& line, // Splits line into (token, weight) pairs written to tks.
107 |       std::vector<std::pair<std::string, float> >& tks)const
108 |     {
109 |         tk_dict_.fmm(line, tks);   // dictionary matching; fmm() clears tks first
110 |         //for(uint32_t i=0;i<tks.size();i++)std::cout<<tks[i].first<<"oooooo"<<tks[i].second<<std::endl;
111 |         merge_(tks);   // fuse runs of flagged tokens
112 |         rewrite_(tks);   // dictionary rewrites, blank removal, trailing punctuation
113 |     }
114 | 
115 |     void subtokenize(const std::vector<std::pair<std::string, float> >& tks,   // Breaks tokens into sub-tokens; the result replaces subs.
116 |       std::vector<std::pair<std::string, float> >& subs)const
117 |     {
118 |         tk_dict_.subtokens(tks, subs);   // dictionary-based split; clears subs first
119 |         for (uint32_t i=0;i<subs.size();++i)
120 |         {
121 |             std::vector<std::string> v;
122 |             boost::split(v, subs[i].first, boost::is_any_of("-, /"));   // cut on '-', ',', ' ' and '/'
123 |             std::vector<std::pair<std::string, float> > s;
124 |             for (uint32_t j=0;j<v.size();++j)
125 |                 s.push_back(std::make_pair(v[j], subs[i].second));   // every piece inherits the weight
126 |             if (s.size() < 2)continue;   // nothing was split: keep the entry as is
127 |             subs.insert(subs.begin()+i+1, s.begin(), s.end());
128 |             subs.erase(subs.begin()+i);   // the pieces take the place of the original entry
129 |             i += s.size() -1;   // skip the pieces just inserted
130 |         }
131 |     }
132 | 
133 |     // Sub-tokenizes one token.
134 |     //
135 |     // The token is wrapped in a single-element list (weight 1.0) and passed to
136 |     // the vector overload, which performs the dictionary split followed by the
137 |     // split on '-', ',', ' ' and '/'. Delegating keeps a single copy of that
138 |     // splice loop, so the two entry points cannot drift apart.
139 |     //
140 |     // token: text to break down (taken by value; the signature is unchanged).
141 |     // subs:  output; its previous content is replaced by (sub-token, weight)
142 |     //        pairs.
143 |     //
144 |     void subtokenize(const std::string token,
145 |             std::vector<std::pair<std::string, float> >& subs)const
146 |     {
147 |         const std::vector<std::pair<std::string, float> > tks(
148 |             1, std::make_pair(token, 1.0));
149 |         subtokenize(tks, subs);
150 |     }
151 | 
152 |     float max()const   // Forwards tk_dict_.max(), the maximum recorded for the token dictionary's values.
153 |     {
154 |         return tk_dict_.max();
155 |     }
156 | 
157 |     float token_weight(const std::string& tk) const
158 |     {
159 |         // value() returns -1 for an unknown key; weight stays 0 when the key has no stored value.
160 |         float weight = 0;
161 |         const bool known = tk_dict_.value(tk, weight) >= 0;
162 |         return known ? weight : tk_dict_.min();
163 |     }
164 | };
165 | }
166 | 
167 | #endif
168 | 


--------------------------------------------------------------------------------
/include/segment/segment_dict.h:
--------------------------------------------------------------------------------
  1 | #ifndef NLP_SEGMENT_DICT_H_
  2 | #define NLP_SEGMENT_DICT_H_
  3 | 
  4 | #include <string>
  5 | #include <vector>
  6 | #include <list>
  7 | #include <fstream>
  8 | #include <cstdlib>
  9 | #include <algorithm>
 10 | #include <istream>
 11 | #include <ostream>
 12 | #include <cctype>
 13 | #include <math.h>
 14 | 
 15 | #include "kstring.hpp"
 16 | #include "normalize.h"
 17 | #include "darts.h"
 18 | #include "line_reader.h"
 19 | #include "util/utf8.h"
 20 | 
 21 | namespace knlp
 22 | {
 23 | 
 24 | template<class T>class KDictionary;
 25 | template<class T> void store_values_(const std::vector<std::string>& v, KDictionary<T>& dict);
 26 | template<class T> void save_values_(const std::string& nm, KDictionary<T>& dict);
 27 | template<class T> void load_values_(const std::string& nm, KDictionary<T>& dict);
 28 | 
 29 | template<class T = int32_t>
 30 | class KDictionary
 31 | {
 32 |     Darts::DoubleArray trie_;
 33 |     std::vector<T> values_;
 34 |     T min_, max_;
 35 | 
 36 |     friend void store_values_<T>(const std::vector<std::string>& v, KDictionary<T>& dict);
 37 |     friend void save_values_<T>(const std::string& nm, KDictionary<T>& dict);
 38 |     friend void load_values_<T>(const std::string& nm, KDictionary<T>& dict);
 39 | 
 40 |     void normalize_(std::string& str)const   // Normalizes str in place via KNormalize::normalize.
 41 |     {
 42 |         KNormalize::normalize(str);
 43 |     }
 44 | 
 45 |     void load_(const std::string& nm)   // Opens the cached trie nm+".bin" (plus nm+".v"), or builds and saves both from the text file nm.
 46 |     {
 47 |         int32_t f = 1;
 48 |         try{
 49 |             if((f = trie_.open((nm+".bin").c_str()))==0)
 50 |                load_values_<T>(nm+".v", *this);
 51 |         }
 52 |         catch(...)
 53 |         {}
 54 | 
 55 |         if (0 == f)   // cached trie opened: nothing to rebuild
 56 |             return;
 57 | 
 58 |         std::set<std::pair<std::string,std::string> > k_v;   // (normalized key, raw value text), kept sorted
 59 |         std::set<std::string> keySet;   // keys already seen; a repeated key is ignored
 60 |         char* li = NULL;
 61 |         util::LineReader lr(nm);
 62 |         while((li = lr.line(li)) != NULL)
 63 |         {
 64 |             char* t = strchr(li, '\t');   // a line is either key<TAB>value or just key
 65 |             std::string k,v;
 66 |             if(t)
 67 |             {
 68 |                 k = std::string(li, t);
 69 |                 v = std::string(t+1);
 70 |             }else
 71 |                 k = std::string(li);
 72 |             normalize_(k);
 73 |             if (k.length() == 0 || keySet.find(k)!=keySet.end())
 74 |                 continue;
 75 | 
 76 |             k_v.insert(std::make_pair(k, v));
 77 |             keySet.insert(k);
 78 |         }
 79 |         
 80 |         std::vector<const char *> keys(k_v.size());
 81 |         std::vector<std::string> v(k_v.size());
 82 |         std::vector<std::size_t> lengths(k_v.size());
 83 |         std::vector<typename Darts::DoubleArray::value_type> values(k_v.size());
 84 | 
 85 |         uint32_t i = 0, ff = 0;   // ff becomes 1 once any key carries a non-empty value
 86 |         for (std::set<std::pair<std::string,std::string> >::const_iterator it=k_v.begin();it!=k_v.end();++it,i++)
 87 |         {
 88 |             keys[i] = it->first.c_str()
 89 |               ,lengths[i]=it->first.length()
 90 |               ,values[i] = i;   // the trie stores the key's position, which later indexes values_
 91 |             if (it->second.length())
 92 |                 v[i]=it->second,ff=1;
 93 |         }
 94 |         assert(keys.size() == v.size());
 95 |         trie_.build(keys.size(), &keys[0], &lengths[0], &values[0]);   // NOTE(review): &keys[0] assumes at least one key was read
 96 |         trie_.save((nm+".bin").c_str());
 97 | 
 98 |         if (ff == 1)
 99 |         {
100 |             store_values_<T>(v, *this);   // parse the value texts into values_
101 |             save_values_(nm+".v", *this);
102 |         }
103 |     }
104 | 
105 | public:
106 | 
107 |     KDictionary(const std::string& dict_nm) : min_(), max_()   // value-initialised so min()/max() are defined even when no values exist
108 |     {
109 |         load_(dict_nm);   // open the cached trie or build it from the text dictionary
110 |         if (values_.empty())return;   // no values: keep the value-initialised bounds
111 |         min_ = *std::min_element(values_.begin(), values_.end());
112 |         max_ = *std::max_element(values_.begin(), values_.end());
113 |     }
114 | 
115 |     ~KDictionary()   // NOTE(review): empty; buffers that the const char* value loaders allocate with new[] are not released here
116 |     {
117 |     }
118 | 
119 |     /**
120 |      * Exact lookup of key; the key is normalized first unless nor is false.
121 |      *  0:  success, v receives the stored value
122 |      *  1:  success, but no value for this key (v is left untouched).
123 |      *  -1: not found
124 |      * */
125 |     int32_t value(std::string key, T& v, bool nor = true)const
126 |     {
127 |         if (nor)
128 |             normalize_(key);
129 |         Darts::DoubleArray::result_pair_type res;
130 |         trie_.exactMatchSearch(key.c_str(), res, key.length());
131 | 
132 |         if (res.length == 0 && res.value == -1)   // the result pair the code treats as a miss
133 |             return -1;
134 | 
135 |         if ((std::size_t)res.value < values_.size())   // the trie value is an index into values_
136 |         {
137 |             v = values_[res.value];
138 |             return 0;
139 |         }
140 | 
141 |         return 1;
142 |     }
143 |     
144 |     T min()const   // Smallest element of values_, as computed by the constructor when values exist.
145 |     {
146 |         return min_;
147 |     }
148 | 
149 |     T max()const   // Largest element of values_, as computed by the constructor when values exist.
150 |     {
151 |         return max_;
152 |     }
153 | 
154 |     bool has_key(std::string key, bool nor = true)const
155 |     {
156 |         // value() returns -1 only for a missing key; 0 and 1 both mean
157 |         // the key is in the trie (with or without a stored value).
158 |         T ignored;
159 |         const int32_t status = value(key, ignored, nor);
160 |         return status >= 0;
161 |     }
162 | 
163 |     void fmm(std::string line, std::vector<std::pair<std::string, T> >& r, bool nor = true)const   // Greedy left-to-right longest-match segmentation of line into r.
164 |     {
165 |         r.clear();
166 |         if(nor)normalize_(line);
167 |        // KString kstr(line);
168 |         std::vector<uint32_t> lens, cumu_lens;   // byte length of each code point, and the running total
169 |         // check encoding
170 |         std::string::iterator str_end = utf8::find_invalid(line.begin(), line.end());
171 |         // get bytes numbers
172 |         line = std::string(line.begin(), str_end);   // keep only the prefix before the first invalid sequence
173 |         std::string::iterator it = line.begin();
174 |         while(it != line.end())
175 |         {
176 |             //std::string str = kstr.substr(i,1).get_bytes("utf-8");
177 |             uint32_t code = utf8::next(it, line.end());
178 |             std::string str;
179 |             utf8::append(code, std::back_inserter(str));   // re-encode to learn the byte length
180 |             lens.push_back(str.length());
181 |             if (cumu_lens.size() > 0)
182 |                 cumu_lens.push_back(str.length()+cumu_lens.back());
183 |             else cumu_lens.push_back(str.length());
184 |         }
185 | 
186 |         std::size_t key_pos=0;   // byte offset in line where the next match starts
187 |         for (std::size_t j = 0; j <lens.size(); ++j) 
188 |         {
189 |             std::size_t last_j = j, jj=j;   // jj: first code point of this token; last_j: last code point of the best match
190 |             Darts::DoubleArray::value_type last_state = -1;
191 |             typename Darts::DoubleArray::value_type state;
192 |             std::size_t node_pos = 0;
193 |             while(j<lens.size() 
194 |               && (state=trie_.traverse(line.c_str(), node_pos, key_pos, cumu_lens[j]))!=-2
195 |               )
196 |             {
197 |                 //std::cout<<j<<":"<<lens[j]<<":"<<state<<":"<<line.substr(key_pos-lens[j],lens[j])<<"<<<<<<\n";;
198 |                 j++;
199 |                 if(state < 0)continue;   // still on a trie path, but no value ends here
200 |                 last_state = state;
201 |                 last_j = j - 1;
202 |             }
203 | 
204 |             if (last_state >=0)
205 |             {
206 |                 T v = T();if ((uint32_t)last_state < values_.size())v = values_[last_state];
207 |                 r.push_back(std::make_pair(std::string(line.c_str()+cumu_lens[jj]-lens[jj], line.c_str()+cumu_lens[last_j]), v));
208 |             }
209 |             else   // nothing matched: emit the single code point with the minimum value
210 |                 r.push_back(std::make_pair(std::string(line.c_str()+cumu_lens[jj]-lens[jj], line.c_str()+cumu_lens[jj]), min()));
211 |             j = last_j;   // resume right after the emitted token
212 |             key_pos = cumu_lens[j];
213 |         }
214 |     }
215 | 
216 |     void subtokens(const std::vector<std::pair<std::string, T> >& tks,   // Re-segments each token of more than 2 characters into shorter matches; output replaces subs.
217 |                     std::vector<std::pair<std::string, T> >& subs)const
218 |     {
219 |         subs.clear();
220 |         for (uint32_t i=0; i<tks.size(); ++i)
221 |         {
222 |             KString kstr(tks[i].first);
223 |             if (kstr.length()<=2)   // too short to split: copied through unchanged
224 |             {
225 |                 subs.push_back(tks[i]);
226 |                 continue;
227 |             }
228 | 
229 |             std::string line = kstr.substr(0, kstr.length()-1).get_bytes("utf-8");   // first pass omits the last character, so the first piece cannot be the whole token
230 |             std::vector<std::pair<std::string, T> > ss;
231 |             do{
232 |                 std::vector<std::pair<std::string, T> > s;
233 |                 fmm(line, s, false);
234 |                 if (ss.size() == 0)line = tks[i].first;   // after the first pass, continue on the full token
235 |                 ss.push_back(s[0]);   // NOTE(review): assumes fmm produced at least one piece
236 |                 line = line.substr(s[0].first.length());   // drop the piece just taken
237 |             }while(line.length());
238 | 
239 |             if (ss.size() >= kstr.length())   // as many pieces as characters: keep the original token instead
240 |                  subs.push_back(tks[i]);
241 |             else
242 |                  subs.insert(subs.end(), ss.begin(), ss.end());
243 |         }
244 |     }
245 | };
246 | 
247 | template<class T>   // Generic case: fills dict.values_ by parsing each text in v with atoi().
248 |   inline void store_values_(const std::vector<std::string>& v, KDictionary<T>& dict)
249 |   {
250 |       dict.values_.resize(v.size(), 0);
251 |       for (uint32_t i=0;i<v.size();++i)
252 |           dict.values_[i] = atoi(v[i].c_str());
253 |   }
254 | 
255 | template<class T>   // Writes dict.values_ to the file nm: a uint32_t count, then the raw array of T.
256 |   inline void save_values_(const std::string& nm, KDictionary<T>& dict)
257 |   {
258 |       FILE* f = fopen(nm.c_str(), "w+");
259 |       if (!f)
260 |           return;
261 |       const uint32_t s = dict.values_.size();
262 |       // The writes live outside assert(): under NDEBUG an assert() argument is never evaluated.
263 |       const bool ok = fwrite(&s, sizeof(s), 1, f) == 1 && (s == 0 || fwrite(&dict.values_[0], sizeof(T), s, f) == s);
264 |       fclose(f); assert(ok); (void)ok;   // close before asserting so the handle is released first
265 |   }
266 | 
267 | template<class T>   // Reads a uint32_t count followed by that many raw T from nm; dict.values_ ends up empty on a short read.
268 |   inline void load_values_(const std::string& nm, KDictionary<T>& dict)
269 |   {
270 |       FILE* f = fopen(nm.c_str(), "r");
271 |       if (!f)
272 |           return;
273 |       uint32_t s = 0;
274 |       // The reads live outside assert(): under NDEBUG an assert() argument is never evaluated.
275 |       bool ok = fread(&s, sizeof(s), 1, f) == 1; dict.values_.resize(ok ? s : 0);
276 |       if (ok && s != 0) ok = fread(&dict.values_[0], sizeof(T), s, f) == s;   // skip the array read for an empty table
277 |       fclose(f); if (!ok) dict.values_.clear(); assert(ok); (void)ok;
278 |   }
279 | 
280 | template<>   // String case: each value becomes a freshly allocated, NUL-terminated copy of its text.
281 |   inline void store_values_<const char*>(const std::vector<std::string>& v, KDictionary<const char*>& dict)
282 |   {
283 |       dict.values_.resize(v.size(), NULL);
284 |       for (uint32_t i=0;i<v.size();++i)
285 |       {
286 |           char* b = new char[v[i].length()+1];   // +1 for the terminating NUL
287 |           memset(b, 0, v[i].length()+1);
288 |           strcpy(b, v[i].c_str());
289 |           dict.values_[i] = b;
290 |       }
291 |   }
292 | 
293 | template<>   // Float case: parses each text in v with atof().
294 |   inline void store_values_<float>(const std::vector<std::string>& v, KDictionary<float>& dict)
295 |   {
296 |       dict.values_.resize(v.size(), 0);
297 |       for (uint32_t i=0;i<v.size();++i)
298 |           dict.values_[i] = atof(v[i].c_str());
299 |   }
300 | 
301 | template<>   // Unsigned case: parses each text in v as a base-10 unsigned integer.
302 |   inline void store_values_<uint32_t>(const std::vector<std::string>& v, KDictionary<uint32_t>& dict)
303 |   {
304 |       dict.values_.resize(v.size(), 0);
305 |       for (uint32_t i=0;i<v.size();++i)
306 |           dict.values_[i] = static_cast<uint32_t>(strtoul(v[i].c_str(), NULL, 10));   // atoi() cannot represent values above INT_MAX
307 |   }
308 | 
309 | template<>   // String case: writes a uint32_t count, then for each value a uint32_t length followed by its bytes.
310 |   inline void save_values_<const char*>(const std::string& nm, KDictionary<const char*>& dict)
311 |   {
312 |       FILE* f = fopen(nm.c_str(), "w+");
313 |       if (!f)
314 |           return;
315 |       uint32_t s = dict.values_.size();
316 |       fwrite(&s, 1, sizeof(s), f);   // the fwrite results in this function are not checked
317 |       for (uint32_t i=0;i<dict.values_.size();++i)
318 |       {
319 |           s = strlen(dict.values_[i]);   // s is reused as the per-string length
320 |           fwrite(&s, 1, sizeof(s), f);
321 |           fwrite(dict.values_[i], 1, s, f);   // the terminating NUL is not written
322 |       }
323 |       fclose(f);
324 |   }
325 | 
326 | template<>   // String case: reads a uint32_t count, then for each value a uint32_t length followed by its bytes.
327 |   inline void load_values_<const char*>(const std::string& nm, KDictionary<const char*>& dict)
328 |   {
329 |       FILE* f = fopen(nm.c_str(), "r");
330 |       if (!f)
331 |           return;
332 |       uint32_t s = 0;
333 |       bool ok = fread(&s, sizeof(s), 1, f) == 1;   // reads stay outside assert(): NDEBUG never evaluates its argument
334 |       dict.values_.resize(ok ? s : 0);
335 |       for (uint32_t i=0;i<dict.values_.size();++i)
336 |       {
337 |           if (!ok || fread(&s, sizeof(s), 1, f) != 1) ok = false, s = 0;   // short file: fall back to ""
338 |           char* b = new char[s+1];memset(b, 0, s+1);
339 |           if (s != 0 && fread(b, 1, s, f) != s) ok = false;   // a zero-length value has no payload to read
340 |           dict.values_[i] = b;   // every slot gets a valid NUL-terminated string
341 |       }
342 |       fclose(f); assert(ok); (void)ok;
343 |   }
344 | 
345 | 
346 | }//namespace
347 | 
348 | #endif
349 | 
350 | 


--------------------------------------------------------------------------------
/include/segmentWrapper.h:
--------------------------------------------------------------------------------
  1 | /*************************************************************************
  2 |  @ File Name: segmentWrapper.h
  3 |  @ Method: 
  4 |  @ Author: Jerry Shi
  5 |  @ Mail: jerryshi0110@gmail.com
  6 |  @ Created Time: Mon 16 May 2016 11:16:23 AM CST
  7 |  ************************************************************************/
  8 | #ifndef SEGMENTWRAPPER_H
  9 | #define SEGMENTWRAPPER_H
 10 | 
 11 | #include <iostream>
 12 | #include <memory>
 13 | #include "segment/segment.h"
 14 | //#include "PropSharedLock.h"
 15 | 
 16 | #include <boost/unordered_set.hpp>
 17 | #include <boost/unordered_map.hpp>
 18 | #include <boost/algorithm/string.hpp>
 19 | #include <boost/algorithm/string/split.hpp>
 20 | #include <boost/lexical_cast.hpp>
 21 | 
 22 | // Comparators that order std::pair values by their second member.
 23 | // NOTE: Typename R must be comparable with operator< and operator>.
 24 | template<typename T, typename R>
 25 | class SORT
 26 | {
 27 |     public:
 28 |         typedef std::pair<T, R> VectorPair;
 29 |         
 30 |         static bool sortDescendBySecond(const VectorPair& lhs, const VectorPair& rhs){   // true when lhs has the larger second
 31 |             return lhs.second > rhs.second;
 32 |         }
 33 |         
 34 |         static bool sortAscendBySecond(const VectorPair& lhs, const VectorPair& rhs){   // true when lhs has the smaller second
 35 |             return lhs.second < rhs.second;
 36 |         }
 37 | };
 38 | 
 39 | class SegmentWrapper
 40 | {
 41 |     private:
 42 |         std::string dictDir_;
 43 |         boost::unordered_set<std::string> stopWords_;
 44 |         std::auto_ptr<knlp::HorseTokenize> tok_;
 45 | 
 46 |     public:
 47 |         SegmentWrapper(const std::string& dictDir)   // Builds the tokenizer from dictDir and loads the stop-word list.
 48 |           :dictDir_(dictDir){
 49 |               tok_.reset(new knlp::HorseTokenize(dictDir));
 50 |               loadSpecialWords((dictDir_+"/stop_words.utf8"));   // the bool result is not checked here
 51 |           }
 52 | 
 53 |         ~SegmentWrapper(){   // nothing to do explicitly; members clean up through their own destructors
 54 |         }
 55 | 
 56 |         bool isDigital(uint16_t uchar){
 57 |             // True only for the ASCII digits '0'..'9'; every other
 58 |             // 16-bit value is rejected.
 59 |             const uint16_t lowest = '0';
 60 |             const uint16_t highest = '9';
 61 |             return uchar >= lowest && uchar <= highest;
 62 |         }
 63 | 
 64 |         bool isAlpha(uint16_t uchar){
 65 |             // Accepts exactly the 52 ASCII letters; any other
 66 |             // 16-bit value is rejected.
 67 |             const bool lower = uchar >= 'a' && uchar <= 'z';
 68 |             const bool upper = uchar >= 'A' && uchar <= 'Z';
 69 |             if(lower)
 70 |                 return true;
 71 |             return upper;
 72 |         }
 73 | 
 74 |         // Checks whether a term is alphanumeric once every dot has been removed.
 75 |         bool isAlphaNumberic(const std::string& term){
 76 |             std::string nstr = boost::replace_all_copy(term, ".", "");
 77 |             std::string::size_type i;
 78 |             for(i = 0; i < nstr.size(); ++i){
 79 |                 if(isDigital(nstr[i]) || isAlpha(nstr[i]))
 80 |                     continue;
 81 |                 return false;   // first byte that is neither an ASCII digit nor an ASCII letter
 82 |             }
 83 |             return true;   // also reached for an empty term or one made only of dots
 84 |         }
 85 | 
 86 |         // Loads one stop word per line from filename into stopWords_.
 87 |         // Returns false when the name is empty or the file could not be opened.
 88 |         bool loadSpecialWords(const std::string& filename){
 89 |             if(filename.empty())
 90 |                 return false;
 91 |             std::ifstream ifs(filename.c_str());
 92 |             const bool opened = ifs.is_open();   // remembered so the failure can be reported
 93 |             std::string line;
 94 |             while(getline(ifs, line)){
 95 |                 boost::algorithm::trim(line);   // trim first so whitespace-only lines are skipped too
 96 |                 if(!line.empty()) stopWords_.insert(line);
 97 |             }
 98 |             assert(stopWords_.size());   // debug builds still insist on a non-empty stop-word set
 99 |             return opened;
100 |         }
101 |         
102 |         // Clean rules: discard a term that is shorter than 4 bytes (empty included),
103 |         // longer than 31 bytes, or alphanumeric according to isAlphaNumberic().
104 |         bool isNeedClean_(const std::string& term){
105 |             const std::string::size_type len = term.length();
106 |             const bool badLength = len < 4 || len > 31;
107 |             // Short-circuit evaluation keeps isAlphaNumberic() from running
108 |             // on a term that is already rejected for its length, matching
109 |             // the order of the individual checks.
110 |             return badLength || isAlphaNumberic(term);
111 |         }
112 | 
113 |         bool isNeedClean(const std::string& token){
114 |             // Stop words are rejected first; only other tokens reach the clean rules.
115 |             const bool isStopWord = stopWords_.find(token) != stopWords_.end();
116 |             return isStopWord || isNeedClean_(token);
117 |         }
118 |         
119 |         // Tokenizes line into token. dedup: when true, duplicate tokens are removed after segmentation.
120 |         void segment(const std::string& line, std::vector<std::string>& token, bool dedup=true){
121 |             token.clear();
122 |             std::vector<std::pair<std::string, float> > tmp;
123 |             try{
124 |                 tok_->tokenize(line, tmp);
125 |             }catch(...){
126 |                 tmp.clear();   // any tokenizer failure yields an empty result
127 |             }
128 |             token.resize(tmp.size());
129 |             for(uint32_t i = 0; i < tmp.size(); ++i)
130 |                 token[i] = tmp[i].first;   // keep the text, drop the weight
131 |             if(dedup){
132 |                 std::set<std::string> set_(token.begin(), token.end());   // std::set also puts the tokens in sorted order
133 |                 token.clear();
134 |                 std::copy(set_.begin(), set_.end(), std::back_inserter(token));
135 |             }
136 |         }
137 | 
138 |         // Segments title (without dedup) and returns the cleaned tokens extended with bigrams.
139 |         std::vector<std::string> segment(const std::string& title){
140 |             std::vector<std::string> tokens;
141 |             tokens.clear();
142 |             if(title.empty())
143 |                 return tokens;   // empty title: empty result
144 |             // Lock
145 |            // bool isLock;
146 |            // ScopedWriteBoolLock lock(mutex_, isLock);
147 | 
148 |            // std::vector<std::string> tokens;
149 |             subSegment(title, tokens, false);   // dedup disabled, so the token order is kept
150 |             bigramModel(tokens);
151 |             return tokens;
152 |            /* boost::unordered_map<int, std::vector<std::string> >::iterator it;
153 |             if(it != itemID_tokens_.end())
154 |                 it->second = tokens;
155 |             else{
156 |                 itemID_tokens_[itemID] = tokens;
157 |                 itemIDs_.push_back(itemID);
158 |             }*/
159 |         }
160 | 
161 |         // Given itemid ,return title token set
162 |        /* void getTokens(int itemID, std::vector<std::string>& token){
163 |             bool isLock;
164 |             ScopedReadBoolLock lock(mutex_, isLock);
165 |             token.clear();
166 |             boost::unordered_map<int, std::vector<std::string> >::iterator it;
167 |             it = itemID_tokens_.find(itemID);
168 |             if(it != itemID_tokens_.end())
169 |                 token = it->second;
170 |         }
171 |         
172 |         std::vector<int> getItemIDs(){
173 |             bool isLock;
174 |             ScopedReadBoolLock lock(mutex_, isLock);
175 |             return itemIDs_;
176 |         }*/
177 |         // Sub-tokenizes an already segmented token list; the result replaces subtoken.
178 |         void subSegment(const std::vector<std::string>& token, std::vector<std::string>& subtoken){
179 |             subtoken.clear();
180 |             if(token.empty())
181 |                 return;
182 |             std::vector<std::pair<std::string, float> > tmp, subtmp;
183 |             tmp.resize(token.size());
184 |             for(uint32_t i = 0; i < token.size(); ++i)
185 |                 tmp[i] = std::make_pair(token[i], 1.0);   // the tokenizer expects (text, weight) pairs; 1.0 is used for every token
186 |             tok_->subtokenize(tmp,subtmp);
187 |             subtoken.resize(subtmp.size());
188 |             for(uint32_t j = 0; j < subtmp.size(); ++j)
189 |                 subtoken[j] = subtmp[j].first;   // keep the text, drop the weight
190 |         }
191 |         
192 |         // Tokenizes line and sub-tokenizes every token longer than 9 bytes. dedup: when true, duplicate tokens are removed afterwards.
193 |         void subSegment(const std::string& line, std::vector<std::string>& token, bool dedup=true){
194 |             token.clear();
195 |             std::vector<std::pair<std::string, float> > tmp, subtmp, subtoken;
196 |             try{
197 |                 tok_->tokenize(line, tmp);
198 |             }catch(...){
199 |                 tmp.clear();   // any tokenizer failure yields an empty result
200 |             }
201 |             for(uint32_t i = 0; i < tmp.size(); ++i)
202 |             {
203 |                 if(tmp[i].first.length() > 9){
204 |                     tok_->subtokenize(tmp[i].first, subtoken);
205 |                     subtmp.push_back(tmp[i]);   // subtmp is filled here but never read in this function
206 |                     for(uint32_t j = 0; j < subtoken.size(); ++j)
207 |                         token.push_back(subtoken[j].first);
208 |                 }
209 |                 else
210 |                     token.push_back(tmp[i].first);   // short tokens pass through unchanged
211 |             }
212 |             if(dedup){
213 |                 std::set<std::string> set_(token.begin(), token.end());   // std::set also puts the tokens in sorted order
214 |                 token.clear();
215 |                 std::copy(set_.begin(), set_.end(), std::back_inserter(token));
216 |             }
217 |         }
218 | 
219 |         // Extends the token list with bigrams after removing unwanted tokens.
220 |         // before extension: t1, t2, t3
221 |         // after extension:  t1, t1t2, t2, t2t3, t3 (each bigram follows its first term)
222 |         void bigramModel(std::vector<std::string>& token){
223 |             // clean tokens: drop empty ones and everything isNeedClean() rejects
224 |             std::vector<std::string>::iterator it = token.begin();
225 |             while(it != token.end()){
226 |                 if((*it).empty() || isNeedClean(*it))
227 |                     it = token.erase(it);   // erase() hands back the next valid iterator
228 |                 else
229 |                     ++it;
230 |             }
231 |             std::vector<std::string> tmp;
232 |             tmp.swap(token);   // token is now empty and gets rebuilt below
233 |             std::size_t i, j, size = tmp.size();
234 |             std::string bigram;
235 |             for(i = 0; i < size; ++i){
236 |                 token.push_back(tmp[i]);
237 |                 j = i + 1;
238 |                 if(j < size){
239 |                     bigram = tmp[i] + tmp[j];   // plain concatenation, no separator
240 |                     token.push_back(bigram);
241 |                 }
242 |             }
243 |         }
244 | 
245 |         // Find the intersection between token1 and token2
246 |         void intersect(const std::vector<std::string>& token1
247 |                         ,const std::vector<std::string>& token2
248 |                         ,std::vector<std::string>& result){
249 |             result.clear();
250 |             if(token1.empty() || token2.empty())
251 |                 return;
252 |             boost::unordered_map<std::string, int> token;
253 |             for(uint32_t i = 0; i < token1.size(); ++i)
254 |                 token.insert(std::make_pair(token1[i], 1));
255 |             for(uint32_t j = 0; j < token2.size(); ++j)
256 |                 if(token.end() != token.find(token2[j]))
257 |                     result.push_back(token2[j]);
258 |         }
259 | 
260 |         // Compute content similarity between t1 and t2, we choose the simplest way
261 |         // sim = (intersection-size / token1.size()) * (intersection-size / token2.size())
262 |         // we don't consider the semantic similarity, so most of the similarity maybe zero. 
263 |         double computeContentSim2(const std::string& t1, const std::string& t2){
264 |             double sim = 0.0;
265 |             if(t1.empty() || t2.empty())
266 |                 return sim;
267 |             std::vector<std::string> token1, token2,result;
268 |             subSegment(t1, token1, false);
269 |             bigramModel(token1);
270 |             subSegment(t2, token2, false);
271 |             bigramModel(token2);
272 |             intersect(token1, token2, result);
273 |             // To ensure the size of token is big than zero
274 |             if(token1.empty() || token2.empty())
275 |                 return sim;
276 |             sim = ((double)result.size()/token1.size()) * ((double)result.size()/token2.size());
277 |             return sim;
278 |         }
279 |         
280 |         // Through ItemID find it's token set and compute it's similarity.
281 |        /* double computeContentSim(int itemID1, int itemID2){
282 |             
283 |             bool isLock;
284 |             ScopedReadBoolLock lock(mutex_, isLock);
285 | 
286 |             double sim = 0.0;
287 |             std::vector<std::string> token1, token2,result;
288 |             boost::unordered_map<int, std::vector<std::string> >::iterator it;
289 |             it = itemID_tokens_.find(itemID1);
290 |             if(it == itemID_tokens_.end() || it->second.empty())
291 |                 return sim;
292 |             token1 = it->second;
293 |             it = itemID_tokens_.find(itemID2);
294 |             if(it == itemID_tokens_.end() || it->second.empty())
295 |                 return sim;
296 |             token2 = it->second;
297 |             
298 |             intersect(token1, token2, result);
299 |             sim = ((double)result.size()/token1.size()) * ((double)result.size()/token2.size());
300 |             return sim;
301 |         }*/
302 | 
303 |         double computeContentSim(const std::vector<std::string>& token1
304 |                             ,const std::vector<std::string>& token2){
305 |             double sim = 0.0;
306 |             if(token1.empty() || token2.empty())
307 |                 return sim;
308 |             std::vector<std::string> result;
309 |             intersect(token1, token2, result);
310 |             sim = ((double)result.size()/token1.size()) * ((double)result.size()/token2.size());
311 |             return sim;
312 |         }
313 | };
314 | 
315 | 
316 | #endif // segmentWrapper.h
317 | 


--------------------------------------------------------------------------------
/include/suggestion.hpp:
--------------------------------------------------------------------------------
  1 | /*************************************************************************
  2 |  @ File Name: suggestion.hpp
  3 |  @ Method: 
  4 |  @ Author: Jerry Shi
  5 |  @ Mail: jerryshi0110@gmail.com
  6 |  @ Created Time: Tue 26 Jul 2016 02:32:52 PM CST
  7 |  ************************************************************************/
  8 | // Query suggestion module is used to create index for data module
  9 | // and the output interfaces for query suggestion,which contains data update
 10 | // strategy and the final results.
 11 | 
 12 | #ifndef SUGGESTION_ENGINE_HPP
 13 | #define SUGGESTION_ENGINE_HPP
 14 | 
 15 | #include <iostream>
 16 | #include "buildEngine.hpp"
 17 | 
 18 | // static global resource directory in build engine
// NOTE(review): this defines a static data member inside a header, so every
// translation unit that includes suggestion.hpp emits its own definition.
// That only links when the header is included from a single source file - confirm.
std::string BuildEngine::res_dir_ = "";
 20 | // Suggestion class to used for getting the final result and data updating.
 21 | class Suggestion {
 22 |     private:
 23 |         boost::shared_ptr<BuildEngine> pBuild_;  // data building pointer
 24 |     
 25 |         KeyTermIDsType key_termids_; // prefix and corresponding term ids
 26 |         TermInfoType termsInfo_;     // all the term's information(tf, reserve field,stored based on their ids)
 27 |         
 28 |          std::string res_dir_;        // resource directory
 29 | 
 30 |     public:
 31 |         Suggestion(const std::string& res_dir)
 32 |            : res_dir_(res_dir) {
 33 |             if (!boost::filesystem::exists(res_dir)) {
 34 |                 std::cout << "Resource directory " << res_dir << " not exists!\n";
 35 |                 std::cout << "The resource directory may like \"../resource/\"\n";
 36 |                 return;
 37 |             }
 38 |             BuildEngine::res_dir_ = res_dir_; 
 39 |             pBuild_.reset(new BuildEngine());
 40 |         }
 41 |         
 42 |         ~Suggestion() {
 43 |             key_termids_.clear();
 44 |             termsInfo_.clear();
 45 |         }
 46 | 
 47 |         // get final suggestion results
 48 |         // @key: user input string
 49 |         // @JsonRes: suggestion results in the form of json
 50 |         void GetJsonResult(const std::string& key, std::string& JsonRes) {
 51 | 
 52 |             std::vector<std::string> terms, attrs;
 53 |             GetSuggestion(key, terms, attrs);
 54 | 
 55 |             
 56 |             JsonRes = "[";
 57 |             if (terms.empty()) {
 58 |                 JsonRes += "]";
 59 |                 return;
 60 |             }
 61 | 
 62 |             for (uint32_t idx = 0; idx < terms.size(); ++idx) {
 63 |                 if (terms[idx].empty()) 
 64 |                     continue;
 65 |                 JsonRes += "{\"term\":\"" + terms[idx] + "\"";
 66 |                 JsonRes += ",\"total_count\":" + attrs[idx] + "},";
 67 |             }
 68 | 
 69 |             // delete the last ','
 70 |             if (*JsonRes.rbegin() == ',')
 71 |                 JsonRes.erase(JsonRes.size() - 1);
 72 |             JsonRes += "]";
 73 |         }
 74 | 
 75 |         // remove more space only reserve one space
 76 |         // @input: the input string
 77 |         // @return: return the lowercase
 78 |         std::string RemoveSpace(const std::string& input) {
 79 |             uint32_t state = 0, start = 0, end = input.length();
 80 |             // from start to find the first element which is not space or tab
 81 |             for (; start < input.size(); ++start) {
 82 |                 if (input[start] != ' ' && input[start] != '\t') break;
 83 |             }
 84 |             
 85 |             // from end to find the first element which is not space or tab
 86 |             for (; end > 0;--end) {
 87 |                 if (input[end-1] != ' ' && input[end - 1 ] != '\t') break;
 88 |             }
 89 | 
 90 |             // remove extra spaces keep only one in string
 91 |             std::string res("");
 92 |             for (; start < end; ++start) {
 93 |                 if (input[start] == ' ' || input[start] == '\t')
 94 |                     ++state;
 95 |                 else 
 96 |                     state = 0;
 97 | 
 98 |                 if (state == 0) 
 99 |                     res += input[start];
100 |                 else if (state == 1)
101 |                     res += ' ';
102 |             }
103 | 
104 |             boost::to_lower(res);
105 |             return res;
106 |         }
107 |         
108 |         // get query suggestion according input key
109 |         // @terms: candidate terms
110 |         // @attrs: candidate term attributes
111 |         // TODO: 
112 |         // needs to be locked when read the data
113 |         bool GetSuggestion(const std::string& key
114 |                 ,std::vector<std::string>& terms
115 |                 ,std::vector<std::string>& attrs) {
116 |             terms.clear();
117 |             attrs.clear();
118 | 
119 |             std::string nkey = RemoveSpace(key);
120 |             
121 |             // get results if exists
122 |             KeyTermIDsType::iterator keyIter;
123 |             keyIter = key_termids_.find(key);
124 |             if (keyIter == key_termids_.end()) 
125 |                 return false;
126 | 
127 |             std::vector<uint32_t>& termIds = keyIter->second;
128 |             uint32_t size = termIds.size();
129 |             terms.resize(size);
130 |             attrs.resize(size);
131 |             for (uint32_t i = 0; i < size; ++i) {
132 |                 if (termIds[i] > termsInfo_.size())
133 |                     continue;
134 |                 terms[i] = termsInfo_[termIds[i]].first; // term
135 |                 try {  // term result number
136 |                     terms[i] = boost::lexical_cast<std::string>(termsInfo_[termIds[i]].second.second);
137 |                 } catch(...) {
138 |                     terms[i] = "12";
139 |                 }
140 |             }
141 |             if (terms.size() != attrs.size())
142 |                 return false;
143 | 
144 |             return true;
145 |         }
146 | 
147 |         // get data building module
148 |         void GetDataModule(TermInfoType& termsInfo, KeyTermIDsType& key_termids) {
149 | 
150 |             pBuild_->GetDataModule(termsInfo, key_termids);
151 |         }
152 |         
153 |         // Rebuilding the module and update data
154 |         // TODO:
155 |         // may consider data intergrity and validity so will be use write and read lock
156 |         void ModuleUpate() {
157 |         }
158 | 
159 |         // building data module with specified term file
160 |         // @filename: term files the structure like [term '\t' tf '\t' result_num or other attribute(uint32_t)]
161 |         void Build(const std::string& filename) {
162 |             
163 |             pBuild_->Build(filename);
164 |             GetDataModule(termsInfo_, key_termids_);
165 |         }
166 | 
167 | };
168 | 
169 | #endif // suggestion.hpp
170 | 


--------------------------------------------------------------------------------
/include/util/mtrie.h:
--------------------------------------------------------------------------------
  1 | /*************************************************************************
  2 |  @ File Name: mtrie.h
  3 |  @ Method: 
  4 |  @ Author: Jerry Shi
  5 |  @ Mail: jerryshi0110@gmail.com
  6 |  @ Created Time: Wed 16 Mar 2016 11:17:11 AM CST
  7 |  ************************************************************************/
  8 | // This data structure was created based on boost unordered_map, which is 
  9 | // a simpler version of memory trie, the performance may need to be optimized.
 10 | 
 11 | #ifdef MTRIE_H
 12 | #define MTRIE_H
 13 | 
 14 | #ifndef WIN32
 15 | #include <stddef.h>
 16 | #include <sys/types.h>
 17 | #include <stdint.h>
 18 | #else
 19 | #include <sys/types.h>
 20 | #include <wchar.h>
 21 | 
 22 | typedef signed char            int8_t;
 23 | typedef short                  int16_t;
 24 | typedef long                   int16_t;
 25 | typedef __int64                int64_t;
 26 | typedef unsigned char          uint8_t;
 27 | typedef unsigned short         uint16_t;
 28 | typedef unsigned long          uint32_t;
 29 | typedef unsigned __int64       uint64_t;
 30 | 
 31 | #endif // end of WIN32
 32 | 
 33 | #include <iostream>
 34 | #include <string>
 35 | 
 36 | #include <boost/unordered_map.hpp>
 37 | 
 38 | template<typename StringType, typename NodeIDType = uint32_t, class DataType = uint32_t>
 39 | class MTrie
 40 | {
 41 |     private:
 42 |         TrieType trie_;
 43 |         DataStorageType data_;
 44 |         NodeIDType key_;
 45 | 
 46 |         /**
 47 |          * @brief To insert str into mtrie structure
 48 |          */
 49 |         void Add_(const StringType& str, NodeIDType& id)
 50 |         {
 51 |             id = GetRootID();
 52 |             if(str.empty()) return;
 53 |             for(std::size_t i = 0; i < str.size(); ++i)
 54 |             {
 55 |                 bool last = (i == str.size() - 1);
 56 |                 std::pair<NodeIDType, CharType> key_pair(id, str[i]);
 57 |                 typename TrieType::iterator it = trie_.find(key_pair);
 58 |                 if(it != trie_.end())
 59 |                 {
 60 |                     std::pair<NodeIDType, bool>& value_pair = it->second;
 61 |                     id = value_pair.first;
 62 |                     if(last && !value_pair.second)
 63 |                     {
 64 |                         value_pair.second = true;
 65 |                     }
 66 |                 } 
 67 |                 else
 68 |                 {
 69 |                     id = key_;
 70 |                     key_++;
 71 |                     std::pair<NodeIDType, bool> value_pair(id, last);
 72 |                     trie_.insert(std::make_pair(key_pair, value_pair));
 73 |                 }
 74 |             }
 75 |         }
 76 | 
 77 | 
 78 |     public:
 79 |         typedef typename StringType::value_type CharType;
 80 |         typedef boost::unordered_map<std::pair<NodeIDType, CharType>,
 81 |                 std::pair<NodeIDType, bool> > TrieType;
 82 |         typedef boost::unordered_map<NodeIDType, DataType> DataStorageType;
 83 | 
 84 |         MTrie():key_(1)
 85 |         {
 86 |         }
 87 | 
 88 |         /**
 89 |          *  @brief insert string to mtrie structure
 90 |          */
 91 |         void Add(cosnt StringType& str)
 92 |         {
 93 |             NodeIDType id = GetRootID();
 94 |             Add_(str, id);
 95 |         }
 96 | 
 97 |         /**
 98 |          *  @brief add given str into data storage.
 99 |          */
100 |         void Add(const StringType& str, const DataType& data)
101 |         {
102 |            NodeIDType id = GetRootID();
103 |            Add_(str, id);
104 |            data_.insert(std::make_pair(id, data));
105 |         }
106 | 
107 |         /**
108 |          *  @brief get the root id it always zero.
109 |          */
110 |         NodeIDType GetRootID() const
111 |         {
112 |             return 0;
113 |         }
114 | 
115 |         /**
116 |          *  @brief to find the given parameter c and it's parent node id,
117 |          *  if it exists, it will return <true, true>, and sotre the child node
118 |          *  id in parameter childNID.
119 |          */
120 |         std::pair<bool, bool> Find(const CharType& c, const NodeIDType& parentNID,
121 |           NodeIDType& childNID)
122 |         {
123 |             std::pair<bool, bool> result(false, false);
124 |             std::pair<NodeIDType, CharType> key_pair(parentNID, c);
125 |             typename TrieType::iterator it = trie_.find(key_pair);
126 |             std::pair<NodeIDType, bool> value_pair(0, false);
127 |             if(it != trie_.end())
128 |             {
129 |                 value_pair = it->second;
130 |                 result.first = true;
131 |                 result.second = value_pair.second;
132 |                 childNID = value_pair.first;
133 |             }
134 | 
135 |             return result;
136 |         }
137 | 
138 |         /**
139 |          *  @brief get data from the given id, if the id exists, return ture and 
140 |          *  the data will copy to parameter data, else it will return false.
141 |          */
142 |         bool GetData(cosnt NodeIDType& id, DataType& data)
143 |         {
144 |             typename DataStorageType::iterator it = data_.find(id);
145 |             if(it != data_.end())
146 |             {
147 |                 data = it->second;
148 |                 return true;
149 |             }
150 |             return false;
151 |         }
152 | };
153 | 
154 | #endif // mtrie.h
155 | 


--------------------------------------------------------------------------------
/include/util/normalize.h:
--------------------------------------------------------------------------------
  1 | /*************************************************************************
  2 |  @ File Name: normalize.h
  3 |  @ Method: 
  4 |  @ Author: Jerry Shi
  5 |  @ Mail: jerryshi0110@gmail.com
  6 |  @ Created Time: Tue 12 Jul 2016 02:02:37 PM CST
  7 |  ************************************************************************/
  8 | // The normalization module contains several functions that convert a string
  9 | // to a normalized form (UTF-8 encoding, lower case, etc.). The encoding
 10 | // support is based on the open-source library "utf8-cpp".
 11 | 
 12 | #ifndef UTIL_NORMALIZE_H
 13 | #define UTIL_NORMALIZE_H
 14 | 
 15 | #include <iostream>
 16 | #include <string>
 17 | #include <stdint.h>
 18 | #include <ctype.h>
 19 | #include "utf8.h"
 20 | 
 21 | typedef uint16_t UnicodeType;
 22 | 
 23 | class Normalize {
 24 |     public:
 25 |         Normalize(){
 26 |         }
 27 |         
 28 |         // IsDigital
 29 |         static bool IsDigit(char c) {
 30 |             if (c >= '0' && c <= '9')
 31 |                 return true;
 32 |             return false;
 33 |         }
 34 | 
 35 |         // IsAlpha
 36 |         static bool IsAlpha(char c) {
 37 |             if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))
 38 |                 return true;
 39 |             return false;
 40 |         }
 41 | 
 42 |         // IsConnector
 43 |         static bool IsConnector(char c) {
 44 |             if ( c == '.' || c == '-' || c == '+')
 45 |                 return true;
 46 |             return false;
 47 |         }
 48 | 
 49 |         // IsBreakPunctuation
 50 |         static bool IsBreakPunct(char c) {
 51 |             if (!IsPunctuation(c))
 52 |                 return false;
 53 |             if (c == '(' || c == ')' || c == '[' || c == ']' || c == '{' || c == '}')
 54 |                 return true;
 55 | 
 56 |             return false;
 57 |         }
 58 | 
 59 |         // IsPunctuation
 60 |         static bool IsPunctuation(char c) {
 61 |             return ispunct(c);
 62 |         }
 63 | 
 64 |         // Convert a string to utf8 encoding, replace any invalid codes by unicode
 65 |         static bool ToUTF8(std::string& str) {
 66 |             if (str.empty()) 
 67 |                 return false;
 68 |             std::string temp;
 69 |             utf8::replace_invalid(str.begin(), str.end(), std::back_inserter(temp));
 70 |             
 71 |             str = temp;
 72 |             return true;
 73 |         }
 74 | 
 75 |         // TODO:
 76 |         // remove invalid utf8 encoding if it exists.
 77 |         static bool RemoveInvalidUTF8(std::string& str) {
 78 |             std::string::iterator iter = utf8::find_invalid(str.begin(), str.end());
 79 |             if (iter == str.end())
 80 |                 return false;
 81 |             std::string temp(str.begin(), iter);
 82 |             str = temp;
 83 |             return true;
 84 |         }
 85 | 
 86 |         // Convert a string to lower case
 87 |         static void ToLower(std::string& str) {
 88 |             std::string ustr(str);
 89 |             str = "";
 90 |             std::string::size_type idx;
 91 |             for (idx = 0; idx < ustr.length(); ++idx) {
 92 |                 str += ToLower_(ustr[idx]);
 93 |             }
 94 |         }
 95 |         
 96 |         // Convert a string to upper case
 97 |         static void ToUpper(std::string& str) {
 98 |             std::string ustr(str);
 99 |             str = "";
100 |             std::string::size_type idx;
101 |             for (idx = 0; idx < ustr.length(); ++idx) {
102 |                 str += ToUpper_(ustr[idx]);
103 |             }
104 |         }
105 |         
106 |         // Check a string whether it is a valid utf8 encoding string
107 |         static bool IsValidUTF8(const std::string& str) {
108 |             if (str.empty())
109 |                 return false;
110 |             std::string::const_iterator iter = utf8::find_invalid(str.begin(), str.end());
111 |             if (iter != str.end()) {
112 |                 std::cout << "Invalid code found!" << std::endl;
113 |                 std::string temp(str.begin(), iter);
114 |                 std::cout << "This part is fine: " << temp << std::endl;
115 |                 return false;
116 |             }
117 |             return true;
118 |         }
119 | 
120 |         // Check string is utf8 encode
121 |         static bool GetUS2Char(const std::string& str, std::vector<uint32_t>& uChars) {
122 |             uChars.clear();
123 |             std::string ustr(str);
124 |             // Avoid throwing exceptions
125 |             if (!RemoveInvalidUTF8(ustr))
126 |                 return false;
127 |             std::string::iterator iter = ustr.begin();
128 |             while (iter != ustr.end()) {
129 |                 uint32_t code = utf8::next(iter, ustr.end());
130 |                 uChars.push_back(code);
131 |             }
132 |             return true;
133 |         }
134 | 
135 |         template<typename T>
136 |         static bool RemoveElement(std::vector<T>& lVec, uint32_t pos
137 |                         ,std::vector<T>& rVec) {
138 |             return true;
139 |         }
140 | 
141 |         // Determine whether a string is a chinese characters
142 |        static bool IsChinese(const std::string& str) {
143 |             if (!IsValidUTF8(str)) {
144 |                // std::cout << "string is a invalid utf8 encoding!\n";
145 |                 return false;
146 |             }
147 |             std::vector<UnicodeType> unicodes;
148 |             utf8::utf8to16(str.begin(), str.end(), std::back_inserter(unicodes));
149 |             //std::cout << "size: " << unicodes.size() << std::endl;
150 |             for (uint32_t i = 0; i < unicodes.size(); ++i) {
151 |                 if (!IsChineseChar_(unicodes[i])) {
152 |                     return false;
153 |                 }
154 |             }
155 |             return true;
156 |         }   
157 | 
158 |        // Reload
159 |        static bool IsChinese(const UnicodeType& UCS2Char) {
160 |            return IsChineseChar_(UCS2Char);
161 |        }
162 | 
163 |        // convert an utf16 encoding(UnicodeType) to a utf8 string
164 |        static bool UnicodeToUTF8Str(const std::vector<UnicodeType>& unicodes, std::string& utf8str) {
165 |            utf8str = "";
166 |            if (unicodes.empty()) {
167 |                return false;
168 |            }
169 |            
170 |            utf8::utf16to8(unicodes.begin(), unicodes.end(), std::back_inserter(utf8str));
171 |            return true;
172 |        }
173 | 
174 |        // convert an utf16 encoding(UnicodeType) to a utf8 string
175 |        static bool UnicodeToUTF8Str(const UnicodeType& unicode, std::string& utf8str) {
176 |            utf8str = "";
177 |            std::vector<UnicodeType> unicodes(1, unicode);
178 |            if (unicodes.empty()) {
179 |                return false;
180 |            }
181 |            
182 |            utf8::utf16to8(unicodes.begin(), unicodes.end(), std::back_inserter(utf8str));
183 |            return true;
184 |        }
185 | 
186 |        // convert a string to unicode encoding, unicode vector
187 |        static bool ToUnicode(const std::string& str, std::vector<UnicodeType>& unicodes) {
188 |            unicodes.clear();
189 |            std::string ustr(str);
190 |            // Avoid throwing exceptions
191 |            RemoveInvalidUTF8(ustr);
192 |            utf8::utf8to16(ustr.begin(), ustr.end(), std::back_inserter(unicodes));
193 |            
194 |            return true;
195 |        }
196 | 
197 |        //
198 |     private:
199 |         // Determine whether a UnicodeType char is a chinese character
200 |         static bool IsChineseChar_(UnicodeType ucs2char) {
201 |             if(((ucs2char >= 0x2E80 && ucs2char <= 0x2EF3) // CJK Radicals
202 |                   ||(ucs2char >= 0x2F00 && ucs2char <= 0x2FD5) // Kangxi Radicals Range: 0x2F00 - 0X2FDF
203 |                   ||(ucs2char >= 0x3400 && ucs2char <= 0x4DB5) // CJK Unified Ideographs Extension A
204 |                   ||(ucs2char >= 0x4E00 && ucs2char <= 0x9FC3) // CJK Unified Ideographs
205 |                   ||(ucs2char >= 0xF900 && ucs2char <= 0xFAD9))// CJK Compatibility Ideographs  
206 |               && ucs2char != 12289 // Chinese Punctuaion Unicode encoding, 、 
207 |               && ucs2char != 12298 //《
208 |               && ucs2char != 12290 // 。
209 |               && ucs2char != 12299 // 》
210 |               && ucs2char != 65292 // ，
211 |               && ucs2char != 65311 // ？
212 |               && ucs2char != 65281 // ！
213 |               && ucs2char != 65306 // ：
214 |               && ucs2char != 65307 // ；
215 |               && ucs2char != 8220  // “
216 |               && ucs2char != 8221  // ”
217 |               && ucs2char != 12304 // 【
218 |               && ucs2char != 12305 // 】
219 |               && ucs2char != 65509 // ￥
220 |               && ucs2char != 8230  // …
221 |               && ucs2char != 65288 // （
222 |               && ucs2char != 65289 // ）
223 |               && ucs2char != 8212  // —
224 |               && ucs2char != 20022 )// 、
225 |                 return true;
226 | 
227 |             return false;
228 |         }
229 | 
230 |         // Convert a char to lower case
231 |        static inline char ToUpper_(char chConv) {
232 |             return (chConv >= 'a' && chConv <= 'z') ? (chConv & 0xdf) : chConv;
233 |         }
234 |        static inline wchar_t ToUpper_(wchar_t chConv) {
235 |             return (chConv >= L'a' && chConv <= L'z') ? (chConv & 0x00df) : chConv;
236 |         }
237 | 
238 |         // Convert a char to upper case
239 |        static inline char ToLower_(char chConv) {
240 |             return (chConv >= 'A' && chConv <= 'Z') ? (chConv | 0x20) : chConv;
241 |         }
242 |        static inline wchar_t ToLower_(wchar_t chConv) {
243 |             return (chConv >= L'A' && chConv <= 'Z') ? (chConv | 0x0020) : chConv;
244 |         }
245 | };
246 | 
247 | 
248 | #endif // util/normalize.h
249 | 


--------------------------------------------------------------------------------
/include/util/py_types.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  @ File Name: py_types.h
 3 |  @ Method: 
 4 |  @ Author: Jerry Shi
 5 |  @ Mail: jerryshi0110@gmail.com
 6 |  @ Created Time: Wed 16 Mar 2016 10:39:19 AM CST
 7 |  ************************************************************************/
 8 | #ifdef PY_TYPES_H
 9 | #define PY_TYPES_H
10 | 
11 | #include <iostream>
12 | #include <string>
13 | #include <vector>
14 | 
// Character and n-gram types: a 16-bit code unit, and n-grams built from it.
typedef uint16_t UCS2Char;
typedef UCS2Char Unigram;
typedef std::pair<Unigram, Unigram> Bigram;    // (first, second)
typedef std::pair<Bigram, Unigram> Trigram;    // ((first, second), third)
typedef std::vector<Unigram> Ngram;            // arbitrary-length sequence
20 | 
21 | 
// A value paired with its score.
// operator< is deliberately inverted (higher score compares as "less"), so an
// ascending sort places the highest-scoring items first.
template <class T>
struct ScoreItem
{
    T value;
    double score;

    bool operator<(const ScoreItem<T>& other) const
    {
        return score > other.score;
    }
};
33 | 
34 | typedef ScoreItem<std::string> CandidateResult;
35 | 
36 | struct ViterbiItemT
37 | {
38 |     std::string text;
39 |     double score;
40 | 
41 |     const UCS2Char& GetLastChar() const
42 |     {
43 |         return text[text.length() -1];
44 |     }
45 | };
46 | 
47 | #endif // py_types.h
48 | 


--------------------------------------------------------------------------------
/include/util/types.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  @ File Name: types.h
 3 |  @ Method: 
 4 |  @ Author: Jerry Shi
 5 |  @ Mail: jerryshi0110@gmail.com
 6 |  @ Created Time: Thu 10 Mar 2016 05:15:30 PM CST
 7 |  ************************************************************************/
 8 | #ifndef TYPES_H
 9 | #define TYPES_H
10 | 
11 | #include <sys/types.h>
12 | 
13 | #endif
14 | 
15 | 


--------------------------------------------------------------------------------
/include/util/utf8.h:
--------------------------------------------------------------------------------
 1 | // Copyright 2006 Nemanja Trifunovic
 2 | 
 3 | /*
 4 | Permission is hereby granted, free of charge, to any person or organization
 5 | obtaining a copy of the software and accompanying documentation covered by
 6 | this license (the "Software") to use, reproduce, display, distribute,
 7 | execute, and transmit the Software, and to prepare derivative works of the
 8 | Software, and to permit third-parties to whom the Software is furnished to
 9 | do so, all subject to the following:
10 | 
11 | The copyright notices in the Software and this entire statement, including
12 | the above license grant, this restriction and the following disclaimer,
13 | must be included in all copies of the Software, in whole or in part, and
14 | all derivative works of the Software, unless such copies or derivative
15 | works are solely in the form of machine-executable object code generated by
16 | a source language processor.
17 | 
18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 | FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
21 | SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
22 | FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
23 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 | DEALINGS IN THE SOFTWARE.
25 | */
26 | 
27 | 
28 | #ifndef UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731
29 | #define UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731
30 | 
31 | #include "utf8/checked.h"
32 | #include "utf8/unchecked.h"
33 | 
34 | #endif // header guard
35 | 


--------------------------------------------------------------------------------
/include/util/utf8/checked.h:
--------------------------------------------------------------------------------
  1 | // Copyright 2006 Nemanja Trifunovic
  2 | 
  3 | /*
  4 | Permission is hereby granted, free of charge, to any person or organization
  5 | obtaining a copy of the software and accompanying documentation covered by
  6 | this license (the "Software") to use, reproduce, display, distribute,
  7 | execute, and transmit the Software, and to prepare derivative works of the
  8 | Software, and to permit third-parties to whom the Software is furnished to
  9 | do so, all subject to the following:
 10 | 
 11 | The copyright notices in the Software and this entire statement, including
 12 | the above license grant, this restriction and the following disclaimer,
 13 | must be included in all copies of the Software, in whole or in part, and
 14 | all derivative works of the Software, unless such copies or derivative
 15 | works are solely in the form of machine-executable object code generated by
 16 | a source language processor.
 17 | 
 18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 19 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 20 | FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
 21 | SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
 22 | FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
 23 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 24 | DEALINGS IN THE SOFTWARE.
 25 | */
 26 | 
 27 | 
 28 | #ifndef UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
 29 | #define UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
 30 | 
 31 | #include "core.h"
 32 | #include <stdexcept>
 33 | 
 34 | namespace utf8
 35 | {
 36 |     // Base for the exceptions that may be thrown from the library
 37 |     class exception : public ::std::exception {
 38 |     };
 39 | 
 40 |     // Exceptions that may be thrown from the library functions.
 41 |     class invalid_code_point : public exception {
 42 |         uint32_t cp;
 43 |     public:
 44 |         invalid_code_point(uint32_t cp) : cp(cp) {}
 45 |         virtual const char* what() const throw() { return "Invalid code point"; }
 46 |         uint32_t code_point() const {return cp;}
 47 |     };
 48 | 
 49 |     class invalid_utf8 : public exception {
 50 |         uint8_t u8;
 51 |     public:
 52 |         invalid_utf8 (uint8_t u) : u8(u) {}
 53 |         virtual const char* what() const throw() { return "Invalid UTF-8"; }
 54 |         uint8_t utf8_octet() const {return u8;}
 55 |     };
 56 | 
 57 |     class invalid_utf16 : public exception {
 58 |         uint16_t u16;
 59 |     public:
 60 |         invalid_utf16 (uint16_t u) : u16(u) {}
 61 |         virtual const char* what() const throw() { return "Invalid UTF-16"; }
 62 |         uint16_t utf16_word() const {return u16;}
 63 |     };
 64 | 
 65 |     class not_enough_room : public exception {
 66 |     public:
 67 |         virtual const char* what() const throw() { return "Not enough space"; }
 68 |     };
 69 | 
 70 |     /// The library API - functions intended to be called by the users
 71 | 
 72 |     template <typename octet_iterator>
 73 |     octet_iterator append(uint32_t cp, octet_iterator result)
 74 |     {
 75 |         if (!utf8::internal::is_code_point_valid(cp))
 76 |             throw invalid_code_point(cp);
 77 | 
 78 |         if (cp < 0x80)                        // one octet
 79 |             *(result++) = static_cast<uint8_t>(cp);
 80 |         else if (cp < 0x800) {                // two octets
 81 |             *(result++) = static_cast<uint8_t>((cp >> 6)            | 0xc0);
 82 |             *(result++) = static_cast<uint8_t>((cp & 0x3f)          | 0x80);
 83 |         }
 84 |         else if (cp < 0x10000) {              // three octets
 85 |             *(result++) = static_cast<uint8_t>((cp >> 12)           | 0xe0);
 86 |             *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f)   | 0x80);
 87 |             *(result++) = static_cast<uint8_t>((cp & 0x3f)          | 0x80);
 88 |         }
 89 |         else {                                // four octets
 90 |             *(result++) = static_cast<uint8_t>((cp >> 18)           | 0xf0);
 91 |             *(result++) = static_cast<uint8_t>(((cp >> 12) & 0x3f)  | 0x80);
 92 |             *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f)   | 0x80);
 93 |             *(result++) = static_cast<uint8_t>((cp & 0x3f)          | 0x80);
 94 |         }
 95 |         return result;
 96 |     }
 97 | 
 98 |     template <typename octet_iterator, typename output_iterator>
 99 |     output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement)
100 |     {
101 |         while (start != end) {
102 |             octet_iterator sequence_start = start;
103 |             internal::utf_error err_code = utf8::internal::validate_next(start, end);
104 |             switch (err_code) {
105 |                 case internal::UTF8_OK :
106 |                     for (octet_iterator it = sequence_start; it != start; ++it)
107 |                         *out++ = *it;
108 |                     break;
109 |                 case internal::NOT_ENOUGH_ROOM:
110 |                     throw not_enough_room();
111 |                 case internal::INVALID_LEAD:
112 |                     out = utf8::append (replacement, out);
113 |                     ++start;
114 |                     break;
115 |                 case internal::INCOMPLETE_SEQUENCE:
116 |                 case internal::OVERLONG_SEQUENCE:
117 |                 case internal::INVALID_CODE_POINT:
118 |                     out = utf8::append (replacement, out);
119 |                     ++start;
120 |                     // just one replacement mark for the sequence
121 |                     while (start != end && utf8::internal::is_trail(*start))
122 |                         ++start;
123 |                     break;
124 |             }
125 |         }
126 |         return out;
127 |     }
128 | 
129 |     template <typename octet_iterator, typename output_iterator>
130 |     inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out)
131 |     {
132 |         static const uint32_t replacement_marker = utf8::internal::mask16(0xfffd);
133 |         return utf8::replace_invalid(start, end, out, replacement_marker);
134 |     }
135 | 
136 |     template <typename octet_iterator>
137 |     uint32_t next(octet_iterator& it, octet_iterator end)
138 |     {
139 |         uint32_t cp = 0;
140 |         internal::utf_error err_code = utf8::internal::validate_next(it, end, cp);
141 |         switch (err_code) {
142 |             case internal::UTF8_OK :
143 |                 break;
144 |             case internal::NOT_ENOUGH_ROOM :
145 |                 throw not_enough_room();
146 |             case internal::INVALID_LEAD :
147 |             case internal::INCOMPLETE_SEQUENCE :
148 |             case internal::OVERLONG_SEQUENCE :
149 |                 throw invalid_utf8(*it);
150 |             case internal::INVALID_CODE_POINT :
151 |                 throw invalid_code_point(cp);
152 |         }
153 |         return cp;
154 |     }
155 | 
156 |     template <typename octet_iterator>
157 |     uint32_t peek_next(octet_iterator it, octet_iterator end)
158 |     {
159 |         return utf8::next(it, end);
160 |     }
161 | 
162 |     template <typename octet_iterator>
163 |     uint32_t prior(octet_iterator& it, octet_iterator start)
164 |     {
165 |         // can't do much if it == start
166 |         if (it == start)
167 |             throw not_enough_room();
168 | 
169 |         octet_iterator end = it;
170 |         // Go back until we hit either a lead octet or start
171 |         while (utf8::internal::is_trail(*(--it)))
172 |             if (it == start)
173 |                 throw invalid_utf8(*it); // error - no lead byte in the sequence
174 |         return utf8::peek_next(it, end);
175 |     }
176 | 
177 |     /// Deprecated in versions that include "prior"
178 |     template <typename octet_iterator>
179 |     uint32_t previous(octet_iterator& it, octet_iterator pass_start)
180 |     {
181 |         octet_iterator end = it;
182 |         while (utf8::internal::is_trail(*(--it)))
183 |             if (it == pass_start)
184 |                 throw invalid_utf8(*it); // error - no lead byte in the sequence
185 |         octet_iterator temp = it;
186 |         return utf8::next(temp, end);
187 |     }
188 | 
189 |     template <typename octet_iterator, typename distance_type>
190 |     void advance (octet_iterator& it, distance_type n, octet_iterator end)
191 |     {
192 |         for (distance_type i = 0; i < n; ++i)
193 |             utf8::next(it, end);
194 |     }
195 | 
196 |     template <typename octet_iterator>
197 |     typename std::iterator_traits<octet_iterator>::difference_type
198 |     distance (octet_iterator first, octet_iterator last)
199 |     {
200 |         typename std::iterator_traits<octet_iterator>::difference_type dist;
201 |         for (dist = 0; first < last; ++dist)
202 |             utf8::next(first, last);
203 |         return dist;
204 |     }
205 | 
206 |     template <typename u16bit_iterator, typename octet_iterator>
207 |     octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result)
208 |     {
209 |         while (start != end) {
210 |             uint32_t cp = utf8::internal::mask16(*start++);
211 |             // Take care of surrogate pairs first
212 |             if (utf8::internal::is_lead_surrogate(cp)) {
213 |                 if (start != end) {
214 |                     uint32_t trail_surrogate = utf8::internal::mask16(*start++);
215 |                     if (utf8::internal::is_trail_surrogate(trail_surrogate))
216 |                         cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET;
217 |                     else
218 |                         throw invalid_utf16(static_cast<uint16_t>(trail_surrogate));
219 |                 }
220 |                 else
221 |                     throw invalid_utf16(static_cast<uint16_t>(cp));
222 | 
223 |             }
224 |             // Lone trail surrogate
225 |             else if (utf8::internal::is_trail_surrogate(cp))
226 |                 throw invalid_utf16(static_cast<uint16_t>(cp));
227 | 
228 |             result = utf8::append(cp, result);
229 |         }
230 |         return result;
231 |     }
232 | 
233 |     template <typename u16bit_iterator, typename octet_iterator>
234 |     u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result)
235 |     {
236 |         while (start < end) {
237 |             uint32_t cp = utf8::next(start, end);
238 |             if (cp > 0xffff) { //make a surrogate pair
239 |                 *result++ = static_cast<uint16_t>((cp >> 10)   + internal::LEAD_OFFSET);
240 |                 *result++ = static_cast<uint16_t>((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN);
241 |             }
242 |             else
243 |                 *result++ = static_cast<uint16_t>(cp);
244 |         }
245 |         return result;
246 |     }
247 | 
248 |     template <typename octet_iterator, typename u32bit_iterator>
249 |     octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result)
250 |     {
251 |         while (start != end)
252 |             result = utf8::append(*(start++), result);
253 | 
254 |         return result;
255 |     }
256 | 
257 |     template <typename octet_iterator, typename u32bit_iterator>
258 |     u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result)
259 |     {
260 |         while (start < end)
261 |             (*result++) = utf8::next(start, end);
262 | 
263 |         return result;
264 |     }
265 | 
266 |     // The iterator class
267 |     template <typename octet_iterator>
268 |     class iterator : public std::iterator <std::bidirectional_iterator_tag, uint32_t> {
269 |       octet_iterator it;
270 |       octet_iterator range_start;
271 |       octet_iterator range_end;
272 |       public:
273 |       iterator () {}
274 |       explicit iterator (const octet_iterator& octet_it,
275 |                          const octet_iterator& range_start,
276 |                          const octet_iterator& range_end) :
277 |                it(octet_it), range_start(range_start), range_end(range_end)
278 |       {
279 |           if (it < range_start || it > range_end)
280 |               throw std::out_of_range("Invalid utf-8 iterator position");
281 |       }
282 |       // the default "big three" are OK
283 |       octet_iterator base () const { return it; }
284 |       uint32_t operator * () const
285 |       {
286 |           octet_iterator temp = it;
287 |           return utf8::next(temp, range_end);
288 |       }
289 |       bool operator == (const iterator& rhs) const
290 |       {
291 |           if (range_start != rhs.range_start || range_end != rhs.range_end)
292 |               throw std::logic_error("Comparing utf-8 iterators defined with different ranges");
293 |           return (it == rhs.it);
294 |       }
295 |       bool operator != (const iterator& rhs) const
296 |       {
297 |           return !(operator == (rhs));
298 |       }
299 |       iterator& operator ++ ()
300 |       {
301 |           utf8::next(it, range_end);
302 |           return *this;
303 |       }
304 |       iterator operator ++ (int)
305 |       {
306 |           iterator temp = *this;
307 |           utf8::next(it, range_end);
308 |           return temp;
309 |       }
310 |       iterator& operator -- ()
311 |       {
312 |           utf8::prior(it, range_start);
313 |           return *this;
314 |       }
315 |       iterator operator -- (int)
316 |       {
317 |           iterator temp = *this;
318 |           utf8::prior(it, range_start);
319 |           return temp;
320 |       }
321 |     }; // class iterator
322 | 
323 | } // namespace utf8
324 | 
325 | #endif //header guard
326 | 
327 | 
328 | 


--------------------------------------------------------------------------------
/include/util/utf8/core.h:
--------------------------------------------------------------------------------
  1 | // Copyright 2006 Nemanja Trifunovic
  2 | 
  3 | /*
  4 | Permission is hereby granted, free of charge, to any person or organization
  5 | obtaining a copy of the software and accompanying documentation covered by
  6 | this license (the "Software") to use, reproduce, display, distribute,
  7 | execute, and transmit the Software, and to prepare derivative works of the
  8 | Software, and to permit third-parties to whom the Software is furnished to
  9 | do so, all subject to the following:
 10 | 
 11 | The copyright notices in the Software and this entire statement, including
 12 | the above license grant, this restriction and the following disclaimer,
 13 | must be included in all copies of the Software, in whole or in part, and
 14 | all derivative works of the Software, unless such copies or derivative
 15 | works are solely in the form of machine-executable object code generated by
 16 | a source language processor.
 17 | 
 18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 19 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 20 | FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
 21 | SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
 22 | FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
 23 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 24 | DEALINGS IN THE SOFTWARE.
 25 | */
 26 | 
 27 | 
 28 | #ifndef UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
 29 | #define UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
 30 | 
 31 | #include <iterator>
 32 | 
 33 | namespace utf8
 34 | {
    // The typedefs for 8-bit, 16-bit and 32-bit unsigned integers
    // You may need to change them to match your system.
    // These typedefs have the same names as ones from cstdint, or boost/cstdint
    // NOTE(review): the widths are not enforced here -- this presumes that
    // unsigned short is 16 bits and unsigned int is 32 bits on the target
    // platform; confirm when porting.
    typedef unsigned char   uint8_t;   // used for UTF-8 octets
    typedef unsigned short  uint16_t;  // used for UTF-16 code units
    typedef unsigned int    uint32_t;  // used for code points
 41 | 
 42 | // Helper code - not intended to be directly called by the library users. May be changed at any time
 43 | namespace internal
 44 | {
    // Unicode constants
    // Leading (high) surrogates: 0xd800 - 0xdbff
    // Trailing (low) surrogates: 0xdc00 - 0xdfff
    const uint16_t LEAD_SURROGATE_MIN  = 0xd800u;
    const uint16_t LEAD_SURROGATE_MAX  = 0xdbffu;
    const uint16_t TRAIL_SURROGATE_MIN = 0xdc00u;
    const uint16_t TRAIL_SURROGATE_MAX = 0xdfffu;
    // Added to (code_point >> 10) to obtain the lead surrogate when a code
    // point above 0xffff is split into a surrogate pair (see utf8to16).
    const uint16_t LEAD_OFFSET         = LEAD_SURROGATE_MIN - (0x10000 >> 10);
    // Added to (lead << 10) + trail to recombine a surrogate pair into a
    // single code point (see utf16to8).
    const uint32_t SURROGATE_OFFSET    = 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN;

    // Maximum valid value for a Unicode code point
    const uint32_t CODE_POINT_MAX      = 0x0010ffffu;
 57 | 
 58 |     template<typename octet_type>
 59 |     inline uint8_t mask8(octet_type oc)
 60 |     {
 61 |         return static_cast<uint8_t>(0xff & oc);
 62 |     }
 63 |     template<typename u16_type>
 64 |     inline uint16_t mask16(u16_type oc)
 65 |     {
 66 |         return static_cast<uint16_t>(0xffff & oc);
 67 |     }
 68 |     template<typename octet_type>
 69 |     inline bool is_trail(octet_type oc)
 70 |     {
 71 |         return ((utf8::internal::mask8(oc) >> 6) == 0x2);
 72 |     }
 73 | 
 74 |     template <typename u16>
 75 |     inline bool is_lead_surrogate(u16 cp)
 76 |     {
 77 |         return (cp >= LEAD_SURROGATE_MIN && cp <= LEAD_SURROGATE_MAX);
 78 |     }
 79 | 
 80 |     template <typename u16>
 81 |     inline bool is_trail_surrogate(u16 cp)
 82 |     {
 83 |         return (cp >= TRAIL_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX);
 84 |     }
 85 | 
 86 |     template <typename u16>
 87 |     inline bool is_surrogate(u16 cp)
 88 |     {
 89 |         return (cp >= LEAD_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX);
 90 |     }
 91 | 
 92 |     template <typename u32>
 93 |     inline bool is_code_point_valid(u32 cp)
 94 |     {
 95 |         return (cp <= CODE_POINT_MAX && !utf8::internal::is_surrogate(cp));
 96 |     }
 97 | 
 98 |     template <typename octet_iterator>
 99 |     inline typename std::iterator_traits<octet_iterator>::difference_type
100 |     sequence_length(octet_iterator lead_it)
101 |     {
102 |         uint8_t lead = utf8::internal::mask8(*lead_it);
103 |         if (lead < 0x80)
104 |             return 1;
105 |         else if ((lead >> 5) == 0x6)
106 |             return 2;
107 |         else if ((lead >> 4) == 0xe)
108 |             return 3;
109 |         else if ((lead >> 3) == 0x1e)
110 |             return 4;
111 |         else
112 |             return 0;
113 |     }
114 | 
115 |     template <typename octet_difference_type>
116 |     inline bool is_overlong_sequence(uint32_t cp, octet_difference_type length)
117 |     {
118 |         if (cp < 0x80) {
119 |             if (length != 1) 
120 |                 return true;
121 |         }
122 |         else if (cp < 0x800) {
123 |             if (length != 2) 
124 |                 return true;
125 |         }
126 |         else if (cp < 0x10000) {
127 |             if (length != 3) 
128 |                 return true;
129 |         }
130 | 
131 |         return false;
132 |     }
133 | 
    // Result codes returned by the decoding helpers and validate_next():
    //   UTF8_OK             - a sequence was decoded and accepted
    //   NOT_ENOUGH_ROOM     - the end of the range was reached before the sequence was complete
    //   INVALID_LEAD        - the first octet cannot start a sequence (sequence_length() gave 0)
    //   INCOMPLETE_SEQUENCE - an octet that should have been a trail octet was not one
    //   OVERLONG_SEQUENCE   - the code point was encoded with more octets than it needs
    //   INVALID_CODE_POINT  - the decoded value failed is_code_point_valid()
    enum utf_error {UTF8_OK, NOT_ENOUGH_ROOM, INVALID_LEAD, INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT};
135 | 
136 |     /// Helper for get_sequence_x
137 |     template <typename octet_iterator>
138 |     utf_error increase_safely(octet_iterator& it, octet_iterator end)
139 |     {
140 |         if (++it == end)
141 |             return NOT_ENOUGH_ROOM;
142 | 
143 |         if (!utf8::internal::is_trail(*it))
144 |             return INCOMPLETE_SEQUENCE;
145 |         
146 |         return UTF8_OK;
147 |     }
148 | 
    // Steps IT forward with increase_safely() and makes the *enclosing function*
    // return the error code if that step did not yield UTF8_OK.
    // Only meant for the get_sequence_x helpers below; it is #undef'ed after them.
    #define UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(IT, END) {utf_error ret = increase_safely(IT, END); if (ret != UTF8_OK) return ret;}    
150 | 
151 |     /// get_sequence_x functions decode utf-8 sequences of the length x
152 |     template <typename octet_iterator>
153 |     utf_error get_sequence_1(octet_iterator& it, octet_iterator end, uint32_t& code_point)
154 |     {
155 |         if (it == end)
156 |             return NOT_ENOUGH_ROOM;
157 | 
158 |         code_point = utf8::internal::mask8(*it);
159 | 
160 |         return UTF8_OK;
161 |     }
162 | 
163 |     template <typename octet_iterator>
164 |     utf_error get_sequence_2(octet_iterator& it, octet_iterator end, uint32_t& code_point)
165 |     {
166 |         if (it == end) 
167 |             return NOT_ENOUGH_ROOM;
168 |         
169 |         code_point = utf8::internal::mask8(*it);
170 | 
171 |         UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
172 | 
173 |         code_point = ((code_point << 6) & 0x7ff) + ((*it) & 0x3f);
174 | 
175 |         return UTF8_OK;
176 |     }
177 | 
178 |     template <typename octet_iterator>
179 |     utf_error get_sequence_3(octet_iterator& it, octet_iterator end, uint32_t& code_point)
180 |     {
181 |         if (it == end)
182 |             return NOT_ENOUGH_ROOM;
183 |             
184 |         code_point = utf8::internal::mask8(*it);
185 | 
186 |         UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
187 | 
188 |         code_point = ((code_point << 12) & 0xffff) + ((utf8::internal::mask8(*it) << 6) & 0xfff);
189 | 
190 |         UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
191 | 
192 |         code_point += (*it) & 0x3f;
193 | 
194 |         return UTF8_OK;
195 |     }
196 | 
197 |     template <typename octet_iterator>
198 |     utf_error get_sequence_4(octet_iterator& it, octet_iterator end, uint32_t& code_point)
199 |     {
200 |         if (it == end)
201 |            return NOT_ENOUGH_ROOM;
202 | 
203 |         code_point = utf8::internal::mask8(*it);
204 | 
205 |         UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
206 | 
207 |         code_point = ((code_point << 18) & 0x1fffff) + ((utf8::internal::mask8(*it) << 12) & 0x3ffff);
208 | 
209 |         UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
210 | 
211 |         code_point += (utf8::internal::mask8(*it) << 6) & 0xfff;
212 | 
213 |         UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
214 | 
215 |         code_point += (*it) & 0x3f;
216 | 
217 |         return UTF8_OK;
218 |     }
219 | 
220 |     #undef UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR
221 | 
222 |     template <typename octet_iterator>
223 |     utf_error validate_next(octet_iterator& it, octet_iterator end, uint32_t& code_point)
224 |     {
225 |         // Save the original value of it so we can go back in case of failure
226 |         // Of course, it does not make much sense with i.e. stream iterators
227 |         octet_iterator original_it = it;
228 | 
229 |         uint32_t cp = 0;
230 |         // Determine the sequence length based on the lead octet
231 |         typedef typename std::iterator_traits<octet_iterator>::difference_type octet_difference_type;
232 |         const octet_difference_type length = utf8::internal::sequence_length(it);
233 | 
234 |         // Get trail octets and calculate the code point
235 |         utf_error err = UTF8_OK;
236 |         switch (length) {
237 |             case 0: 
238 |                 return INVALID_LEAD;
239 |             case 1:
240 |                 err = utf8::internal::get_sequence_1(it, end, cp);
241 |                 break;
242 |             case 2:
243 |                 err = utf8::internal::get_sequence_2(it, end, cp);
244 |             break;
245 |             case 3:
246 |                 err = utf8::internal::get_sequence_3(it, end, cp);
247 |             break;
248 |             case 4:
249 |                 err = utf8::internal::get_sequence_4(it, end, cp);
250 |             break;
251 |         }
252 | 
253 |         if (err == UTF8_OK) {
254 |             // Decoding succeeded. Now, security checks...
255 |             if (utf8::internal::is_code_point_valid(cp)) {
256 |                 if (!utf8::internal::is_overlong_sequence(cp, length)){
257 |                     // Passed! Return here.
258 |                     code_point = cp;
259 |                     ++it;
260 |                     return UTF8_OK;
261 |                 }
262 |                 else
263 |                     err = OVERLONG_SEQUENCE;
264 |             }
265 |             else 
266 |                 err = INVALID_CODE_POINT;
267 |         }
268 | 
269 |         // Failure branch - restore the original value of the iterator
270 |         it = original_it;
271 |         return err;
272 |     }
273 | 
274 |     template <typename octet_iterator>
275 |     inline utf_error validate_next(octet_iterator& it, octet_iterator end) {
276 |         uint32_t ignored;
277 |         return utf8::internal::validate_next(it, end, ignored);
278 |     }
279 | 
280 | } // namespace internal
281 | 
282 |     /// The library API - functions intended to be called by the users
283 | 
    // Byte order mark: the three octets compared against the start of the
    // input by starts_with_bom() and is_bom().
    const uint8_t bom[] = {0xef, 0xbb, 0xbf};
286 | 
287 |     template <typename octet_iterator>
288 |     octet_iterator find_invalid(octet_iterator start, octet_iterator end)
289 |     {
290 |         octet_iterator result = start;
291 |         while (result != end) {
292 |             utf8::internal::utf_error err_code = utf8::internal::validate_next(result, end);
293 |             if (err_code != internal::UTF8_OK)
294 |                 return result;
295 |         }
296 |         return result;
297 |     }
298 | 
299 |     template <typename octet_iterator>
300 |     inline bool is_valid(octet_iterator start, octet_iterator end)
301 |     {
302 |         return (utf8::find_invalid(start, end) == end);
303 |     }
304 | 
305 |     template <typename octet_iterator>
306 |     inline bool starts_with_bom (octet_iterator it, octet_iterator end)
307 |     {
308 |         return (
309 |             ((it != end) && (utf8::internal::mask8(*it++)) == bom[0]) &&
310 |             ((it != end) && (utf8::internal::mask8(*it++)) == bom[1]) &&
311 |             ((it != end) && (utf8::internal::mask8(*it))   == bom[2])
312 |            );
313 |     }
314 | 	
315 |     //Deprecated in release 2.3 
316 |     template <typename octet_iterator>
317 |     inline bool is_bom (octet_iterator it)
318 |     {
319 |         return (
320 |             (utf8::internal::mask8(*it++)) == bom[0] &&
321 |             (utf8::internal::mask8(*it++)) == bom[1] &&
322 |             (utf8::internal::mask8(*it))   == bom[2]
323 |            );
324 |     }
325 | } // namespace utf8
326 | 
327 | #endif // header guard
328 | 
329 | 
330 | 


--------------------------------------------------------------------------------
/include/util/utf8/unchecked.h:
--------------------------------------------------------------------------------
  1 | // Copyright 2006 Nemanja Trifunovic
  2 | 
  3 | /*
  4 | Permission is hereby granted, free of charge, to any person or organization
  5 | obtaining a copy of the software and accompanying documentation covered by
  6 | this license (the "Software") to use, reproduce, display, distribute,
  7 | execute, and transmit the Software, and to prepare derivative works of the
  8 | Software, and to permit third-parties to whom the Software is furnished to
  9 | do so, all subject to the following:
 10 | 
 11 | The copyright notices in the Software and this entire statement, including
 12 | the above license grant, this restriction and the following disclaimer,
 13 | must be included in all copies of the Software, in whole or in part, and
 14 | all derivative works of the Software, unless such copies or derivative
 15 | works are solely in the form of machine-executable object code generated by
 16 | a source language processor.
 17 | 
 18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 19 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 20 | FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
 21 | SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
 22 | FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
 23 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 24 | DEALINGS IN THE SOFTWARE.
 25 | */
 26 | 
 27 | 
 28 | #ifndef UTF8_FOR_CPP_UNCHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
 29 | #define UTF8_FOR_CPP_UNCHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
 30 | 
 31 | #include "core.h"
 32 | 
 33 | namespace utf8
 34 | {
 35 |     namespace unchecked 
 36 |     {
 37 |         template <typename octet_iterator>
 38 |         octet_iterator append(uint32_t cp, octet_iterator result)
 39 |         {
 40 |             if (cp < 0x80)                        // one octet
 41 |                 *(result++) = static_cast<uint8_t>(cp);  
 42 |             else if (cp < 0x800) {                // two octets
 43 |                 *(result++) = static_cast<uint8_t>((cp >> 6)          | 0xc0);
 44 |                 *(result++) = static_cast<uint8_t>((cp & 0x3f)        | 0x80);
 45 |             }
 46 |             else if (cp < 0x10000) {              // three octets
 47 |                 *(result++) = static_cast<uint8_t>((cp >> 12)         | 0xe0);
 48 |                 *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80);
 49 |                 *(result++) = static_cast<uint8_t>((cp & 0x3f)        | 0x80);
 50 |             }
 51 |             else {                                // four octets
 52 |                 *(result++) = static_cast<uint8_t>((cp >> 18)         | 0xf0);
 53 |                 *(result++) = static_cast<uint8_t>(((cp >> 12) & 0x3f)| 0x80);
 54 |                 *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80);
 55 |                 *(result++) = static_cast<uint8_t>((cp & 0x3f)        | 0x80);
 56 |             }
 57 |             return result;
 58 |         }
 59 | 
 60 |         template <typename octet_iterator>
 61 |         uint32_t next(octet_iterator& it)
 62 |         {
 63 |             uint32_t cp = utf8::internal::mask8(*it);
 64 |             typename std::iterator_traits<octet_iterator>::difference_type length = utf8::internal::sequence_length(it);
 65 |             switch (length) {
 66 |                 case 1:
 67 |                     break;
 68 |                 case 2:
 69 |                     it++;
 70 |                     cp = ((cp << 6) & 0x7ff) + ((*it) & 0x3f);
 71 |                     break;
 72 |                 case 3:
 73 |                     ++it; 
 74 |                     cp = ((cp << 12) & 0xffff) + ((utf8::internal::mask8(*it) << 6) & 0xfff);
 75 |                     ++it;
 76 |                     cp += (*it) & 0x3f;
 77 |                     break;
 78 |                 case 4:
 79 |                     ++it;
 80 |                     cp = ((cp << 18) & 0x1fffff) + ((utf8::internal::mask8(*it) << 12) & 0x3ffff);                
 81 |                     ++it;
 82 |                     cp += (utf8::internal::mask8(*it) << 6) & 0xfff;
 83 |                     ++it;
 84 |                     cp += (*it) & 0x3f; 
 85 |                     break;
 86 |             }
 87 |             ++it;
 88 |             return cp;        
 89 |         }
 90 | 
 91 |         template <typename octet_iterator>
 92 |         uint32_t peek_next(octet_iterator it)
 93 |         {
 94 |             return utf8::unchecked::next(it);    
 95 |         }
 96 | 
 97 |         template <typename octet_iterator>
 98 |         uint32_t prior(octet_iterator& it)
 99 |         {
100 |             while (utf8::internal::is_trail(*(--it))) ;
101 |             octet_iterator temp = it;
102 |             return utf8::unchecked::next(temp);
103 |         }
104 | 
105 |         // Deprecated in versions that include prior, but only for the sake of consistency (see utf8::previous)
106 |         template <typename octet_iterator>
107 |         inline uint32_t previous(octet_iterator& it)
108 |         {
109 |             return utf8::unchecked::prior(it);
110 |         }
111 |         // Calls utf8::unchecked::next on `it` n times; does nothing when n is not positive.
112 |         template <typename octet_iterator, typename distance_type>
113 |         void advance (octet_iterator& it, distance_type n)
114 |         {
115 |             for (distance_type remaining = n; remaining > 0; --remaining)
116 |                 utf8::unchecked::next(it);
117 |         }
118 |         // Counts how many utf8::unchecked::next calls it takes for `first` to stop comparing less than `last`.
119 |         template <typename octet_iterator>
120 |         typename std::iterator_traits<octet_iterator>::difference_type
121 |         distance (octet_iterator first, octet_iterator last)
122 |         {
123 |             typename std::iterator_traits<octet_iterator>::difference_type count = 0;
124 |             for (; first < last; utf8::unchecked::next(first))
125 |                 ++count;
126 |             return count;
127 |         }
128 |         // Feeds every value read from [start, end) to utf8::unchecked::append through `result`; returns the advanced output iterator.
129 |         template <typename u16bit_iterator, typename octet_iterator>
130 |         octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result)
131 |         {
132 |             while (start != end) {
133 |                 uint32_t code_point = utf8::internal::mask16(*start++);
134 |                 // a lead surrogate also consumes the following unit (read without comparing against `end`)
135 |                 if (utf8::internal::is_lead_surrogate(code_point)) {
136 |                     const uint32_t trail_unit = utf8::internal::mask16(*start++);
137 |                     code_point = (code_point << 10) + trail_unit + internal::SURROGATE_OFFSET;
138 |                 }
139 |                 result = utf8::unchecked::append(code_point, result);
140 |             }
141 |             return result;
142 |         }
143 |         // Decodes [start, end) with utf8::unchecked::next and writes uint16_t units through `result`; returns the advanced output iterator.
144 |         template <typename u16bit_iterator, typename octet_iterator>
145 |         u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result)
146 |         {
147 |             while (start < end) {
148 |                 const uint32_t code_point = utf8::unchecked::next(start);
149 |                 if (code_point <= 0xffff) // fits in a single 16-bit unit
150 |                     *result++ = static_cast<uint16_t>(code_point);
151 |                 else { // larger values are written as two units (a surrogate pair)
152 |                     *result++ = static_cast<uint16_t>((code_point >> 10)   + internal::LEAD_OFFSET);
153 |                     *result++ = static_cast<uint16_t>((code_point & 0x3ff) + internal::TRAIL_SURROGATE_MIN);
154 |                 }
155 |             }
156 |             return result;
157 |         }
158 |         // Passes every value in [start, end) to utf8::unchecked::append through `result`; returns the advanced output iterator.
159 |         template <typename octet_iterator, typename u32bit_iterator>
160 |         octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result)
161 |         {
162 |             for (; start != end; ++start)
163 |                 result = utf8::unchecked::append(*start, result);
164 | 
165 |             return result;
166 |         }
167 |         // Writes each value decoded by utf8::unchecked::next from [start, end) through `result`; returns the advanced output iterator.
168 |         template <typename octet_iterator, typename u32bit_iterator>
169 |         u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result)
170 |         {
171 |             for (; start < end; ++result)
172 |                 *result = utf8::unchecked::next(start);
173 | 
174 |             return result;
175 |         }
176 | 
177 |         // Bidirectional iterator adaptor: dereferencing decodes a code point from the wrapped octet iterator.
178 |         template <typename octet_iterator>
179 |           class iterator : public std::iterator <std::bidirectional_iterator_tag, uint32_t> {
180 |             octet_iterator it; // current position in the underlying octet sequence
181 |             public:
182 |             iterator () {} // `it` is default-initialized
183 |             explicit iterator (const octet_iterator& octet_it): it(octet_it) {}
184 |             // the compiler-generated copy constructor, copy assignment and destructor are sufficient
185 |             octet_iterator base () const { return it; } // copy of the wrapped octet iterator
186 |             // Decodes through a copy, so dereferencing never moves the iterator.
187 |             uint32_t operator * () const
188 |             {
189 |                 octet_iterator cursor = it;
190 |                 return utf8::unchecked::next(cursor);
191 |             }
192 |             // Two iterators are equal when their wrapped octet iterators compare equal.
193 |             bool operator == (const iterator& rhs) const { return it == rhs.it; }
194 |             bool operator != (const iterator& rhs) const { return !(*this == rhs); }
195 |             // Pre-increment: skip as many octets as sequence_length reports for the current position.
196 |             iterator& operator ++ ()
197 |             {
198 |                 ::std::advance(it, utf8::internal::sequence_length(it));
199 |                 return *this;
200 |             }
201 |             // Post-increment: advance, then hand back the position held before the call.
202 |             iterator operator ++ (int)
203 |             {
204 |                 iterator before = *this;
205 |                 ++(*this);
206 |                 return before;
207 |             }
208 |             // Pre-decrement: step back with utf8::unchecked::prior.
209 |             iterator& operator -- ()
210 |             {
211 |                 utf8::unchecked::prior(it);
212 |                 return *this;
213 |             }
214 |             // Post-decrement: step back, then hand back the position held before the call.
215 |             iterator operator -- (int)
216 |             {
217 |                 iterator before = *this;
218 |                 --(*this);
219 |                 return before;
220 |             }
221 |           }; // class iterator
222 | 
223 |     } // namespace utf8::unchecked
224 | } // namespace utf8 
225 | 
226 | 
227 | #endif // header guard
228 | 
229 | 


--------------------------------------------------------------------------------
/test/Makefile:
--------------------------------------------------------------------------------
 1 | # Makefile for the unit-test programs in this directory; PUB_DIR and SRC_INC are added to the include path by the build rules.
 2 | PUB_DIR:=../include/
 3 | SRC_INC:= ./
 4 | # One variable per test program; each name doubles as the stem of its .cc source file.
 5 | PROG1 = t_normalize_unit
 6 | PROG2 = t_dictionary_unit
 7 | PROG3 = t_segment_unit
 8 | PROG4 = t_build_unit
 9 | PROG5 = t_suggestion_unit
10 | # Warning flags plus the parent directory on the include path.
11 | CFLAGS = -W -Wall -I../ 
12 | # Compiler driver; the default goal below currently builds only $(PROG5) (the full list is kept commented out).
13 | CC=g++
14 | #all: $(PROG1) $(PROG2) $(PROG3) $(PROG4) $(PROG5)
15 | all: $(PROG5)
16 | # Boost libraries passed on the compile-and-link command line of each rule.
17 | LDFLAGS=-lboost_system -lboost_serialization -lboost_filesystem -lboost_unit_test_framework
18 | 
19 | 
20 | # normalize unit test
21 | #$(PROG1): $(PROG1).cc
22 | #	$(CC) -g -o $(PROG1) $(PROG1).cc $(CFLAGS) $(LDFLAGS) -I $(SRC_INC) -I $(PUB_DIR)
23 | 
24 | # dictionary unit test
25 | #$(PROG2): $(PROG2).cc
26 | #	$(CC) -g -o $(PROG2) $(PROG2).cc $(CFLAGS) $(LDFLAGS) -I $(SRC_INC) -I $(PUB_DIR)
27 | 
28 | # segment unit test
29 | #$(PROG3): $(PROG3).cc
30 | #	$(CC) -g -o $(PROG3) $(PROG3).cc $(CFLAGS) $(LDFLAGS) -I $(SRC_INC) -I $(PUB_DIR)
31 | 
32 | # build engine unit test
33 | #$(PROG4): $(PROG4).cc
34 | #	$(CC) -g -o $(PROG4) $(PROG4).cc $(CFLAGS) $(LDFLAGS) -I $(SRC_INC) -I $(PUB_DIR)
35 | 
36 | # suggestion unit test: $@ is the target name and $< its single prerequisite, the matching .cc file
37 | $(PROG5): $(PROG5).cc
38 | 	$(CC) -g -o $@ $< $(CFLAGS) $(LDFLAGS) -I $(SRC_INC) -I $(PUB_DIR)
39 | .PHONY: run
40 | run: all  # runs only what "all" builds; re-enable a line below together with its build rule
41 | #	./$(PROG1)
42 | #	./$(PROG2)
43 | #	./$(PROG3)
44 | #	./$(PROG4)
45 | 	./$(PROG5)
46 | .PHONY: clean  # "*.o" below replaces ".*o", which matched hidden entries ending in "o" rather than object files
47 | clean:  # removes the test binaries, build by-products and the hidden .*.txt files the tests write
48 | 	rm -rf $(PROG1) $(PROG2) $(PROG3) $(PROG4) $(PROG5) *.exe *.dSYM *.obj *.exp *.o *.lib .*.txt
49 | 


--------------------------------------------------------------------------------
/test/t_build_unit.cc:
--------------------------------------------------------------------------------
  1 | /*************************************************************************
  2 |  @ File Name: t_build_unit.cc
  3 |  @ Method: 
  4 |  @ Author: Jerry Shi
  5 |  @ Mail: jerryshi0110@gmail.com
  6 |  @ Created Time: Thu 21 Jul 2016 04:21:35 PM CST
  7 |  ************************************************************************/
  8 | #define BOOST_TEST_DYN_LINK
  9 | #define BOOST_TEST_MODULE DataBuildEngineTest
 10 | 
 11 | #include <iostream>
 12 | #include <string>
 13 | #include "unit_test.h"
 14 | #include "buildEngine.hpp"
 15 | 
 16 | 
 17 | 
 18 | std::string BuildEngine::res_dir_("../resource/"); // definition of the static resource dir member, placed before the suite opens
 19 | BOOST_AUTO_TEST_SUITE (BuildEngineTest)
 20 | 
 21 | //std::string BuildEngine::res_dir_("../resource/"); // resource dir
 22 | boost::shared_ptr<BuildEngine> pBuild(new BuildEngine()); // engine instance shared by the helpers in this file
 23 | 
 24 | 
 25 | // construct
 26 | //pBuild.reset(new BuildEngine());
 27 | 
 28 | 
 29 | // Helper: prints every element of vec to stdout, each followed by ",", then a newline.
 30 | void print_vector(const std::vector<std::string>& vec) {
 31 |     typedef std::vector<std::string>::const_iterator ItemIter;
 32 |     for (ItemIter item = vec.begin(); item != vec.end(); ++item)
 33 |         std::cout << *item << ",";
 34 |     std::cout << "\n";
 35 | }
 36 | 
 37 | // Parses `str`, passes the resulting chars to GenerateByPrefix, and prints the generated keys.
 38 | void genByPrefix(const std::string& str) {
 39 |     std::vector<std::string> chars, words, keys;
 40 |     pBuild->Parse(str, chars, words);
 41 |     // third argument 10: its meaning is not visible in this file (presumably a length/count limit — TODO confirm)
 42 |     pBuild->GenerateByPrefix(chars, keys, 10);
 43 |     std::cout << "==========GenerateByPrefix==========\nInput: " << str << std::endl;
 44 |     print_vector(keys);
 45 | }
 46 | // Parses `str`, passes the resulting words to GenerateByWordInfix, and prints the generated keys.
 47 | void genByWordInfix(const std::string& str) {
 48 |     std::vector<std::string> chars, words, keys;
 49 |     pBuild->Parse(str, chars, words);
 50 |     // `chars` is passed to Parse but not used afterwards in this helper
 51 |     pBuild->GenerateByWordInfix(words, keys, 10);
 52 |     std::cout << "=========GenerateByWordInfix===========\nInput: " << str << std::endl;
 53 |     print_vector(keys);
 54 | }
 55 | 
 56 | // Parses `str`, passes the resulting words to GenerateByWordSuffix, and prints the generated keys.
 57 | void genByWordSuffix(const std::string& str) {
 58 |     std::vector<std::string> chars, words, keys;
 59 |     pBuild->Parse(str, chars, words);
 60 |     // `chars` is passed to Parse but not used afterwards in this helper
 61 |     pBuild->GenerateByWordSuffix(words, keys, 10);
 62 |     std::cout << "=========GenerateByWordSuffix===========\nInput: " << str << std::endl;
 63 |     print_vector(keys);
 64 | }
 65 | 
 66 | // Passes `str` straight to GenerateByPinYinPrefix (no Parse step) and prints the generated keys.
 67 | void genByPinYinPrefix(const std::string& str) {
 68 |     std::vector<std::string>  keys;
 69 |     // third argument 10: its meaning is not visible in this file (presumably a length/count limit — TODO confirm)
 70 |     pBuild->GenerateByPinYinPrefix(str, keys, 10);
 71 |     std::cout << "=========GenerateByPinYinPrefix===========\nInput: " << str << std::endl;
 72 |     print_vector(keys);
 73 | }
 74 | // Passes `str` straight to GenerateByShengMuPrefix (no Parse step) and prints the generated keys.
 75 | void genByShengMuPrefix(const std::string& str) {
 76 |     std::vector<std::string>  keys;
 77 |     // the banner names this helper's own generator so its output is not mistaken for the pinyin-prefix run
 78 |     pBuild->GenerateByShengMuPrefix(str, keys, 10);
 79 |     std::cout << "=========GenerateByShengMuPrefix===========\nInput: " << str << std::endl;
 80 |     print_vector(keys);
 81 | }
 82 | 
 83 | 
 84 | // Calls BuildEngine::Parse on `str` and prints the two vectors (chars, words) that were passed to it.
 85 | void test_parse(const std::string& str) {
 86 |     std::vector<std::string> chars, words;
 87 | 
 88 |     pBuild->Parse(str, chars, words);
 89 |     std::cout << "==========Parse==========\nInput: " << str << std::endl;
 90 |     std::cout << "Chars: ";
 91 |     print_vector(chars);
 92 |     std::cout << "Words: ";
 93 |     print_vector(words);
 94 |     std::cout << "====================\n";
 95 | }
 96 | 
 97 | // Writes the four sample records read by build() to ".terms.txt" (the file is created or truncated).
 98 | void gendata() {
 99 |     std::ofstream ofs(".terms.txt");
100 |     if (!ofs) {
101 |         std::cout << "Open file .terms.txt file error!\n";
102 |         return; // nothing can be written to a stream that failed to open
103 |     }
104 | 
105 |     // one record per line: term <TAB> integer <TAB> integer (the column meanings are not visible in this file)
106 |     ofs << "贝贝德皮诺" << "\t" << 300 << "\t" << 123 << std::endl;
107 |     ofs << "bebedepino" << "\t" << 250 << "\t" << 231 << std::endl;
108 |     ofs << "背背佳" << "\t" << 130 << "\t" << 42 <<std::endl;
109 |     ofs << "brand25" << "\t" << 50 << "\t" << 83 << std::endl; 
110 |     ofs.close();
111 | }
112 | 
113 | // Writes the sample term file, runs Build on it, then calls Flush with the two file names below.
114 | void build() {
115 |     // (re)create the ".terms.txt" file read by Build
116 |     gendata();
117 |     // building
118 |     pBuild->Build(".terms.txt");
119 |     // NOTE(review): ".term.txt" differs from the ".terms.txt" input name; presumably a separate output file — confirm
120 |     // flush
121 |     pBuild->Flush(".term.txt", ".key_terms.txt");
122 | }
123 | 
124 | // Calls GetDataModule on the shared engine and prints the sizes of the two containers passed to it.
125 | void getDataModule() {
126 |     gendata();
127 |     // NOTE(review): Build() is not called here, so the sizes depend on whatever state earlier cases left in pBuild — confirm intent
128 |     TermInfoType terms;
129 |     KeyTermIDsType key_termids;
130 |     pBuild->GetDataModule(terms, key_termids);
131 | 
132 |     std::cout << "terms size: " << terms.size() << "\tkey_term id size: " << key_termids.size() << std::endl;
133 | }
134 | 
135 | // --------------------------------
136 | 
137 | // Case 1, BuildEngine::Parse(): Chinese, ASCII, mixed, and punctuation-containing inputs
138 | BOOST_AUTO_TEST_CASE (Parse) {
139 | 
140 |     // results are printed by test_parse; nothing is asserted
141 |     test_parse("贝贝德皮诺");
142 |     test_parse("bebedepino");
143 |     test_parse("贝贝德皮诺bebedepino");
144 |     test_parse("贝贝德皮诺》《bebed*)epino");
145 | }
146 | 
147 | // Case 2, BuildEngine::GenerateByPrefix(): inputs with embedded punctuation and mixed scripts
148 | BOOST_AUTO_TEST_CASE (GenerateByPrefix) {
149 |     genByPrefix("贝贝德=皮诺");
150 |     genByPrefix("bebede)pino");
151 |     genByPrefix("贝贝德皮诺bebedep");
152 | }
153 | 
154 | // Case 3, BuildEngine::GenerateByWordInfix()
155 | BOOST_AUTO_TEST_CASE (GenerateByWordInfix) {
156 |     
157 |     genByWordInfix("贝贝德皮诺");
158 | }
159 | 
160 | // Case 4, BuildEngine::GenerateByWordSuffix()
161 | BOOST_AUTO_TEST_CASE (GenByWordSuffix) {
162 |         
163 |     genByWordSuffix("贝贝德皮诺");
164 | }
165 | 
166 | // Case 5, BuildEngine::GenerateByPinYinPrefix()
167 | BOOST_AUTO_TEST_CASE (GenByPinYinPrefix) {
168 |     
169 |     genByPinYinPrefix("贝贝德皮诺");
170 | }
171 | 
172 | // Case 6, BuildEngine::GenerateByShengMuPrefix()
173 | BOOST_AUTO_TEST_CASE (GenByShengMuPrefix) {
174 | 
175 |     genByShengMuPrefix("银行");
176 | }
177 | 
178 | // Case 7, BuildEngine::Build() and BuildEngine::Flush(), driven through build()
179 | BOOST_AUTO_TEST_CASE (BuildingAndFlush) {
180 |     
181 |     build();
182 | }
183 | 
184 | // Case 8, BuildEngine::GetDataModule(), driven through getDataModule()
185 | BOOST_AUTO_TEST_CASE (GetDataModule) {
186 | 
187 |     getDataModule();
188 | }
189 | 
190 | BOOST_AUTO_TEST_SUITE_END()
191 | 
192 | 


--------------------------------------------------------------------------------
/test/t_dictionary_unit.cc:
--------------------------------------------------------------------------------
  1 | /*************************************************************************
  2 |  @ File Name: t_dictionary_unit.cc
  3 |  @ Method: 
  4 |  @ Author: Jerry Shi
  5 |  @ Mail: jerryshi0110@gmail.com
  6 |  @ Created Time: Fri 15 Jul 2016 10:50:56 AM CST
  7 |  ************************************************************************/
  8 | #define BOOST_TEST_DYN_LINK
  9 | #define BOOST_TEST_MODULE DictionaryTest
 10 | #include <iostream>
 11 | #include "unit_test.h"
 12 | #include "dictionary.hpp"
 13 | 
 14 | // name of the test suite is DictionaryTest
 15 | BOOST_AUTO_TEST_SUITE (DictionaryTest)
 16 | 
 17 | // Dictionary object shared by the helpers below, constructed from the "../resource/cn" path
 18 | std::string dir("../resource/cn");
 19 | Dictionary seg(dir);
 20 | 
 21 | // Runs seg.Segment on `input` and prints the input followed by every resulting token.
 22 | void display_tokens(const std::string& input) {
 23 |     std::vector<std::string> tokens;
 24 |     seg.Segment(input, tokens);
 25 |     std::cout << "Input:" << input << std::endl;
 26 |     typedef std::vector<std::string>::const_iterator TokenIter;
 27 |     for (TokenIter tok = tokens.begin(); tok != tokens.end(); ++tok)
 28 |         std::cout << "Result: " << *tok << ",";
 29 |     std::cout << std::endl;
 30 | }
 31 | 
 32 | // Looks `input` (pinyin) up with seg.GetChar and prints every returned entry; prints nothing when GetChar reports failure.
 33 | void display_convert_cn(const std::string& input) {
 34 |     std::vector<std::string> results;
 35 |     if (!seg.GetChar(input, results))
 36 |         return;
 37 |     typedef std::vector<std::string>::const_iterator ResultIter;
 38 |     std::cout << "Input PinYin:" << input << std::endl;
 39 |     for (ResultIter entry = results.begin(); entry != results.end(); ++entry)
 40 |         std::cout << "Result: " << *entry << ",";
 41 |     // finish the comma-separated line
 42 |     std::cout << std::endl;
 43 | }
 44 | 
 45 | // Converts the string `input` with seg.GetPinYin and prints every returned entry; prints nothing when GetPinYin reports failure.
 46 | void display_convert_py(const std::string& input) {
 47 |     std::vector<std::string> results;
 48 |     if (!seg.GetPinYin(input, results))
 49 |         return;
 50 |     typedef std::vector<std::string>::const_iterator ResultIter;
 51 |     std::cout << "Input Cn:" << input << std::endl;
 52 |     for (ResultIter entry = results.begin(); entry != results.end(); ++entry)
 53 |         std::cout << "Result: " << *entry << ",";
 54 |     // finish the comma-separated line
 55 |     std::cout << std::endl;
 56 | }
 57 | 
 58 | // Converts `input` with seg.GetPinYinTerm and prints every returned entry; prints nothing when GetPinYinTerm reports failure.
 59 | void display_cn_convert_py(const std::string& input) {
 60 |     std::vector<std::string> results;
 61 |     if (!seg.GetPinYinTerm(input, results))
 62 |         return;
 63 |     typedef std::vector<std::string>::const_iterator ResultIter;
 64 |     std::cout << "Input Cn:" << input << std::endl;
 65 |     for (ResultIter entry = results.begin(); entry != results.end(); ++entry)
 66 |         std::cout << "Result: " << *entry << ",";
 67 |     // finish the comma-separated line
 68 |     std::cout << std::endl;
 69 | }
 70 | 
 71 | // Case 1, pinyin segmentation (output is printed; nothing is asserted)
 72 | BOOST_AUTO_TEST_CASE(PinYinSegment) {
 73 |    display_tokens("nanaodeye"); 
 74 |    display_tokens("mangzuoni"); 
 75 |    display_tokens("woyaochitang"); 
 76 |    display_tokens("woyaochitan"); 
 77 |    display_tokens("congxin"); 
 78 |    // pinyin followed by English words or digits
 79 |    display_tokens("yinhangjiaapple"); 
 80 |    display_tokens("gongsicompany"); 
 81 |    display_tokens("shangshi123"); 
 82 |    // digits in the middle, and an upper-case suffix
 83 |    display_tokens("qingguangxu15nian"); 
 84 |    display_tokens("sanxingSUMSONG"); 
 85 | 
 86 | }
 87 | 
 88 | // Case 2, pinyin converted to chinese characters
 89 | BOOST_AUTO_TEST_CASE(PinYin2Cn) {
 90 |     display_convert_cn("zhen");
 91 |     display_convert_cn("zi");
 92 | }
 93 | 
 94 | // Case 3, a single chinese character converted to pinyin
 95 | BOOST_AUTO_TEST_CASE(Cn2PinYin) {
 96 |     display_cn_convert_py("白");
 97 |     display_cn_convert_py("爱");
 98 |     display_cn_convert_py("鬼");
 99 | }
100 | 
101 | // Case 4, a chinese string converted to pinyin
102 | BOOST_AUTO_TEST_CASE(CnStr2PinYin) {
103 |     display_convert_py("我们的爱");
104 |     display_convert_py("你在哪儿");
105 |     display_convert_py("中过");
106 |     display_convert_py("哈哈");
107 |     // chinese mixed with ASCII letters and digits
108 |     display_convert_py("银行apple");
109 |     display_convert_py("女王大人1234");
110 |     display_convert_py("女王520么么哒");
111 |     display_convert_py("尹汝杰541帮五买");
112 | }
113 | 
114 | BOOST_AUTO_TEST_SUITE_END()
115 | 


--------------------------------------------------------------------------------
/test/t_normalize_unit.cc:
--------------------------------------------------------------------------------
  1 | /*************************************************************************
  2 |  @ File Name: t_normalize_unit.cc
  3 |  @ Method: 
  4 |  @ Author: Jerry Shi
  5 |  @ Mail: jerryshi0110@gmail.com
  6 |  @ Created Time: Tue 12 Jul 2016 05:30:00 PM CST
  7 |  ************************************************************************/
  8 | #define BOOST_TEST_DYN_LINK
  9 | #define BOOST_TEST_MODULE normalizeTest
 10 | //#include <boost/test/unit_test.hpp>
 11 | #include <iostream>
 12 | #include "unit_test.h"
 13 | #include "util/normalize.h"
 14 | 
 15 | // name of the test suite is normalizeTest
 16 | BOOST_AUTO_TEST_SUITE (normalizeTest)
 17 | 
 18 | // 1 Normalize::ToLower()
 19 | // Convert string to lower case; the '-', '.' and '3' in the input are expected to stay unchanged
 20 | BOOST_AUTO_TEST_CASE (ToLower) {
 21 |     std::string str("ABCD-HK.3SG");
 22 |     Normalize::ToLower(str);
 23 |     BOOST_CHECK_MESSAGE(str == "abcd-hk.3sg", "ToLower result: " << str);
 24 | }
 25 | 
 26 | // 2 Normalize::ToUpper()
 27 | // Convert string to upper case; the chinese characters and '!' are expected to stay unchanged
 28 | BOOST_AUTO_TEST_CASE (ToUpper) {
 29 |     std::string str("i love you 小红!");
 30 |     Normalize::ToUpper(str);
 31 |     BOOST_CHECK_MESSAGE(str == "I LOVE YOU 小红!", "ToUpper result: " << str);
 32 | }
 33 | 
 34 | // 3 Normalize::ToUTF8()
 35 | // Convert to utf8 encoding; the input embeds byte sequences that are not valid UTF-8 between 'a' and 'z'
 36 | BOOST_AUTO_TEST_CASE (UTF8Encoding) {
 37 |     std::string str("a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z");
 38 |     Normalize::ToUTF8(str);
 39 |     BOOST_CHECK_MESSAGE(str == "a����z", "To utf8 result:" << str);
 40 |     str = "把"; // leftover from the disabled check below; the value is not read afterwards
 41 |    // Normalize::ToUTF8(str);
 42 |    // std::cout << "T: " << str << "\t T[0]:" << tt << std::endl;
 43 | }
 44 | 
 45 | // 4 Normalize::IsValidUTF8()
 46 | // Check a string is a valid utf8 encoding
 47 | BOOST_AUTO_TEST_CASE (IsUTF8Encoding) {
 48 |     // the bytes of 日ш followed by a lone 0xfa byte; the check expects the string to be rejected
 49 |     std::string str("\xe6\x97\xa5\xd1\x88\xfa");
 50 |     bool flag = Normalize::IsValidUTF8(str);
 51 |     BOOST_CHECK_MESSAGE(flag == false, "IsUTF8Encoding result: " << flag);
 52 | }
 53 | 
 54 | // 5 Normalize::IsChinese()
 55 | // Determine whether a string consists of chinese characters
 56 | BOOST_AUTO_TEST_CASE (IsChinese) {
 57 |     // not chinese as it contains "》"
 58 |     std::string str("我爱中国》");
 59 |     bool flag = Normalize::IsChinese(str);
 60 |     BOOST_CHECK_MESSAGE( flag == false, str << "IsChinese result: " << flag);
 61 |     str = "青青原上草";
 62 |     flag = Normalize::IsChinese(str);
 63 |     BOOST_CHECK_MESSAGE( flag == true, str << " IsChinese result: " << flag);
 64 | }
 65 | 
 66 | // 6 Normalize::ToUnicode()
 67 | // Get the utf16 encoding array of a string and check the number of units produced
 68 | BOOST_AUTO_TEST_CASE (ToUnicode) {
 69 |     std::string str("大智若愚");
 70 |     std::vector<uint16_t> unicodes;
 71 |     Normalize::ToUnicode(str, unicodes);
 72 |     BOOST_CHECK_MESSAGE( unicodes.size() == 4, str << "To utf16 size: " << unicodes.size());
 73 |     // `unicodes` is reused without clearing: the expected size of 5 presumes ToUnicode replaces the contents — TODO confirm
 74 |     str = "银行abc";
 75 |     Normalize::ToUnicode(str, unicodes);
 76 |     BOOST_CHECK_MESSAGE( unicodes.size() == 5, str << "To utf16 size: " << unicodes.size());
 77 | }
 78 | 
 79 | // 7 Normalize::UnicodeToUTF8Str(), vector overload
 80 | // Round trip: a string converted to utf16 units (uint16_t) and back must equal the original utf8 string
 81 | BOOST_AUTO_TEST_CASE (UnicodesToUTF8Str) {
 82 |     std::string str("中华人民共和国");
 83 |     std::vector<uint16_t> unicodes;
 84 |     Normalize::ToUnicode(str, unicodes);
 85 |     std::string utf8str;
 86 |     Normalize::UnicodeToUTF8Str(unicodes, utf8str);
 87 |     BOOST_CHECK_MESSAGE( str == utf8str, str << "After utf16 encoding and decoding: " << utf8str);
 88 |     // `unicodes` and `utf8str` are reused without clearing: presumes both calls replace their output — TODO confirm
 89 |     str = "连衣裙Love";
 90 |     Normalize::ToUnicode(str, unicodes);
 91 |     Normalize::UnicodeToUTF8Str(unicodes, utf8str);
 92 |     BOOST_CHECK_MESSAGE( str == utf8str, str << "After utf16 encoding and decoding: " << utf8str);
 93 | }
 94 | // 8 Normalize::UnicodeToUTF8Str(), single-unit overload
 95 | // Convert one chinese character with utf16 encoding(uint16_t) to a utf8 string
 96 | BOOST_AUTO_TEST_CASE (UnicodeToUTF8Str) {
 97 |     std::string str("爱");
 98 |     std::vector<uint16_t> unicodes;
 99 |     Normalize::ToUnicode(str, unicodes);
100 |     BOOST_REQUIRE_MESSAGE( !unicodes.empty(), str << " produced no utf16 units"); // stop here rather than index an empty vector
101 |     std::string utf8str;
102 |     Normalize::UnicodeToUTF8Str(unicodes[0], utf8str);
103 |     BOOST_CHECK_MESSAGE( str == utf8str, str << "After utf16 encoding and decoding: " << utf8str);
104 | }
105 | 
106 | // 9 Normalize::IsDigit(): '2' must be classified as a digit, 'w' must not
107 | BOOST_AUTO_TEST_CASE (IsDigit) {
108 |     bool flag = false;
109 |     flag = Normalize::IsDigit('2');
110 |     BOOST_CHECK_MESSAGE(flag == true, "'2' IsDigit result: " << flag);
111 | 
112 |     flag = Normalize::IsDigit('w');
113 |     BOOST_CHECK_MESSAGE(flag == false, "'w' IsDigit result: " << flag);
114 | }
115 | 
116 | // 10 Normalize::IsAlpha(): 'z' must be classified as a letter, '3' must not
117 | BOOST_AUTO_TEST_CASE (IsAlpha) {
118 |     bool flag = false;
119 |     flag = Normalize::IsAlpha('z');
120 |     BOOST_CHECK_MESSAGE(flag == true, "'z' IsAlpha result: " << flag);
121 | 
122 |     flag = Normalize::IsAlpha('3');
123 |     BOOST_CHECK_MESSAGE(flag == false, "'3' IsAlpha result: " << flag);
124 | }
125 | 
126 | // 11 Normalize::IsConnector(): '-', '.' and '+' must be accepted, '=' must be rejected
127 | BOOST_AUTO_TEST_CASE (IsConnector) {
128 |     bool flag = false;
129 |     flag = Normalize::IsConnector('-');
130 |     BOOST_CHECK_MESSAGE(flag == true, "'-' IsConnector result: " << flag);
131 | 
132 |     flag = Normalize::IsConnector('.');
133 |     BOOST_CHECK_MESSAGE(flag == true, "'.' IsConnector result: " << flag);
134 | 
135 |     flag = Normalize::IsConnector('+');
136 |     BOOST_CHECK_MESSAGE(flag == true, "'+' IsConnector result: " << flag);
137 | 
138 |     flag = Normalize::IsConnector('=');
139 |     BOOST_CHECK_MESSAGE(flag == false, "'=' IsConnector result: " << flag);
140 | }
141 | 
142 | // 12 Normalize::IsBreakPunct(): every bracket character must be accepted, '*' must be rejected
143 | BOOST_AUTO_TEST_CASE (IsPunct) {
144 |     bool flag = false;
145 |     flag = Normalize::IsBreakPunct('[');
146 |     BOOST_CHECK_MESSAGE(flag == true, "'[' IsBreakPunct result: " << flag);
147 | 
148 |     flag = Normalize::IsBreakPunct(']');
149 |     BOOST_CHECK_MESSAGE(flag == true, "']' IsBreakPunct result: " << flag);
150 | 
151 |     flag = Normalize::IsBreakPunct('(');
152 |     BOOST_CHECK_MESSAGE(flag == true, "'(' IsBreakPunct result: " << flag);
153 | 
154 |     flag = Normalize::IsBreakPunct(')');
155 |     BOOST_CHECK_MESSAGE(flag == true, "')' IsBreakPunct result: " << flag);
156 | 
157 |     flag = Normalize::IsBreakPunct('{');
158 |     BOOST_CHECK_MESSAGE(flag == true, "'{' IsBreakPunct result: " << flag);
159 | 
160 |     flag = Normalize::IsBreakPunct('}');
161 |     BOOST_CHECK_MESSAGE(flag == true, "'}' IsBreakPunct result: " << flag);
162 | 
163 |     flag = Normalize::IsBreakPunct('*');
164 |     BOOST_CHECK_MESSAGE(flag == false, "'*' IsBreakPunct result: " << flag);
165 | }
166 | 
167 | // 13 Normalize::IsPunctuation(): '[', ',' and '?' must all be accepted
168 | BOOST_AUTO_TEST_CASE (IsPunctuation) {
169 |     bool flag = false;
170 |     flag = Normalize::IsPunctuation('[');
171 |     BOOST_CHECK_MESSAGE(flag == true, "'[' IsPunctuation result: " << flag);
172 | 
173 |     flag = Normalize::IsPunctuation(',');
174 |     BOOST_CHECK_MESSAGE(flag == true, "',' IsPunctuation result: " << flag);
175 | 
176 |     flag = Normalize::IsPunctuation('?');
177 |     BOOST_CHECK_MESSAGE(flag == true, "'?' IsPunctuation result: " << flag);
178 | }
179 | 
180 | BOOST_AUTO_TEST_SUITE_END()
181 | 
182 | 
183 | 


--------------------------------------------------------------------------------
/test/t_segment_unit.cc:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  @ File Name: t_segment_unit.cc
 3 |  @ Method: 
 4 |  @ Author: Jerry Shi
 5 |  @ Mail: jerryshi0110@gmail.com
 6 |  @ Created Time: Wed 20 Jul 2016 02:54:25 PM CST
 7 |  ************************************************************************/
 8 | #define BOOST_TEST_DYN_LINK
 9 | #define BOOST_TEST_MODULE SegmentWrapper
10 | 
11 | #include <iostream>
12 | #include "unit_test.h"
13 | #include "segmentWrapper.h"
14 | 
15 | BOOST_AUTO_TEST_SUITE (segment)
16 | // Segmenter shared by the helper and case below; boost::shared_ptr (via unit_test.h) replaces std::auto_ptr, which C++17 removed.
17 | std::string resDir("../resource/dict");
18 | boost::shared_ptr<SegmentWrapper> segWrapper_(new SegmentWrapper(resDir));
19 | 
20 | 
21 | // Segments `str` through segWrapper_ (third argument false) and prints the input followed by the tokens.
22 | void show_tokens(const std::string& str) {
23 |     std::vector<std::string> tokens;
24 |     segWrapper_->segment(str, tokens, false);
25 | 
26 |     std::cout << "Input:" << str << "\ntoken results:\n";
27 |     typedef std::vector<std::string>::const_iterator TokenIter;
28 |     for (TokenIter tok = tokens.begin(); tok != tokens.end(); ++tok)
29 |         std::cout << *tok << ",";
30 |     std::cout << std::endl;
31 | }
32 | 
33 | // case 1, segment unit test: prints the tokens for four product-title strings; nothing is asserted
34 | BOOST_AUTO_TEST_CASE (segment) {
35 |     show_tokens("中兴WP826A电信CDMA天翼4G家用办公无线座机固话插卡电话老人机");
36 |     show_tokens("美迪惠尔可莱丝NMF针剂水库面膜 官方正品");
37 |     show_tokens("韩国正品九朵云奇迹马油膏淡化痘印补水保湿面霜70g");
38 |     show_tokens("保宁 婴幼儿衣物强力杀菌斑点喷雾祛除剂 500ml 母婴用品");
39 | }
40 | 
41 | BOOST_AUTO_TEST_SUITE_END()
42 | 


--------------------------------------------------------------------------------
/test/t_suggestion_unit.cc:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  @ File Name: t_suggestion_unit.cc
 3 |  @ Method: 
 4 |  @ Author: Jerry Shi
 5 |  @ Mail: jerryshi0110@gmail.com
 6 |  @ Created Time: Tue 26 Jul 2016 05:50:25 PM CST
 7 |  ************************************************************************/
 8 | #define BOOST_TEST_DYN_LINK
 9 | #define BOOST_TEST_MODULE SuggestionTest
10 | 
11 | #include "unit_test.h"
12 | #include <iostream>
13 | #include "suggestion.hpp"
14 | 
15 | boost::shared_ptr<Suggestion> pSuggest(new Suggestion("../resource/")); // shared Suggestion instance, constructed with the "../resource/" path
16 | 
17 | BOOST_AUTO_TEST_SUITE (SuggestionTest)
18 | 
19 | // Writes four sample records to ".terms.txt" (the file is created or truncated).
20 | void gendata() {
21 |     std::ofstream ofs(".terms.txt");
22 |     if (!ofs) {
23 |         std::cout << "Open file .terms.txt file error!\n";
24 |         return; // nothing can be written to a stream that failed to open
25 |     }
26 | 
27 |     // one record per line: term <TAB> integer <TAB> integer (the column meanings are not visible in this file)
28 |     ofs << "贝贝德皮诺" << "\t" << 300 << "\t" << 123 << std::endl;
29 |     ofs << "bebedepino" << "\t" << 250 << "\t" << 231 << std::endl;
30 |     ofs << "背背佳" << "\t" << 130 << "\t" << 42 <<std::endl;
31 |     ofs << "brand25" << "\t" << 50 << "\t" << 83 << std::endl; 
32 |     ofs.close();
33 | }
34 | 
35 | // Prints `str` and then the value pSuggest->RemoveSpace returns for it.
36 | void print_str(const std::string& str) {
37 |     std::cout << "===========RemoveSpace===========\nInput:" << str << std::endl;
38 |     std::cout << "Result:" << pSuggest->RemoveSpace(str) << std::endl;
39 | }
40 | 
41 | // Case 1, Suggestion::RemoveSpace(): input with single and repeated spaces; the result is printed, not asserted
42 | BOOST_AUTO_TEST_CASE (removeSpace) {
43 |     
44 |    print_str("be bedi    pino"); 
45 | }
46 | 
47 | BOOST_AUTO_TEST_SUITE_END()
48 | 


--------------------------------------------------------------------------------
/test/unit_test.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  @ File Name: unit_test.h
 3 |  @ Method: 
 4 |  @ Author: Jerry Shi
 5 |  @ Mail: jerryshi0110@gmail.com
 6 |  @ Created Time: Wed 13 Jul 2016 01:22:10 PM CST
 7 |  ************************************************************************/
 8 | #ifndef UNIT_TEST_H
 9 | #define UNIT_TEST_H
10 | 
11 | #include <boost/test/unit_test.hpp>
12 | //#include <boost/test/included/unit_test.hpp>
13 | #include <boost/shared_ptr.hpp>
14 | #endif // unit_test.h
15 | 
16 | 


--------------------------------------------------------------------------------