├── .gitignore ├── README.md ├── doc └── qs_workflow.png ├── include ├── buildEngine.hpp ├── correctionEngine.h ├── dictionary.hpp ├── pinyinConvert.h ├── pinyinEngine.h ├── segment │ ├── darts.h │ ├── hash_table.hpp │ ├── kstring.hpp │ ├── line_reader.h │ ├── normalize.h │ ├── segment.h │ ├── segment_dict.h │ └── trd2simp.h ├── segmentWrapper.h ├── suggestion.hpp └── util │ ├── darts.h │ ├── mtrie.h │ ├── normalize.h │ ├── py_types.h │ ├── types.h │ ├── utf8.h │ └── utf8 │ ├── checked.h │ ├── core.h │ └── unchecked.h └── test ├── Makefile ├── t_build_unit.cc ├── t_dictionary_unit.cc ├── t_normalize_unit.cc ├── t_segment_unit.cc ├── t_suggestion_unit.cc └── unit_test.h /.gitignore: -------------------------------------------------------------------------------- 1 | include/*.swp 2 | /resource/ 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Query Suggestion 2 | ### 1. Introduction 3 | This is a query auto-completion system that can be used in any search scenario. I found query suggestion to be a useful application in my projects, but there was 4 | little documentation or code available, so I want to rebuild this project, make it lighter and easier to use, and support different interfaces. 5 | 6 | ### 2. System Framework 7 | The Query Suggestion here is a very simple version; I have not adopted many algorithms. Below is the system workflow. 8 | ![SystemFramework](./doc/qs_workflow.png) 9 | 10 | As shown in the figure, the main work of this project happens offline: we generate maps keyed by prefixes (pinyin, shengmu, Chinese words). Online, 11 | we use the input query (maybe one word or one letter, not a complete query) as the key to look up in the maps and recall its keyword list; finally we rank the candidate 12 | keywords and return them. 
The advantages of the project are that it is **efficient and convenient**: you only need to prepare the corpus, which may be the query logs of a 13 | search engine, titles of products or news, or any other text. But the disadvantages are also obvious: it is not real-time, and coverage is incomplete because you cannot supply \ 14 | a big enough corpus, so the suggestions you want may not be recommended. Anyway, we already use it in our e-commerce search and got better results. 15 | ### 3. Algorithms and Model 16 | 17 | ### 4. Usage 18 | 19 | ### 5. Roadmap 20 | - ~~1. Combine normalization module in include/segment/normalize.h and include/util/normalize.h~~ 21 | - ~~2. Add more unit tests for normalization module~~ 22 | - ~~3. Design the old algorithm system framework.~~ 23 | - 4 . Supply Python interface for old algorithm. 24 | - 5 . Collect algorithm papers and documents. 25 | - 6 . Choose one more efficient algorithm. 26 | 27 | ### 6. Contact 28 | If you have any questions, suggestions or ideas, you can contact me at *jerryshi0110@gmail.com* 29 | -------------------------------------------------------------------------------- /doc/qs_workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/syw2014/query-suggestion/235ef793333c8b44911f1bc3e86a09d277e8441b/doc/qs_workflow.png -------------------------------------------------------------------------------- /include/buildEngine.hpp: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | @ File Name: buildEngine.hpp 3 | @ Method: 4 | @ Author: Jerry Shi 5 | @ Mail: jerryshi0110@gmail.com 6 | @ Created Time: Mon 11 Jul 2016 01:42:29 PM CST 7 | ************************************************************************/ 8 | // Data building module contains data processing, tokenization and prefix generation 9 | // 10 | 11 | #ifndef MODULE_BUILD_ENGINE_HPP 12 | #define 
MODULE_BUILD_ENGINE_HPP 13 | 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | 29 | // segment 30 | #include "segmentWrapper.h" 31 | #include "dictionary.hpp" 32 | 33 | typedef std::vector > TermIDTFType; 34 | typedef boost::unordered_map KeyInfoType; 35 | typedef boost::unordered_map > KeyTermIDsType; 36 | typedef std::pair TFResPairType; // term frequency and reserve , now the reserve used as searching result number 37 | typedef std::vector > TermInfoType; 38 | 39 | // Build engine for data module build 40 | // Provide interfaces for data module building. 41 | class BuildEngine { 42 | private: 43 | static const double prefix_w_ = 10000.0; // different weight for different prefix type 44 | static const double pinyin_prefix_w_ = 1000.0; 45 | static const double shengmu_w_ = 100.0; 46 | static const double word_infix_w_ = 10.0; 47 | static const double word_suffix_w_ = 1.0; 48 | 49 | static const uint32_t topK_ = 15; // default number of candidate words 50 | 51 | //static std::string res_dir_; // resource directory, pinyin and token resource 52 | 53 | // main data structure 54 | std::auto_ptr segWrapper_; // chinese string tokenizer 55 | std::auto_ptr pySegDict_; // pinyin tokenizer 56 | 57 | TermInfoType termsInfo_; // total term set and it's corresponding infos(tf, numbers) generated from corpus 58 | std::vector tf_; // term frequency , it was corresponded to words in terms_ 59 | KeyTermIDsType key_termIds_; // key(prefix) to term ids map 60 | 61 | std::set shm_ ; // total sheng mu set 62 | public: 63 | 64 | static std::string res_dir_; // resource directory, pinyin and token resource 65 | private: 66 | 67 | // extract sheng mu from pinyin 68 | // @pinyin: pinyin string vector 69 | // @shm: sheng mu extract form pinyin 70 | // eg: 行 hang, xing shengmu: h,x 71 | void GetShengMuByPinYin_(const std::vector& pinyin 72 | 
,std::vector& shm) { 73 | shm.clear(); 74 | if (pinyin.empty()) 75 | return; 76 | 77 | std::set head; 78 | for (uint32_t i = 0; i < pinyin.size(); ++i) { 79 | const std::string& py = pinyin[i]; 80 | if (py.empty()) 81 | continue; 82 | else { 83 | std::string ch; 84 | ch += py[0]; // first char 85 | //TODO, To be confirm this logistic 86 | if ((char)py[0] >= 0) { 87 | if (head.insert(ch).second) 88 | shm.push_back(ch); 89 | } else { 90 | if (head.insert(py).second) 91 | shm.push_back(py); 92 | } 93 | } 94 | } 95 | } 96 | 97 | // store elements and it's weight in vector into map 98 | // @keys: 99 | // @key_info: key information map 100 | // @termid: the index id of string which generated keys 101 | // @weight: keys weight 102 | void StoreInKeyMap(const std::vector& keys 103 | ,KeyInfoType& key_info 104 | ,const uint32_t termid 105 | ,const double weight) { 106 | if (keys.empty()) 107 | return; 108 | for (uint32_t idx = 0; idx < keys.size(); ++idx) { 109 | key_info[keys[idx]].push_back(std::make_pair(termid, weight)); 110 | } 111 | } 112 | 113 | public: 114 | BuildEngine() { 115 | // resource directory check 116 | if (!boost::filesystem::exists(res_dir_)) { 117 | std::cout << "resouce directory: " << res_dir_ << "is not exist!" << std::endl; 118 | std::cout << "Tips: resource directory may like: \"../resource/\"" << std::endl; 119 | return; 120 | } 121 | 122 | // construct tokenizer 123 | segWrapper_.reset(new SegmentWrapper(res_dir_+"dict")); 124 | pySegDict_.reset(new Dictionary(res_dir_+"cn")); 125 | 126 | // load shengmu from files. 127 | // b,p,m,f,d,t,n,l,g,k,j,q,x,zh,ch,sh,r,z,c,s,y,w 128 | std::ifstream ifs((res_dir_+"cn/ShengMu.txt").c_str()); 129 | if (!ifs) { 130 | std::cout << "Open file " << (res_dir_+"cn/ShengMu.txt") << "failed!" 
<< std::endl; 131 | return; 132 | } 133 | std::string line; 134 | while (getline(ifs, line)) { 135 | boost::algorithm::trim(line); 136 | if (line.empty()) 137 | continue; 138 | shm_.insert(line); 139 | } 140 | ifs.close(); 141 | } 142 | 143 | ~BuildEngine() { 144 | termsInfo_.clear(); 145 | tf_.clear(); 146 | key_termIds_.clear(); 147 | } 148 | 149 | // get data building results 150 | void GetDataModule(TermInfoType& termsInfo 151 | ,KeyTermIDsType& key_termids) { 152 | termsInfo.clear(); 153 | key_termids.clear(); 154 | 155 | termsInfo.swap(termsInfo_); 156 | key_termids.swap(key_termIds_); 157 | } 158 | 159 | // build data module from file 160 | // Notes: 161 | // data structure in file must be like: 162 | // term \t freq \t result_num 163 | // the separator is tab 164 | bool Build(const std::string& nm) { 165 | termsInfo_.clear(); 166 | tf_.clear(); 167 | key_termIds_.clear(); 168 | 169 | std::ifstream ifs(nm.c_str()); 170 | if(!ifs) { 171 | std::cout << "File " << nm << "open failed!" << std::endl; 172 | return false; 173 | } 174 | 175 | std::cout << "Start building...\n"; 176 | std::string line; 177 | std::map t_freqRes; // term , tf, result_num 178 | // extract term, it's freq and result number 179 | while (getline(ifs, line)) { 180 | if (line.empty()) 181 | continue; 182 | boost::algorithm::trim(line); 183 | boost::to_lower(line); 184 | std::size_t pos = line.find("\t"); 185 | if (pos == std::string::npos) 186 | continue; 187 | std::vector vec; 188 | std::string term; 189 | boost::algorithm::split(vec, line, boost::algorithm::is_any_of("\t")); 190 | // the data must contians term and it's frequency 191 | if (vec.size() < 2) 192 | continue; 193 | term = vec[0]; 194 | double freq = 0.0; 195 | uint32_t result_num = 0; 196 | // result num or other meanings 197 | if (vec.size() != 3) 198 | result_num = 0; 199 | else { 200 | try { 201 | result_num = boost::lexical_cast(vec[2]); 202 | } catch(...) 
{ 203 | result_num = 0; 204 | } 205 | } 206 | // term frequency 207 | try { 208 | freq = boost::lexical_cast(vec[1]); 209 | } catch(...) { 210 | // freq = 1.0; 211 | std::cout << "bad line in:" << nm << ":" << line << std::endl; 212 | continue; 213 | } 214 | t_freqRes[term] = std::make_pair(freq, result_num); 215 | } 216 | ifs.close(); 217 | //std::cout << "T: " << t_freq.size() << std::endl; 218 | // step2, store term and it's freq 219 | std::map::iterator it = t_freqRes.begin(); 220 | uint32_t size = t_freqRes.size(); 221 | termsInfo_.resize(size); 222 | tf_.resize(size); 223 | for (uint32_t idx = 0; it != t_freqRes.end() && idx < size; ++it, ++idx) { 224 | termsInfo_[idx] = std::make_pair(it->first, it->second); // store term info 225 | tf_[idx] = it->second.first; // tf 226 | } 227 | t_freqRes.clear(); 228 | //std::cout << "TT: " << terms_.size() << "\t " << tf_.size() << std::endl; 229 | // step3, generate keys 230 | KeyInfoType key_info; 231 | for (uint32_t idx = 0; idx < termsInfo_.size(); ++idx) { 232 | Generate(termsInfo_[idx].first, idx, key_info); 233 | } 234 | 235 | // step4 , compuate score for every key 236 | KeyInfoType::iterator iter; 237 | for (iter = key_info.begin(); iter != key_info.end(); ++iter) { 238 | std::vector >& info = iter->second; 239 | // score = tf * weight; 240 | for (uint32_t i = 0; i < info.size(); ++i) { 241 | if (info[i].first > tf_.size()) { 242 | info[i].second = 1.1; 243 | continue; 244 | } 245 | info[i].second = tf_[info[i].first] * info[i].second; 246 | } 247 | sort(info.begin(), info.end(), SORT::sortDescendBySecond); 248 | // result deduplication 249 | std::vector termsid; 250 | std::set uniq_ids; 251 | for (uint32_t i = 0; i < info.size() && uniq_ids.size() <= 15; ++i) { 252 | if (uniq_ids.insert(info[i].first).second) { 253 | termsid.push_back(info[i].first); 254 | } 255 | } 256 | key_termIds_[iter->first].swap(termsid); 257 | uniq_ids.clear(); 258 | } 259 | 260 | std::cout << "key_termids size: " << key_termIds_.size() 
<< std::endl; 261 | std::cout << "Building completed!\n"; 262 | } 263 | 264 | // build from vector 265 | bool Build(const std::vector& termVec) { 266 | return true; 267 | } 268 | 269 | // store results to files 270 | // @termFile: file to store all terms 271 | // @keyFile: store all keys and term ids 272 | bool Flush(const std::string& termFile, const std::string& keyFile) { 273 | std::ofstream ofs_term(termFile.c_str()); 274 | if (!ofs_term) { 275 | std::cout << "open file " << termFile << "failed!\n"; 276 | return false; 277 | } 278 | std::ofstream ofs_key(keyFile.c_str()); 279 | if (!ofs_key) { 280 | std::cout << "open file " << keyFile << "failed!\n"; 281 | return false; 282 | } 283 | // store terms 284 | for (uint32_t i = 0; i < termsInfo_.size(); ++i) { 285 | ofs_term << i << "\t" << termsInfo_[i].first << "\t" 286 | << termsInfo_[i].second.first << "\t" 287 | << termsInfo_[i].second.second << "\n"; 288 | } 289 | ofs_term.close(); 290 | 291 | // store keys 292 | KeyTermIDsType::iterator iter; 293 | for (iter = key_termIds_.begin(); iter != key_termIds_.end(); ++iter) { 294 | std::vector& ids = iter->second; 295 | // candidate is itself 296 | // do not suggestion itself 297 | if (ids.size() == 1 && ids[0] < termsInfo_.size() && termsInfo_[ids[0]].first == iter->first) 298 | continue; 299 | ofs_key << iter->first; // key 300 | for (uint32_t i = 0; i < ids.size(); ++i) { 301 | // make sure id is in the range of term vector 302 | if (ids[i] > termsInfo_.size()) 303 | continue; 304 | // do not suggest itself 305 | if (iter->first == termsInfo_[ids[i]].first) 306 | continue; 307 | ofs_key << "\t" << termsInfo_[ids[i]].first; 308 | } 309 | ofs_key << "\n"; 310 | } 311 | ofs_key.close(); 312 | 313 | return true; 314 | } 315 | 316 | // parse string into chars and words 317 | // @chars: single unicode 318 | // @words: string segmentation results 319 | bool Parse(const std::string& str 320 | ,std::vector& chars 321 | ,std::vector& words) { 322 | if (str.empty()) 323 | 
return false; 324 | 325 | // extract chars 326 | chars.clear(); 327 | std::vector unicodes; 328 | if (Normalize::ToUnicode(str, unicodes)) { 329 | //if(unicodes.empty()) 330 | // return false; 331 | chars.resize(unicodes.size()); 332 | for (uint32_t i = 0; i < unicodes.size(); ++i) { 333 | std::string unicode; 334 | Normalize::UnicodeToUTF8Str(unicodes[i], unicode); 335 | chars[i] = unicode; 336 | } 337 | } 338 | 339 | // extract words 340 | words.clear(); 341 | segWrapper_->segment(str, words, false); 342 | 343 | return true; 344 | } 345 | 346 | // generate prefix 347 | // @str: segment tokens 348 | // @termid: token index 349 | // @key_info: index key, it's corresponding term id and tf 350 | // @num: number of keys 351 | bool Generate(const std::string& str 352 | ,const uint32_t termid 353 | ,KeyInfoType& key_info 354 | ,const uint32_t num = 10) { 355 | if (str.empty()) 356 | return false; 357 | 358 | std::vector chars; 359 | std::vector words; 360 | 361 | Parse(str, chars, words); 362 | 363 | std::vector keys; 364 | 365 | // Start generation 366 | GenerateByPrefix(chars, keys, num); 367 | StoreInKeyMap(keys, key_info, termid, prefix_w_); 368 | 369 | GenerateByPinYinPrefix(str, keys, num); 370 | StoreInKeyMap(keys, key_info, termid, pinyin_prefix_w_); 371 | 372 | GenerateByShengMuPrefix(str, keys, num); 373 | StoreInKeyMap(keys, key_info, termid, shengmu_w_); 374 | 375 | GenerateByWordInfix(words, keys, num); 376 | StoreInKeyMap(keys, key_info, termid, word_infix_w_); 377 | 378 | GenerateByWordSuffix(words, keys, num); 379 | StoreInKeyMap(keys, key_info, termid, word_suffix_w_); 380 | 381 | return true; 382 | } 383 | 384 | // generate key by prefix 385 | // @chars: unicodes for input string 386 | // @keys: index key generated based on every unicode 387 | // @num: the number of unicodes chosen to be stored in keys 388 | bool GenerateByPrefix(const std::vector& chars 389 | ,std::vector& keys 390 | ,const uint32_t num ) { 391 | keys.clear(); 392 | std::string 
prefix(""); 393 | for (uint32_t i = 0; i < chars.size() && i < num; ++i) { 394 | prefix += chars[i]; 395 | keys.push_back(prefix); 396 | } 397 | 398 | return true; 399 | } 400 | 401 | // generate key by word infix, 402 | // @words: segment tokens 403 | // @keys: index key generated based on infix in token terms, if the token number < 3 404 | // then return directly, as there are only two chinese characters in string, no infix. 405 | // @num: the number of words chosen to be processed. 406 | bool GenerateByWordInfix(const std::vector& words 407 | ,std::vector& keys 408 | ,const uint32_t num) { 409 | keys.clear(); 410 | uint32_t size = words.size(); 411 | if (size < 3) 412 | return false; 413 | 414 | for (uint32_t i = 0; i < size - 1 && i < num; ++i ) 415 | keys.push_back(words[i]); 416 | 417 | return true; 418 | } 419 | 420 | // generate key by last term(suffix) 421 | // @workds: segment tokens 422 | // @keys: index keys 423 | // @num: the number of words chosen to be processed 424 | bool GenerateByWordSuffix(const std::vector& words 425 | ,std::vector& keys 426 | ,const uint32_t num) { 427 | keys.clear(); 428 | if (words.size() > 1) 429 | keys.push_back(words.back()); 430 | else 431 | return false; 432 | 433 | return true; 434 | } 435 | 436 | // generate key by pinyin prefix 437 | // @str: input string, can be any combination 438 | // @keys: index key generated based on pinyin prefix which generated by Dictionary::GetPinYin() 439 | // @num: the length of prefix 440 | bool GenerateByPinYinPrefix(const std::string& str 441 | ,std::vector& keys 442 | ,const uint32_t num) { 443 | keys.clear(); 444 | if (str.empty()) 445 | return false; 446 | // get pinyin from pinyin module 447 | std::vector pinyin; 448 | pySegDict_->GetPinYin(str, pinyin); 449 | 450 | // extract prefixes from pinyin string 451 | std::set prefix; 452 | for (uint32_t i = 0; i < pinyin.size(); ++i) { 453 | std::string pre(""); 454 | // process every pinyin 455 | for (uint32_t j = 0; j < pinyin[i].size(); 
++j) { 456 | pre += pinyin[i][j]; // each letter 457 | if (prefix.insert(pre).second && pre.length() < num) 458 | keys.push_back(pre); 459 | } 460 | } 461 | 462 | return true; 463 | } 464 | 465 | // generate key by sheng mu prefix 466 | // @str: input string 467 | // @keys: index key generated based on sheng mu prefix of pinyin 468 | // @len: 469 | // eg: 银行 ShengMu 470 | // yin -> y 471 | // hang xing -> y x(polyphone) 472 | // result: y, yx 473 | bool GenerateByShengMuPrefix(const std::string& str 474 | ,std::vector& keys 475 | ,const uint32_t len) { 476 | keys.clear(); 477 | if (str.empty()) 478 | return false; 479 | 480 | std::vector > shm_list; 481 | // convert string to unicodes 482 | std::vector unicodes; 483 | Normalize::ToUnicode(str, unicodes); 484 | if (unicodes.empty()) 485 | return false; 486 | for (uint32_t i = 0; i < unicodes.size(); ++i) { 487 | // get one letter 488 | UCS2Char uchar = unicodes[i]; 489 | std::string ustr; 490 | Normalize::UnicodeToUTF8Str(uchar, ustr); 491 | // type inditification 492 | // space 493 | if (ustr == " ") { 494 | shm_list.push_back(std::vector(1, " ")); // insert a space 495 | } else { 496 | // is chinese char, 497 | if (Normalize::IsChinese(uchar)) { 498 | std::vector shm; 499 | std::vector pinyin; 500 | // std::cout < "T -> pinyin: " << ustr << std::endl; 501 | pySegDict_->GetPinYin(ustr, pinyin); // extract pinyin 502 | GetShengMuByPinYin_(pinyin, shm); // extract sheng mu 503 | shm_list.push_back(std::vector()); 504 | shm_list.back().swap(shm); 505 | } else { 506 | // is alphabet / digital 507 | shm_list.push_back(std::vector(1,ustr)); 508 | } 509 | } 510 | } 511 | // get all shengmu combination includes polyphone 512 | if (shm_list.size() < 1) 513 | return false; 514 | std::vector shengmus = shm_list[0]; // pinyin of the first word 515 | for (uint32_t idx = 1; idx < shm_list.size(); ++idx) { 516 | std::vector& current_shm = shm_list[idx]; 517 | if (current_shm.size() == 1) { // no polyphone 518 | for (uint32_t id = 
0; id < shengmus.size(); ++id ) { 519 | shengmus[id] += current_shm[0]; 520 | } 521 | } else { // with polyphone 522 | std::vector pre_shm; 523 | pre_shm.swap(shengmus); 524 | // combine the previous and current 525 | for (uint32_t pre_idx = 0; pre_idx < pre_shm.size(); ++pre_idx) { 526 | for (uint32_t cur_idx = 0; cur_idx < current_shm.size(); ++cur_idx) { 527 | shengmus.push_back( pre_shm[pre_idx] + current_shm[cur_idx] ); 528 | } 529 | } 530 | } 531 | } 532 | 533 | // get all shengmu prefix remove the duplication 534 | std::set prefixes; 535 | for (uint32_t idx = 0; idx < shengmus.size(); ++idx) { 536 | std::string prefix; 537 | for (uint32_t py_id = 0; py_id < shengmus[idx].size(); ++py_id) { 538 | prefix += shengmus[idx][py_id]; 539 | if (prefixes.insert(prefix).second && prefix.length() < len) 540 | keys.push_back(prefix); 541 | } 542 | } 543 | 544 | return true; 545 | } 546 | }; 547 | 548 | #endif // buildEngine.hpp 549 | -------------------------------------------------------------------------------- /include/correctionEngine.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | @ File Name: correctionEngine.h 3 | @ Method: 4 | @ Author: Jerry Shi 5 | @ Mail: jerryshi0110@gmail.com 6 | @ Created Time: Wed 13 Jul 2016 05:04:48 PM CST 7 | ************************************************************************/ 8 | // Corrrection engine contains pinyin segmentation, pinyin and chinese 9 | // character interconversion, pinyin correction, english words correction 10 | // chinese words correction,etc. 
11 | 12 | #ifndef CORRECTION_ENGINE_H 13 | #define CORRECTION_ENGINE_H 14 | 15 | #include 16 | #include "" 17 | 18 | 19 | 20 | #endif // correctionEngine.h 21 | -------------------------------------------------------------------------------- /include/dictionary.hpp: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | @ File Name: dictionary.hpp 3 | @ Method: 4 | @ Author: Jerry Shi 5 | @ Mail: jerryshi0110@gmail.com 6 | @ Created Time: Wed 13 Jul 2016 05:41:24 PM CST 7 | ************************************************************************/ 8 | // Trie struct for quick find, pinyin tokenizer, pinyin -> chinese character 9 | // chinese -> pinyin, fuzzy pinyin map, pinyin filter map. 10 | 11 | #ifndef DICTIONARY_HPP 12 | #define DICTIONARY_HPP 13 | 14 | #include 15 | #include 16 | 17 | #include "util/normalize.h" 18 | #include "util/darts.h" 19 | 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | 27 | typedef Darts::DoubleArray Trie; 28 | typedef uint16_t UCS2Char; 29 | typedef boost::unordered_map > Cn2PinYinType; // chinese word to pinyin list 30 | typedef boost::unordered_map > PinYin2CnType; // pinyin to word list 31 | 32 | 33 | // Class Dictionary is built on a trie. Usage of this class: 34 | // Dictionary::Segment(): given a string, segment it into pinyin tokens, like "yinhang" , tokens:'yin','hang'; 35 | // Dictionary::GetChar(): given a pinyin, return its chinese character list 36 | // Dictionary::GetPinYin(): given a chinese char, return its pinyin list; polyphones are not distinguished. 
37 | class Dictionary 38 | { 39 | private: 40 | Trie trie_; // trie tree 41 | Cn2PinYinType cn2pinyin_; // chinese character -> pinyin list 42 | PinYin2CnType pinyin2cn_; // pinyin -> chinese character list 43 | std::vector pinyin_; // all single pinyin 44 | boost::unordered_map filter_pinyin_; // filter pinyin 45 | 46 | // load pinyin and chinese character from file 47 | // @dir: the resource path 48 | void Load_(const std::string& dir) { 49 | 50 | // TODO: 51 | // quick load 52 | /* int32_t flag = 1; // to check if the bin file opened successfully. 53 | try { 54 | if ((flag = trie_.open((dir+"/pinyin.bin").c_str())) == 0) { 55 | } 56 | } catch(...) { // do not throw exception 57 | } */ 58 | 59 | std::ifstream ifs((dir+"/pinyin.txt").c_str()); 60 | if(!ifs.is_open()) { 61 | std::cout << "Open " << (dir+"/pinyin.txt") << "failed!\n"; 62 | return; 63 | } 64 | std::string line; 65 | std::set pinyinSet; 66 | while (getline(ifs, line)) { 67 | boost::algorithm::trim(line); 68 | std::vector vec; 69 | boost::algorithm::split(vec, line, boost::is_any_of(" ")); 70 | if (vec.size() != 2) 71 | continue; 72 | std::string cnChar = vec[0]; 73 | std::string pinyin = vec[1].substr(0, vec[1].length()-1); 74 | Normalize::ToUTF8(cnChar); 75 | // filter pinyin 76 | if (filter_pinyin_.find(pinyin) == filter_pinyin_.end()) 77 | AddPinYinCnMap(pinyin, cnChar); 78 | pinyinSet.insert(pinyin); 79 | } 80 | pinyin_.insert(pinyin_.end(), pinyinSet.begin(), pinyinSet.end()); 81 | std::cout << "Resouces loaded pinyin size: " << pinyin_.size() << std::endl; 82 | } 83 | 84 | // Merge 85 | void Merge_( std::vector& tks) { 86 | std::vector flags(tks.size(), 0); 87 | for (uint32_t i = 0; i < tks.size(); ++i) { 88 | uint32_t j = 0; 89 | for (; j < tks[i].length(); ++j) { 90 | if (Normalize::IsDigit(tks[i][j]) 91 | || (Normalize::IsAlpha(tks[i][j]) 92 | && tks[i].length() < 2)) 93 | break; 94 | } 95 | if (j < tks[i].length()) flags[i] = 1; 96 | } 97 | 98 | // TODO: Rule1 99 | // eg: a, b, cd, e 100 | 
// a, b , e needs to be merged 101 | // we make a conclsion that abcde is one word. 102 | for (uint32_t i = 0; i < flags.size(); ++i) { 103 | if ((i>=1) && (i+1) < flags.size()) { 104 | if (flags[i-1] == 1 && flags[i+1] == 1) 105 | flags[i] = 1; 106 | } 107 | } 108 | // TODO: Rule2 109 | // eg: a,p,p,le 110 | // we know it's english word, if the penult token needs to be merged 111 | // we also set the merge flag of the last token to true. 112 | uint32_t k = flags.size() - 1; 113 | if (k >= 1) { 114 | if (flags[k-1] == 1) 115 | flags[k] = 1; 116 | } 117 | 118 | // merge 119 | for (uint32_t i = 0; i < tks.size(); ++i) { 120 | if (flags[i]) { 121 | uint32_t t = i; 122 | ++i; 123 | while (i < tks.size() && flags[i]) { 124 | tks[t] += tks[i]; 125 | // std::cout << "TTT:: " << tks[t] << std::endl; 126 | tks.erase(tks.begin()+i); 127 | flags.erase(flags.begin()+i); 128 | } 129 | } 130 | } 131 | } 132 | 133 | // Clean, remove punctuation string 134 | void Clean_(std::vector& tks) { 135 | for (uint32_t i = 0; i < tks.size(); ++i) { 136 | boost::algorithm::trim(tks[i]); 137 | if (tks[i].length() > 0 138 | && Normalize::IsPunctuation(tks[i][0])) { 139 | tks[i].clear(); 140 | } 141 | } 142 | } 143 | public: 144 | Dictionary(const std::string& dir) { 145 | cn2pinyin_.clear(); 146 | pinyin2cn_.clear(); 147 | pinyin_.clear(); 148 | filter_pinyin_.clear(); 149 | 150 | Init(); 151 | LoadResource(dir); 152 | } 153 | ~Dictionary() { 154 | } 155 | 156 | void Init() { 157 | 158 | // add other pinyin 159 | /* pinyin_.push_back("chon"); 160 | pinyin_.push_back("con"); 161 | pinyin_.push_back("don"); 162 | pinyin_.push_back("gon"); 163 | pinyin_.push_back("hon"); 164 | pinyin_.push_back("jion"); 165 | pinyin_.push_back("kon"); 166 | pinyin_.push_back("lon"); 167 | pinyin_.push_back("non"); 168 | pinyin_.push_back("qion"); 169 | pinyin_.push_back("ron"); 170 | pinyin_.push_back("son"); 171 | pinyin_.push_back("ton"); 172 | pinyin_.push_back("xion"); 173 | pinyin_.push_back("yon"); 174 | 
pinyin_.push_back("zhon"); 175 | pinyin_.push_back("zon"); 176 | */ 177 | // add filter pinyin 178 | filter_pinyin_.insert(std::make_pair("n", 1)); 179 | filter_pinyin_.insert(std::make_pair("ng", 1)); 180 | filter_pinyin_.insert(std::make_pair("m", 1)); 181 | filter_pinyin_.insert(std::make_pair("o", 1)); 182 | } 183 | // load resource from file 184 | void LoadResource(const std::string& dir) { 185 | if (dir.empty()) { 186 | std::cout << "directory is not exists!\n"; 187 | return; 188 | } 189 | 190 | // load pinyin from file 191 | Load_(dir); 192 | // build trie 193 | std::size_t SIZE = pinyin_.size(); 194 | std::vector lengths(SIZE); 195 | typedef Darts::DoubleArray::value_type value_type; 196 | std::vector states(SIZE); 197 | std::vector keys(SIZE); 198 | for (uint32_t i = 0; i < SIZE; ++i) { 199 | keys[i] = pinyin_[i].c_str(); 200 | lengths[i] = pinyin_[i].length(); 201 | states[i] = i; 202 | } 203 | 204 | assert(keys.size() == pinyin_.size()); 205 | trie_.build(keys.size(), &keys[0], &lengths[0], &states[0]); 206 | // TODO: 207 | // save bin file 208 | } 209 | 210 | // Add pinyin and chinese char into map 211 | void AddPinYinCnMap(const std::string& pinyin, const std::string& cnChar) { 212 | // Add cnChar 213 | Cn2PinYinType::iterator cnIter; 214 | std::vector cnChars; 215 | utf8::utf8to16(cnChar.begin(), cnChar.end(), std::back_inserter(cnChars)); 216 | cnIter = cn2pinyin_.find(cnChars[0]); 217 | if (cnIter == cn2pinyin_.end()) { // not found in map 218 | std::vector pinyin_list(1, pinyin); 219 | cn2pinyin_.insert(std::make_pair(cnChars[0], pinyin_list)); 220 | } else { // add in the previous list 221 | std::vector& pinyin_list = cnIter->second; 222 | std::vector::iterator pyIter; 223 | pyIter = std::find(pinyin_list.begin(), pinyin_list.end(), pinyin); 224 | if (pyIter == pinyin_list.end()) { 225 | pinyin_list.push_back(pinyin); 226 | } 227 | } 228 | 229 | // Add pinyin 230 | PinYin2CnType::iterator pyIter; 231 | pyIter = pinyin2cn_.find(pinyin); 232 | if 
(pyIter == pinyin2cn_.end()) { 233 | std::vector cnChar_list(1, cnChars[0]); 234 | pinyin2cn_.insert(std::make_pair(pinyin, cnChar_list)); 235 | } else { 236 | std::vector& cnChar_list = pyIter->second; 237 | std::vector::iterator cnIter; 238 | cnIter = std::find(cnChar_list.begin(), cnChar_list.end(), cnChars[0]); 239 | if (cnIter == cnChar_list.end()) { 240 | cnChar_list.push_back(cnChars[0]); 241 | } 242 | } 243 | } 244 | 245 | // pinyin tokenizer 246 | void Segment(const std::string& pinyin, std::vector& result) { 247 | if (pinyin.empty()) 248 | return; 249 | Fmm(pinyin, result); 250 | Merge_(result); 251 | Clean_(result); 252 | } 253 | 254 | // get chinese character based on pinyin string 255 | bool GetChar(const std::string& pinyin, std::vector& result) { 256 | PinYin2CnType::iterator cnIter; 257 | cnIter = pinyin2cn_.find(pinyin); 258 | if (cnIter != pinyin2cn_.end()) { 259 | std::vector cnChars; 260 | cnChars = cnIter->second; 261 | uint32_t size = cnChars.size(); 262 | result.resize(size); 263 | for (uint32_t i = 0; i < size; ++i) { 264 | std::string utf8str; 265 | Normalize::UnicodeToUTF8Str(cnChars[i], utf8str); 266 | //std::cout << "T: " << utf8str << std::endl; 267 | result[i] = utf8str; 268 | } 269 | return true; 270 | } 271 | return false; 272 | } 273 | 274 | // get pinyin list based on chinese character string 275 | bool GetPinYin(const std::string& cnChar, std::vector& result) { 276 | result.clear(); 277 | if (cnChar.empty()) { 278 | return false; 279 | } 280 | std::vector cnChars; 281 | Normalize::ToUnicode(cnChar, cnChars); 282 | 283 | GetPinYin_(cnChars, "", result); 284 | return true; 285 | } 286 | 287 | // input chinese character and pinyin combination and get the pinyin 288 | // recursive function 289 | void GetPinYin_(const std::vector& cnChars,const std::string& mid_result 290 | ,std::vector& result_list) { 291 | if (result_list.size() >= 1024) 292 | return; 293 | 294 | std::vector pinyin_term_list; 295 | uint32_t offset = 0; 296 | // case 
1, only chinese and has pinyin 297 | if (!cnChars.empty() && Normalize::IsChinese(cnChars[0]) 298 | && (GetPinYinTerm(cnChars[0], pinyin_term_list))) { 299 | std::vector remain(cnChars.begin()+1,cnChars.end()); 300 | // std::cout << "T1: " << remain.size() << std::endl; 301 | std::string new_mid(mid_result); 302 | for (uint32_t i = 0; i < pinyin_term_list.size(); ++i) { 303 | std::string mid = new_mid + pinyin_term_list[i]; 304 | // std::cout << "T2: " << mid << std::endl; 305 | GetPinYin_(remain, mid, result_list); 306 | offset += 1; 307 | } 308 | } else { 309 | if (!cnChars.empty() && !Normalize::IsChinese(cnChars[0])) { 310 | // std::cout << "T33-1: " << cnChars.size() << std::endl; 311 | std::vector remain(cnChars.begin()+1,cnChars.end()); 312 | // std::cout << "T33: " << remain.size() << std::endl; 313 | std::string tmp(""); 314 | if (cnChars.size() != 0) 315 | Normalize::UnicodeToUTF8Str(cnChars[0], tmp); 316 | std::string mid = mid_result + tmp; 317 | //std::cout << "T3: " << mid << std::endl; 318 | GetPinYin_(remain, mid, result_list); 319 | } else { 320 | result_list.push_back(mid_result); 321 | //std::cout << "T4: " << mid_result << std::endl; 322 | } 323 | } 324 | } 325 | 326 | // get pinyin list from chinese character map 327 | bool GetPinYinTerm(const UCS2Char& cnChar, std::vector& result) { 328 | Cn2PinYinType::iterator cnIter; 329 | cnIter = cn2pinyin_.find(cnChar); 330 | if (cnIter != cn2pinyin_.end()) { 331 | result = cnIter->second; 332 | return true; 333 | } 334 | return false; 335 | } 336 | 337 | // Reload 338 | // get pinyin list from chinese character map 339 | bool GetPinYinTerm(const std::string& cnChar, std::vector& result) { 340 | std::vector cnChars; 341 | utf8::utf8to16(cnChar.begin(), cnChar.end(), std::back_inserter(cnChars)); 342 | Cn2PinYinType::iterator cnIter; 343 | cnIter = cn2pinyin_.find(cnChars[0]); 344 | if (cnIter != cn2pinyin_.end()) { 345 | result = cnIter->second; 346 | return true; 347 | } 348 | return false; 349 | } 350 
| 351 | // maximum match 352 | void Fmm(const std::string& line, std::vector& r) { 353 | r.clear(); 354 | std::string uline(line); 355 | std::vector lens, cumu_lens; 356 | 357 | // remove invalid encoding 358 | Normalize::RemoveInvalidUTF8(uline); 359 | std::string::iterator it = uline.begin(); 360 | while (it != uline.end()) { 361 | uint32_t code = utf8::next(it, uline.end()); 362 | std::string _str; // insert from string back 363 | utf8::append(code, std::back_inserter(_str)); 364 | lens.push_back(_str.length()); 365 | if (cumu_lens.size() > 0) { 366 | cumu_lens.push_back(_str.length()+cumu_lens.back()); 367 | } else { 368 | cumu_lens.push_back(_str.length()); 369 | } 370 | } 371 | 372 | // start maximum match 373 | std::size_t key_pos = 0; 374 | for (std::size_t j = 0; j < lens.size(); ++j) { 375 | std::size_t last_j = j, jj = j; 376 | Trie::value_type last_state = -1; 377 | Trie::value_type state; 378 | std::size_t node_pos = 0; 379 | 380 | // traverse trie and check the node exist or not 381 | while (j < lens.size() 382 | && (state=trie_.traverse(uline.c_str(), node_pos, key_pos, cumu_lens[j])) != -2) { 383 | j++; 384 | if (state < 0) 385 | continue; 386 | last_state = state; 387 | last_j = j -1; 388 | } 389 | 390 | // found 391 | if (last_state >=0) { 392 | std::string py; 393 | if ((uint32_t)last_state < pinyin_.size()) { 394 | py = std::string(uline.c_str()+cumu_lens[jj]-lens[jj],uline.c_str()+cumu_lens[last_j]); 395 | r.push_back(py); 396 | } 397 | } else { 398 | std::string py; 399 | py = std::string(uline.c_str()+cumu_lens[jj]-lens[jj], uline.c_str()+cumu_lens[jj]); 400 | r.push_back(py); 401 | } 402 | j = last_j; 403 | key_pos = cumu_lens[j]; 404 | } 405 | } 406 | 407 | }; 408 | 409 | #endif // dictionary.hpp 410 | -------------------------------------------------------------------------------- /include/pinyinConvert.h: -------------------------------------------------------------------------------- 1 | 
/************************************************************************* 2 | @ File Name: pinyinConvert.h 3 | @ Method: 4 | @ Author: Jerry Shi 5 | @ Mail: jerryshi0110@gmail.com 6 | @ Created Time: Wed 09 Mar 2016 10:30:45 AM CST 7 | ************************************************************************/ 8 | #ifndef PINYINCONVERT_H 9 | #define PINYINCONVERT_H 10 | 11 | #include 12 | #include 13 | #include 14 | #include "mtrie.h" 15 | #include "py_types.h" 16 | 17 | class PinYinConvert 18 | { 19 | 20 | typedef MTrie PinyinDictType; 21 | typedef MTrie FuzzyDictType; 22 | typedef boost::unordered_map > Cn2PinyinType; 23 | typedef boost::unordered_map > Pinyin2CnType; 24 | 25 | typedef boost::tuple QueryLogType; 26 | typedef std::pair PropertyLabelType; 27 | struct TransProbType 28 | { 29 | boost::unordered_map u_trans_prob_; 30 | boost::unordered_map b_trans_prob_; 31 | boost::unordered_map t_trans_prob_; 32 | void clear() 33 | { 34 | u_trans_prob_.clear(); 35 | b_trans_prob_.clear(); 36 | t_trans_prob_.clear(); 37 | } 38 | bool empty() const 39 | { 40 | return u_trans_prob_.empty() && b_trans_prob_.empty() && t_trans_prob_.empty(); 41 | } 42 | }; 43 | 44 | public: 45 | explicit PinYinConvert(const std::string& collection_dir = ""); 46 | 47 | 48 | // load resource 49 | bool load(); 50 | 51 | bool getResult(const izenelib::util::UString& input, 52 | std::vector >& pinyin_list, 53 | std::vector& output); 54 | 55 | void AddPinyinMap(const std::string& pinyin, const izenelib::util::UCS2Char& cn_char); 56 | 57 | void getPinyin(const izenelib::util::UString& cn_chars, std::vector& result_list); 58 | 59 | void getChar(const std::string& pinyin, std::vector& result_list); 60 | 61 | void getRelativeList(const izenelib::util::UString& hanzi, 62 | std::vector >& ResultList); 63 | 64 | static std::string res_dir_; 65 | 66 | private: 67 | void loadRawTextTransProb_(TransProbType& trans_prob, const std::string file); 68 | 69 | void flushRawTextTransProb_(const std::string& 
file, const TransProbType& trans_prob); 70 | 71 | void transProb_(const izenelib::util::UCS2Char& from, const izenelib::util::UCS2Char& to); 72 | 73 | void updateItem_(TransProbType& trans_prob, const uint32_t df, const izenelib::util::UString& text); 74 | 75 | int getInputType_(const izenelib::util::UString& input); 76 | 77 | bool getResultWithScore_(const izenelib::util::UString& input 78 | int type, 79 | std::vector >& pinyin_list, 80 | std::vector& output); 81 | 82 | void getResultByPinyin_(const std::string& pinyin, double pinyin_score, 83 | std::vector& output); 84 | 85 | // trigram 86 | void getResultByPinyinT_(const std::string& pinyin, double pinyin_score, 87 | std::vector& output); 88 | 89 | void getResultByPinyinTRecur_(const std::string& pinyin,double base_score, 90 | std::pair& mid_result, 91 | std::vector& output); 92 | 93 | double getScore_(const izenelib::util::UString& text,double ori_score, 94 | double pinyin_score); 95 | 96 | bool isCandidate_(const izenelib::util::UString& text, double ori_score, 97 | double pinyin_score, double& score); 98 | 99 | bool isCandidateResult_(const izenelib::util::UString& text, double ori_score, 100 | double pinyin_score, double& score); 101 | 102 | 103 | static TransProbType global_trans_prob_; 104 | TransProbType collection_trans_prob_; 105 | 106 | std::string collection_dir_; 107 | double threshold_; 108 | double mid_threshold_; 109 | uint16_t max_pinyin_term_; 110 | 111 | PinyinDictType pinyin_dict_; 112 | Pinyin2CnType pinyin2cn_; 113 | Cn2PinyinType cn2pinyin_; 114 | boost::unordered_map filter_pinyin_; 115 | 116 | boost::mutex mutex_; 117 | } 118 | 119 | 120 | 121 | #endif 122 | -------------------------------------------------------------------------------- /include/pinyinEngine.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | @ File Name: pinyinEngine.h 3 | @ Method: 4 | @ Author: Jerry Shi 
5 | @ Mail: jerryshi0110@gmail.com 6 | @ Created Time: Mon 11 Jul 2016 01:50:06 PM CST 7 | ************************************************************************/ 8 | #ifndef PINYIN_ENGINE_H 9 | #define PINYIN_ENGINE_H 10 | 11 | 12 | #include 13 | #include "util/mtrie.h" 14 | #include "util/py_types.h" 15 | 16 | 17 | class PinYinEngine { 18 | private: 19 | typedef MTrie TrieType; 20 | 21 | struct TransProbType { 22 | // unigram transition probability 23 | boost::unordered_map u_trans_prob_; 24 | boost::unordered_map b_trans_prob_; // bigram 25 | boost::unordered_map t_trans_prob_; // trigram 26 | 27 | // clear all data 28 | void clear() { 29 | u_trans_prob_.clear(); 30 | b_trans_prob_.clear(); 31 | t_trans_prob_.clear(); 32 | } 33 | 34 | bool empty() { 35 | return u_trans_prob_.empty() && b_trans_prob_.empty() && t_trans_prob_.empty(); 36 | } 37 | }; 38 | 39 | public: 40 | // load resource 41 | bool Load() { 42 | } 43 | 44 | // get chinese character 45 | void GetCnChar() { 46 | } 47 | 48 | // get pinyin from chinese character 49 | void GetPinYin() { 50 | } 51 | 52 | void GetPinYinWithScore() { 53 | } 54 | 55 | // pinyin tokenizer 56 | void PySegment() { 57 | } 58 | 59 | // fuzzy segmentation 60 | void FuzzyPySegment() { 61 | } 62 | 63 | private: 64 | 65 | 66 | 67 | 68 | }; 69 | 70 | #endif // pinyinEngine.h 71 | -------------------------------------------------------------------------------- /include/segment/hash_table.hpp: -------------------------------------------------------------------------------- 1 | #ifndef HASH_TABLE_HPP_ 2 | #define HASH_TABLE_HPP_ 3 | #include 4 | #include 5 | #include 6 | #include "kstring.hpp" 7 | 8 | 9 | template 10 | class KIntegerHashTable 11 | { 12 | typedef struct _NODE_ 13 | { 14 | union{ 15 | uint8_t key_[sizeof(KEY_T)]; 16 | KEY_T key_type_value_; 17 | }; 18 | union{ 19 | uint8_t value_[sizeof(VALUE_T)]; 20 | VALUE_T value_type_value_; 21 | }; 22 | union{ 23 | uint8_t next_[sizeof(uint32_t)]; 24 | uint32_t next_type_value_; 
25 | }; 26 | 27 | _NODE_(const KEY_T& k, const VALUE_T& v, uint32_t ne = -1) 28 | { 29 | key_type_value_ = k; 30 | value_type_value_ = v; 31 | next_type_value_ = ne; 32 | } 33 | 34 | _NODE_() 35 | { 36 | next_type_value_ = -1; 37 | } 38 | 39 | KEY_T key()const 40 | { 41 | return key_type_value_; 42 | } 43 | 44 | VALUE_T value()const 45 | { 46 | return value_type_value_; 47 | } 48 | 49 | KEY_T& key() 50 | { 51 | return key_type_value_; 52 | } 53 | 54 | VALUE_T& value() 55 | { 56 | return value_type_value_; 57 | } 58 | 59 | uint32_t& next() 60 | { 61 | return next_type_value_; 62 | } 63 | 64 | uint32_t next()const 65 | { 66 | return next_type_value_; 67 | } 68 | 69 | bool operator == (const struct _NODE_& o)const 70 | { 71 | return key() == o.key(); 72 | } 73 | 74 | _NODE_& operator = (const struct _NODE_& o) 75 | { 76 | key() = o.key(); 77 | value() = o.value(); 78 | next() = o.next(); 79 | return *this; 80 | } 81 | } node_t; 82 | 83 | node_t* nodes_; 84 | uint32_t nodes_num_; 85 | uint32_t* entry_; 86 | uint32_t entry_size_; 87 | uint32_t avai_i_; 88 | uint32_t size_; 89 | 90 | uint32_t expansion_(uint32_t len) 91 | { 92 | if (len < 1000)return 2*len; 93 | if (len < 10000)return 1.5*len; 94 | return 1.1*len; 95 | } 96 | 97 | uint32_t available_() 98 | { 99 | if (size_+2 < nodes_num_) 100 | return avai_i_; 101 | uint32_t nn = expansion_(nodes_num_); 102 | node_t* n = new node_t[nn]; 103 | memcpy(n, nodes_, nodes_num_*sizeof(node_t)); 104 | for ( uint32_t i=nodes_num_-1; i entry_size_) 136 | { 137 | uint32_t* e = new uint32_t[ent_size]; 138 | memcpy(e, entry_, entry_size_*sizeof(uint32_t)); 139 | delete[] entry_; 140 | entry_ = e; 141 | entry_size_ = ent_size; 142 | } 143 | if (element_num > nodes_num_) 144 | { 145 | node_t* n = new node_t[element_num]; 146 | memcpy(n, nodes_, nodes_num_*sizeof(node_t)); 147 | delete[] nodes_; 148 | for ( uint32_t i=nodes_num_-1; i* ptr_; 287 | uint32_t idx_; 288 | uint32_t ei_; 289 | 290 | public: 291 | iterator(KIntegerHashTable* 
ptr = NULL, uint32_t idx = -1, uint32_t ei = 0) 292 | :ptr_(ptr),idx_(idx),ei_(ei) 293 | {} 294 | 295 | iterator& operator ++(int) 296 | { 297 | if (!ptr_ || idx_ == (uint32_t)-1) 298 | return *this; 299 | 300 | idx_ = ptr_->nodes_[idx_].next(); 301 | if(idx_ != (uint32_t)-1) 302 | return *this; 303 | 304 | ei_++; 305 | while(ei_ < ptr_->entry_size_ && ptr_->entry_[ei_] == (uint32_t)-1) 306 | ei_++; 307 | 308 | if (ei_ >= ptr_->entry_size_) 309 | return *this; 310 | 311 | idx_ = ptr_->entry_[ei_]; 312 | return *this; 313 | } 314 | 315 | iterator& operator ++() 316 | { 317 | return (*this)++; 318 | } 319 | 320 | KEY_T* key() 321 | { 322 | if (idx_ == (uint32_t)-1) 323 | return NULL; 324 | return &(ptr_->nodes_[idx_].key()); 325 | } 326 | 327 | VALUE_T* value() 328 | { 329 | if (idx_ == (uint32_t)-1) 330 | return NULL; 331 | return &(ptr_->nodes_[idx_].value()); 332 | } 333 | 334 | bool operator == (const iterator& o)const 335 | { 336 | return ptr_ == o.ptr_ && idx_ == o.idx_; 337 | } 338 | 339 | bool operator != (const iterator& o)const 340 | { 341 | return ptr_ != o.ptr_ || idx_ != o.idx_; 342 | } 343 | }; 344 | 345 | iterator begin() 346 | { 347 | uint32_t ei = 0; 348 | while(ei < entry_size_ && entry_[ei] == (uint32_t)-1) 349 | ei++; 350 | if (ei >= entry_size_) 351 | return end(); 352 | return iterator(this, entry_[ei], ei); 353 | } 354 | 355 | iterator end() 356 | { 357 | return iterator(this, -1); 358 | } 359 | }; 360 | 361 | template< 362 | class KEY_T, 363 | class VALUE_T 364 | > 365 | class KStringHashTable 366 | { 367 | KIntegerHashTable table_; 368 | 369 | uint64_t hash_(const std::string& str)const 370 | { 371 | // return HashFunction::generateHash64(str); 372 | } 373 | 374 | uint64_t hash_(const KString& kstr)const 375 | { 376 | std::string str = kstr.get_bytes("utf-8"); 377 | return hash_(str); 378 | //return izenelib::util::HashFunction::generateHash64((char*)kstr.get_bytes(), kstr.length()*sizeof(uint16_t)); 379 | } 380 | 381 | public: 382 | 
KStringHashTable(uint32_t ent_size = 100000, uint32_t element_num = 50000) 383 | :table_(ent_size, element_num) 384 | {} 385 | 386 | void reserve(uint32_t ent_size, uint32_t element_num) 387 | { 388 | table_.reserve(ent_size, element_num); 389 | } 390 | 391 | void insert(const KEY_T& k, const VALUE_T& v) 392 | { 393 | uint64_t h = hash_(k); 394 | table_.insert(h, v); 395 | } 396 | 397 | void insert(const uint64_t& k, const VALUE_T& v) 398 | { 399 | table_.insert(k, v); 400 | } 401 | 402 | VALUE_T* find(const KEY_T& k) 403 | { 404 | uint64_t h = hash_(k); 405 | return table_.find(h); 406 | } 407 | 408 | VALUE_T* find(const uint64_t& k) 409 | { 410 | return table_.find(k); 411 | } 412 | 413 | bool erase(const KEY_T& k) 414 | { 415 | uint64_t h = hash_(k); 416 | return table_.erase(h); 417 | } 418 | 419 | void persistence(const std::string& nm)const 420 | { 421 | table_.persistence(nm); 422 | } 423 | 424 | void load(const std::string& nm) 425 | { 426 | table_.load(nm); 427 | } 428 | 429 | uint32_t size()const 430 | { 431 | return table_.size(); 432 | } 433 | 434 | typedef typename KIntegerHashTable::iterator iterator; 435 | 436 | iterator begin() 437 | { 438 | return table_.begin(); 439 | } 440 | 441 | iterator end() 442 | { 443 | return table_.end(); 444 | } 445 | 446 | }; 447 | #endif 448 | 449 | 450 | -------------------------------------------------------------------------------- /include/segment/kstring.hpp: -------------------------------------------------------------------------------- 1 | #ifndef KSTRING__HPP 2 | #define KSTRING__HPP 3 | 4 | #include "sys/types.h" 5 | #include "/usr/include/iconv.h" 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | #if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 2) 18 | # include 19 | #else 20 | # include 21 | #endif 22 | 23 | /** 24 | * @brief Unicode string. 
25 | * Structrue is like this: 26 | * { 27 | * uint32_t reference_count; 28 | * uint32_t the capacity of buffer in sizeof(uint16_t); 29 | * uint32_t # of chars of string; 30 | * uint16_t* unicode array of string; 31 | * } 32 | */ 33 | class KString 34 | { 35 | uint32_t* mem_; 36 | 37 | #define CHECK_NULL(mem) {if(!mem)return;} 38 | 39 | uint32_t& reference_count_() 40 | { 41 | assert(mem_ != NULL); 42 | return (*(uint32_t*)mem_); 43 | } 44 | 45 | uint32_t& char_num_() 46 | { 47 | assert(mem_ != NULL); 48 | return (*(uint32_t*)(mem_+2)); 49 | } 50 | 51 | uint32_t& capacity_() 52 | { 53 | assert(mem_ != NULL); 54 | return (*(uint32_t*)(mem_+1)); 55 | } 56 | 57 | uint32_t capacity_()const 58 | { 59 | if (!mem_)return 0; 60 | return (*(uint32_t*)(mem_+1)); 61 | } 62 | 63 | uint32_t total_bytes_(uint32_t char_num = 0)const 64 | { 65 | if (char_num == 0 && mem_) 66 | return sizeof(uint32_t)*3 +sizeof(uint16_t)*capacity_(); 67 | return sizeof(uint32_t)*3 +sizeof(uint16_t)*char_num; 68 | } 69 | 70 | uint16_t* unicodes_() 71 | { 72 | assert(mem_ != NULL); 73 | return (uint16_t*)(mem_+3); 74 | } 75 | 76 | uint16_t* unicodes_()const 77 | { 78 | assert(mem_ != NULL); 79 | return (uint16_t*)(mem_+3); 80 | } 81 | 82 | void refer_() 83 | { 84 | CHECK_NULL(mem_); 85 | __gnu_cxx::__atomic_add(( _Atomic_word*)(uint32_t*)mem_, 1); 86 | } 87 | 88 | void defer_() 89 | { 90 | CHECK_NULL(mem_); 91 | if (__sync_add_and_fetch(( _Atomic_word*)(uint32_t*)mem_, -1) <= 0) 92 | { 93 | free(mem_); 94 | mem_ = NULL; 95 | } 96 | } 97 | 98 | uint32_t string_len_()const 99 | { 100 | if (!mem_)return 0; 101 | return (*(uint32_t*)(mem_+2)); 102 | } 103 | 104 | uint32_t expansion_(uint32_t t) 105 | { 106 | if (t + string_len_() < 100)return 2*(t + string_len_()); 107 | if (t + string_len_() < 1000)return 1.5*(t + string_len_()); 108 | return 1.1*(t + string_len_()); 109 | } 110 | 111 | void copy_on_write_() 112 | { 113 | CHECK_NULL(mem_); 114 | assert(reference_count_() > 0); 115 | if 
(reference_count_() == 1) 116 | return; 117 | uint32_t s = total_bytes_(); 118 | uint32_t* m = (uint32_t*)malloc(s); 119 | memcpy(m, mem_, s); 120 | defer_(); 121 | mem_ = m; 122 | reference_count_() = 1; 123 | } 124 | 125 | std::string encoding_name_(const std::string encode) 126 | { 127 | if (strcmp(encode.c_str(), "utf8") == 0) 128 | return "utf-8"; 129 | return encode; 130 | } 131 | public: 132 | explicit KString(const std::string& str, const std::string& encode="utf-8//IGNORE") 133 | :mem_(NULL) 134 | { 135 | if (str.length() == 0) 136 | return; 137 | reserve(str.length()); 138 | std::size_t inlen = str.length(); 139 | std::size_t outlen = capacity_()*2; 140 | char* out = (char*)unicodes_(); 141 | char* in = const_cast (str.c_str()); 142 | 143 | iconv_t hdl = iconv_open("ucs-2", encode.c_str());//encoding_name_(encode).c_str()) ; 144 | if (hdl == (iconv_t)-1) 145 | throw std::runtime_error("Can't initiate iconv handler"); 146 | std::size_t ret; 147 | while(1) 148 | { 149 | ret = iconv(hdl, &in, &inlen, &out, &outlen); 150 | if (inlen == 0) 151 | break; 152 | if (ret == (std::size_t)-1) 153 | { 154 | iconv_close(hdl); 155 | if(errno == E2BIG) 156 | throw std::runtime_error("Not enough output buffer for conversion."); 157 | if (errno == EINVAL) 158 | throw std::runtime_error("Incomplete multibyte sequence."); 159 | if (errno == EILSEQ)//std::cerr<<"Invalid multibyte sequence.\n"; 160 | throw std::runtime_error("Invalid multibyte sequence."); 161 | throw std::runtime_error("iconv error"); 162 | } 163 | } 164 | iconv_close(hdl); 165 | char_num_() = (capacity_()*2 - outlen)/2; 166 | reference_count_() = 1; 167 | } 168 | ~KString() 169 | { 170 | defer_(); 171 | } 172 | 173 | explicit KString() 174 | :mem_(NULL) 175 | {} 176 | 177 | /** 178 | * @brief 179 | * 180 | * @param s 181 | * @param e exclusivly 182 | */ 183 | explicit KString(uint16_t* s, uint16_t* e) 184 | { 185 | uint32_t len = (e - s); 186 | uint32_t b = total_bytes_(len); 187 | mem_ = 
(uint32_t*)malloc(b); 188 | memset(mem_, 0, b); 189 | memcpy(unicodes_(), s, len*sizeof(uint16_t)); 190 | capacity_() = len; 191 | char_num_() = len; 192 | reference_count_() = 1; 193 | } 194 | 195 | KString(const KString& o) 196 | { 197 | mem_ = o.mem_; 198 | refer_(); 199 | } 200 | 201 | KString& operator = (const KString& o) 202 | { 203 | defer_(); 204 | mem_ = o.mem_; 205 | refer_(); 206 | return *this; 207 | } 208 | 209 | uint32_t length()const 210 | { 211 | return string_len_(); 212 | } 213 | 214 | uint16_t char_at(uint32_t i)const 215 | { 216 | assert(i < string_len_()); 217 | return unicodes_()[i]; 218 | } 219 | 220 | uint16_t& operator [] (uint32_t i) 221 | { 222 | assert(i < string_len_()); 223 | copy_on_write_(); 224 | return unicodes_()[i]; 225 | } 226 | 227 | uint16_t operator [] (uint32_t i)const 228 | { 229 | return char_at(i); 230 | } 231 | 232 | int32_t compare_to(const KString& o)const 233 | { 234 | uint32_t i=0; 235 | for ( ; i o.char_at(i)) 239 | return 1; 240 | 241 | if (length() > i) 242 | return 1; 243 | else if (o.length() > i) 244 | return -1; 245 | return 0; 246 | } 247 | 248 | void reserve(uint32_t len) 249 | { 250 | if (mem_ && len < capacity_()) 251 | return; 252 | 253 | uint32_t s = total_bytes_(len); 254 | assert(s > sizeof(uint32_t)*3); 255 | uint32_t* m = (uint32_t*)malloc(s); 256 | memset(m, 0, s); 257 | if (mem_) 258 | { 259 | assert(total_bytes_() > sizeof(uint32_t)*3); 260 | assert(total_bytes_() < s); 261 | memcpy(m, mem_, total_bytes_()); 262 | defer_(); 263 | } 264 | mem_ = m; 265 | capacity_() = len; 266 | reference_count_() = 1; 267 | assert(capacity_() >= char_num_()); 268 | } 269 | 270 | void concat(const KString& o) 271 | { 272 | if (o.length() == 0)return; 273 | if (length() == 0) 274 | { 275 | *this = o; 276 | return; 277 | } 278 | if (mem_ && length() + o.length() < capacity_()) 279 | { 280 | copy_on_write_(); 281 | uint32_t l = char_num_(); 282 | char_num_() += o.length(); 283 | for ( uint32_t i=0; i=0 && j>=0; 
--i,--j) 343 | if (o[i] != char_at(j)) 344 | return false; 345 | return true; 346 | } 347 | 348 | bool end_with(const std::string& utf8str)const 349 | { 350 | return end_with(KString(utf8str)); 351 | } 352 | 353 | bool equals(const KString& o)const 354 | { 355 | return compare_to(o) == 0; 356 | } 357 | 358 | bool equals(const std::string& utf8str)const 359 | { 360 | return compare_to(KString(utf8str)) == 0; 361 | } 362 | 363 | bool operator == (const KString& o)const 364 | { 365 | return equals(o); 366 | } 367 | 368 | bool operator == (const std::string& utf8str)const 369 | { 370 | return equals(KString(utf8str)); 371 | } 372 | 373 | uint16_t* get_bytes()const 374 | { 375 | return unicodes_(); 376 | } 377 | 

    // Convert the stored UCS-2 code units to the encoding named by `encode`
    // using iconv.
    //   encode : iconv name of the target encoding, e.g. "utf-8".
    // Returns the converted bytes; an empty string when this string is empty.
    // Throws std::runtime_error when the iconv handle cannot be opened or when
    // the output does not fit in 3 bytes per stored code unit.
    std::string get_bytes(const std::string& encode)const
    {
        if (length() == 0)return "";

        // The output buffer is a std::string, so it is released on every exit
        // path. The previous raw buffer was allocated with new[] but released
        // with plain `delete` on one path.
        std::string out(length()*3, '\0');
        char* outbuf = &out[0];
        std::size_t outlen = out.size();
        char* inbuf = (char*)unicodes_();
        std::size_t inlen = length()*sizeof(uint16_t);

        iconv_t hdl = iconv_open(encode.c_str(), "ucs-2") ;
        if (hdl == (iconv_t)-1)
            throw std::runtime_error("Can't initiate iconv handler");

        while (inlen > 0)
        {
            // Any result other than (size_t)-1 means the whole input was
            // converted; the value itself is only a count of non-reversible
            // conversions and may be non-zero. Treating non-zero as "go on"
            // used to decrement inlen past zero.
            if (iconv(hdl, &inbuf, &inlen, &outbuf, &outlen) != (std::size_t)-1)
                break;

            if (errno == E2BIG)
            {
                iconv_close(hdl);
                throw std::runtime_error("encoding convert error");
            }

            // Unconvertible or incomplete input: drop one whole 2-byte code
            // unit (or whatever is left) and carry on. Skipping a single byte
            // would misalign every code unit that follows.
            std::size_t skip = inlen < sizeof(uint16_t) ? inlen : sizeof(uint16_t);
            inbuf += skip;
            inlen -= skip;
        }

        iconv_close(hdl);

        // outlen is the unused tail of the buffer.
        out.resize(out.size() - outlen);
        return out;
    }

421 | friend std::ostream& operator << (std::ostream& os, const KString& o) 422 | { 423 | os << o.get_bytes("utf-8"); 424 | return os; 425 | } 426 | 427 | uint32_t index_of(uint16_t c, 
uint32_t start_from=0)const 428 | { 429 | for ( uint32_t i=start_from; i length()) 443 | return -1; 444 | for ( uint32_t i=0; i<=length()-o.length(); ++i) 445 | if (char_at(i) == o[0]) 446 | { 447 | uint32_t j=0; 448 | for ( ; j=p+a.length(); --i,++j) 498 | unicodes_()[length() + b.length() - a.length()-j] = unicodes_()[i]; 499 | for ( uint32_t i=0; i 534 | split(uint16_t c) 535 | { 536 | std::vector v; 537 | uint32_t s = 0; 538 | uint32_t f = index_of(c); 539 | while (f != (uint32_t)-1) 540 | { 541 | v.push_back(substr(s, f- s)); 542 | s = f+1; 543 | f = index_of(c, s); 544 | } 545 | if (s < length()) 546 | v.push_back(substr(s)); 547 | return v; 548 | } 549 | 550 | void to_lower_case() 551 | { 552 | bool f = true; 553 | for ( uint32_t i=0; i= 'A' && char_at(i)<='Z') 555 | { 556 | f = false; 557 | break; 558 | } 559 | if (f)return; 560 | copy_on_write_(); 561 | for ( uint32_t i=0; i= 'A' && char_at(i)<='Z') 563 | unicodes_()[i] = 'a' + char_at(i) - 'A'; 564 | return; 565 | } 566 | 567 | void to_upper_case() 568 | { 569 | bool f = true; 570 | for ( uint32_t i=0; i= 'a' && char_at(i)<='z') 572 | { 573 | f = false; 574 | break; 575 | } 576 | if (f)return; 577 | copy_on_write_(); 578 | for ( uint32_t i=0; i= 'a' && char_at(i)<='z') 580 | unicodes_()[i] = 'A' + char_at(i) - 'a'; 581 | return; 582 | } 583 | 584 | void to_dbc() 585 | { 586 | bool f = true; 587 | for ( uint32_t i=0; i 65280 && char_at(i) < 65375)) 589 | { 590 | f = false; 591 | break; 592 | } 593 | if (f)return; 594 | copy_on_write_(); 595 | for ( uint32_t i=0; i 65280 && char_at(i) < 65375) 599 | unicodes_()[i] -= 65248; 600 | } 601 | 602 | void trim(uint16_t space = ' ') 603 | { 604 | uint32_t p = index_of(space); 605 | if (p == (uint32_t)-1) 606 | return; 607 | 608 | copy_on_write_(); 609 | uint32_t r = p +1; 610 | uint32_t t = 1; 611 | for(;r= length()) 628 | { 629 | copy_on_write_(), char_num_() = 0; 630 | return; 631 | } 632 | if (p != 0) 633 | { 634 | copy_on_write_(); 635 | for (uint32_t i=0;i+p=0 
&& char_at(i)==space; --i,--t); 640 | 641 | if ( p == 0 && length()!= t) 642 | copy_on_write_(); 643 | char_num_() = t; 644 | } 645 | 646 | void trim_into_1(uint16_t space = ' ') 647 | { 648 | uint32_t f = 0, t = 0, s = -2; 649 | bool chng = false; 650 | while (f < length()) 651 | { 652 | if (char_at(f) == space) 653 | { 654 | if (f - s == 1){f++,s++;continue;} 655 | s = f; 656 | } 657 | 658 | if (t != f && !chng) 659 | { 660 | copy_on_write_(); 661 | chng = true; 662 | } 663 | 664 | if (t != f) 665 | unicodes_()[t] = char_at(f); 666 | t++, f++; 667 | } 668 | if (chng) char_num_() = t; 669 | } 670 | 671 | static KString value_of(uint32_t v) 672 | { 673 | char buf[125]; 674 | memset(buf, 0, sizeof(buf)); 675 | sprintf(buf, "%d", v); 676 | return KString(std::string(buf)); 677 | } 678 | 679 | static KString value_of(int v) 680 | { 681 | char buf[125]; 682 | memset(buf, 0, sizeof(buf)); 683 | sprintf(buf, "%d", v); 684 | return KString(std::string(buf)); 685 | } 686 | 687 | static KString value_of(double v) 688 | { 689 | char buf[125]; 690 | memset(buf, 0, sizeof(buf)); 691 | sprintf(buf, "%f", v); 692 | return KString(std::string(buf)); 693 | } 694 | 695 | bool operator < (const KString& o)const 696 | { 697 | for ( uint32_t i=0; i o[i]) 701 | return false; 702 | 703 | if (length() < o.length()) 704 | return true; 705 | return false; 706 | } 707 | 708 | static bool is_korean(uint16_t ucs2char) 709 | { 710 | if ((ucs2char>=0x1100 && ucs2char<=0x11FF) 711 | ||(ucs2char>=0x3130 && ucs2char<=0x318F) 712 | ||(ucs2char>=0xAC00 && ucs2char<=0xD7AF) 713 | )return true; 714 | return false; 715 | } 716 | 717 | static bool is_chinese(uint16_t ucs2char) 718 | { 719 | if (((ucs2char>=0x2E80 && ucs2char<=0x2EF3) 720 | || (ucs2char>=0x2F00 && ucs2char<=0x2FD5) 721 | || (ucs2char>=0x3400 && ucs2char<=0x4DB5) 722 | || (ucs2char>=0x4E00 && ucs2char<=0x9FC3) 723 | || (ucs2char>=0xF900 && ucs2char<=0xFAD9)) 724 | && ucs2char!=12289 725 | && ucs2char!=12298 726 | && ucs2char!=12290 727 | 
&& ucs2char!=12299 728 | && ucs2char!=65292 729 | && ucs2char!=65311 730 | && ucs2char!=65281 731 | && ucs2char!=65306 732 | && ucs2char!=65307 733 | && ucs2char!=8220 734 | && ucs2char!=8221 735 | && ucs2char!=12304 736 | && ucs2char!=12305 737 | && ucs2char!=65509 738 | && ucs2char!=8230 739 | && ucs2char!=65288 740 | && ucs2char!=65289 741 | && ucs2char!=8212 742 | && ucs2char!=20022) 743 | return true; 744 | 745 | return false; 746 | } 747 | 748 | static bool is_chn_numeric(uint16_t ucs2char) 749 | { 750 | if (ucs2char == 38646//零 751 | || ucs2char == 19968//一 752 | || ucs2char == 20108//二 753 | || ucs2char == 19977 754 | || ucs2char == 22235 755 | || ucs2char == 20116 756 | || ucs2char == 20845 757 | || ucs2char == 19971 758 | || ucs2char == 20843 759 | || ucs2char == 20061 760 | || ucs2char == 21313//十 761 | ) 762 | return true; 763 | return false; 764 | } 765 | 766 | static bool is_numeric(uint16_t ucs2char) 767 | { 768 | static const uint16_t zero('0'), nine('9'); 769 | if ( zero <= ucs2char && ucs2char <= nine ) 770 | return true; 771 | return false; 772 | } 773 | 774 | static bool is_english(uint16_t ucs2char) 775 | { 776 | static const uint16_t a('a'), z('z'), A('A'), Z('Z'); 777 | if ( ( a <= ucs2char && ucs2char <= z ) || ( A <= ucs2char && ucs2char <= Z ) ) 778 | return true; 779 | return false; 780 | } 781 | 782 | }; 783 | #endif 784 | -------------------------------------------------------------------------------- /include/segment/line_reader.h: -------------------------------------------------------------------------------- 1 | 2 | #ifndef LINE_READER_H_ 3 | #define LINE_READER_H_ 4 | 5 | #include 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | namespace util{ 13 | 14 | class LineReader 15 | { 16 | FILE* f_; 17 | char* mem_; 18 | uint64_t bytes_; 19 | 20 | bool next_block_() 21 | { 22 | if (::feof(f_)) 23 | return false; 24 | memset(mem_, 0, bytes_); 25 | uint64_t p = ftell(f_); 26 | if(fread(mem_, bytes_, 1, f_)!=1); 27 | 
//std::cout<= mem_); 76 | assert(prev_line < mem_+bytes_); 77 | //std::cout<= mem_+bytes_ || *prev_line == 0) 84 | { 85 | if(next_block_())return mem_; 86 | return NULL; 87 | } 88 | return prev_line; 89 | } 90 | }; 91 | 92 | } 93 | 94 | #endif 95 | -------------------------------------------------------------------------------- /include/segment/normalize.h: -------------------------------------------------------------------------------- 1 | #ifndef NORMALIZE_H_ 2 | #define NORMALIZE_H_ 3 | 4 | #include "trd2simp.h" 5 | #include "kstring.hpp" 6 | 7 | #include 8 | 9 | 10 | class KNormalize 11 | { 12 | //static Trad2Simp trd2smp_; 13 | public: 14 | static void normalize(KString& kstr) 15 | { 16 | static Trad2Simp trd2smp_; 17 | try{ 18 | kstr.to_dbc(); 19 | kstr.to_lower_case(); 20 | trd2smp_.transform(kstr); 21 | kstr.trim_into_1(); 22 | kstr.trim_head_tail(); 23 | }catch(...){} 24 | } 25 | 26 | static void normalize(std::string& str) 27 | { 28 | try{ 29 | KString kstr(str); 30 | normalize(kstr); 31 | str = unicode_to_utf8(kstr); 32 | } 33 | catch(...){} 34 | } 35 | 36 | static std::string unicode_to_utf8(const KString& kstr) 37 | { 38 | std::string s; 39 | s.reserve(kstr.length() << 2); 40 | for(size_t i = 0; i < kstr.length(); ++i) 41 | { 42 | uint16_t unic = kstr[i]; 43 | if ( unic <= 0x0000007F ) 44 | { 45 | // * U-00000000 - U-0000007F: 0xxxxxxx 46 | s.append(1, unic & 0x7F); 47 | } 48 | else if ( unic >= 0x00000080 && unic <= 0x000007FF ) 49 | { 50 | // * U-00000080 - U-000007FF: 110xxxxx 10xxxxxx 51 | s.append(1, (((unic >> 6) & 0x1F) | 0xC0)); 52 | s.append(1, ((unic & 0x3F) | 0x80)); 53 | } 54 | else if ( unic >= 0x00000800 && unic <= 0x0000FFFF ) 55 | { 56 | // * U-00000800 - U-0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx 57 | s.append(1, (((unic >> 12) & 0x0F) | 0xE0)); 58 | s.append(1, (((unic >> 6) & 0x3F) | 0x80)); 59 | s.append(1, ((unic & 0x3F) | 0x80)); 60 | } 61 | else if ( unic >= 0x00010000 && unic <= 0x001FFFFF ) 62 | { 63 | // * U-00010000 - U-001FFFFF: 
11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 64 | s.append(1, (((unic >> 18) & 0x07) | 0xF0)); 65 | s.append(1, (((unic >> 12) & 0x3F) | 0x80)); 66 | s.append(1, (((unic >> 6) & 0x3F) | 0x80)); 67 | s.append(1, ((unic & 0x3F) | 0x80)); 68 | } 69 | else if ( unic >= 0x00200000 && unic <= 0x03FFFFFF ) 70 | { 71 | // * U-00200000 - U-03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 72 | s.append(1, (((unic >> 24) & 0x03) | 0xF8)); 73 | s.append(1, (((unic >> 18) & 0x3F) | 0x80)); 74 | s.append(1, (((unic >> 12) & 0x3F) | 0x80)); 75 | s.append(1, (((unic >> 6) & 0x3F) | 0x80)); 76 | s.append(1, ((unic & 0x3F) | 0x80)); 77 | } 78 | else if ( unic >= 0x04000000 && unic <= 0x7FFFFFFF ) 79 | { 80 | // * U-04000000 - U-7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 81 | s.append(1, (((unic >> 30) & 0x01) | 0xFC)); 82 | s.append(1, (((unic >> 24) & 0x3F) | 0x80)); 83 | s.append(1, (((unic >> 18) & 0x3F) | 0x80)); 84 | s.append(1, (((unic >> 12) & 0x3F) | 0x80)); 85 | s.append(1, (((unic >> 6) & 0x3F) | 0x80)); 86 | s.append(1, ((unic & 0x3F) | 0x80)); 87 | } 88 | } 89 | return s; 90 | } 91 | 92 | 93 | }; 94 | 95 | #endif 96 | -------------------------------------------------------------------------------- /include/segment/segment.h: -------------------------------------------------------------------------------- 1 | #ifndef NLP_SEGMENT_H_ 2 | #define NLP_SEGMENT_H_ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | #include "kstring.hpp" 17 | #include "normalize.h" 18 | #include "segment_dict.h" 19 | #include "line_reader.h" 20 | 21 | namespace knlp 22 | { 23 | class HorseTokenize{ 24 | KDictionary tk_dict_; 25 | KDictionary rewrite_dict_; 26 | 27 | bool is_digit_(char c)const 28 | { 29 | if (c >= '0' && c<='9') 30 | return true; 31 | if (c == '.' 
|| c == '-' || c == '+' || c == '/' || c == '=' || c== '*' || c== '%' 32 | || c == ',' || c == '$' || c == '&' || c == '_') 33 | return true; 34 | return false; 35 | } 36 | 37 | bool is_punct_(char c)const 38 | { 39 | if (c == '.' || c == '-' || c == '+' || c == '/' || c == '=' || c== '*' || c== '%' 40 | || c == ',' || c == '$' || c == '&' || c == '_') 41 | return true; 42 | return false; 43 | } 44 | 45 | void merge_(std::vector >& tks)const 46 | { 47 | std::vector flags(tks.size(), 0); 48 | for (uint32_t i=0;i='a' && tks[i].first[j]<='z' && tks[i].first.length()<=2)) 54 | break; 55 | if (j < tks[i].first.length())flags[i] = 1; 56 | } 57 | 58 | for (uint32_t i=0;i >& tks)const 74 | { 75 | for (uint32_t i=0;i0 && is_punct_(tks[i].first[t])) 92 | { 93 | tks[i].first = tks[i].first.substr(0, t); 94 | t--; 95 | } 96 | } 97 | } 98 | 99 | public: 100 | HorseTokenize(const std::string& dir) 101 | :tk_dict_(dir+"/token.dict") 102 | ,rewrite_dict_(dir + "/rewrite.dict") 103 | { 104 | } 105 | 106 | void tokenize(const std::string& line, 107 | std::vector >& tks)const 108 | { 109 | tk_dict_.fmm(line, tks); 110 | //for(uint32_t i=0;i >& tks, 116 | std::vector >& subs)const 117 | { 118 | tk_dict_.subtokens(tks, subs); 119 | for (uint32_t i=0;i v; 122 | boost::split(v, subs[i].first, boost::is_any_of("-, /")); 123 | std::vector > s; 124 | for (uint32_t j=0;j >& subs)const 135 | { 136 | std::vector > tks(1, std::make_pair(token, 1.0)); 137 | tk_dict_.subtokens(tks, subs); 138 | for (uint32_t i=0;i v; 141 | boost::split(v, subs[i].first, boost::is_any_of("-, /")); 142 | std::vector > s; 143 | for (uint32_t j=0;j=0 ) 161 | return w; 162 | return tk_dict_.min(); 163 | } 164 | }; 165 | } 166 | 167 | #endif 168 | -------------------------------------------------------------------------------- /include/segment/segment_dict.h: -------------------------------------------------------------------------------- 1 | #ifndef NLP_SEGMENT_DICT_H_ 2 | #define NLP_SEGMENT_DICT_H_ 3 | 4 | #include 
// Normalizes `str` in place by forwarding to KNormalize::normalize().
// The dictionary applies it to keys while loading and to lookup strings, so
// both sides are compared in the same normalized form.
void normalize_(std::string& str)const
{
    KNormalize::normalize(str);
}
53 | {} 54 | 55 | if (0 == f) 56 | return; 57 | 58 | std::set > k_v; 59 | std::set keySet; 60 | char* li = NULL; 61 | util::LineReader lr(nm); 62 | while((li = lr.line(li)) != NULL) 63 | { 64 | char* t = strchr(li, '\t'); 65 | std::string k,v; 66 | if(t) 67 | { 68 | k = std::string(li, t); 69 | v = std::string(t+1); 70 | }else 71 | k = std::string(li); 72 | normalize_(k); 73 | if (k.length() == 0 || keySet.find(k)!=keySet.end()) 74 | continue; 75 | 76 | k_v.insert(std::make_pair(k, v)); 77 | keySet.insert(k); 78 | } 79 | 80 | std::vector keys(k_v.size()); 81 | std::vector v(k_v.size()); 82 | std::vector lengths(k_v.size()); 83 | std::vector values(k_v.size()); 84 | 85 | uint32_t i = 0, ff = 0; 86 | for (std::set >::const_iterator it=k_v.begin();it!=k_v.end();++it,i++) 87 | { 88 | keys[i] = it->first.c_str() 89 | ,lengths[i]=it->first.length() 90 | ,values[i] = i; 91 | if (it->second.length()) 92 | v[i]=it->second,ff=1; 93 | } 94 | assert(keys.size() == v.size()); 95 | trie_.build(keys.size(), &keys[0], &lengths[0], &values[0]); 96 | trie_.save((nm+".bin").c_str()); 97 | 98 | if (ff == 1) 99 | { 100 | store_values_(v, *this); 101 | save_values_(nm+".v", *this); 102 | } 103 | } 104 | 105 | public: 106 | 107 | KDictionary(const std::string& dict_nm) 108 | { 109 | load_(dict_nm); 110 | if (values_.size() == 0)return; 111 | min_ = *std::min_element(values_.begin(), values_.end()); 112 | max_ = *std::max_element(values_.begin(), values_.end()); 113 | } 114 | 115 | ~KDictionary() 116 | { 117 | } 118 | 119 | /** 120 | * Return value: 121 | * 0: sucess 122 | * 1: sucess, but no value for this key. 
123 | * -1: not found 124 | * */ 125 | int32_t value(std::string key, T& v, bool nor = true)const 126 | { 127 | if (nor) 128 | normalize_(key); 129 | Darts::DoubleArray::result_pair_type res; 130 | trie_.exactMatchSearch(key.c_str(), res, key.length()); 131 | 132 | if (res.length == 0 && res.value == -1) 133 | return -1; 134 | 135 | if ((std::size_t)res.value < values_.size()) 136 | { 137 | v = values_[res.value]; 138 | return 0; 139 | } 140 | 141 | return 1; 142 | } 143 | 144 | T min()const 145 | { 146 | return min_; 147 | } 148 | 149 | T max()const 150 | { 151 | return max_; 152 | } 153 | 154 | bool has_key(std::string key, bool nor = true)const 155 | { 156 | T v; 157 | int32_t r = value(key, v, nor); 158 | if(r >= 0) 159 | return true; 160 | return false; 161 | } 162 | 163 | void fmm(std::string line, std::vector >& r, bool nor = true)const 164 | { 165 | r.clear(); 166 | if(nor)normalize_(line); 167 | // KString kstr(line); 168 | std::vector lens, cumu_lens; 169 | // check encoding 170 | std::string::iterator str_end = utf8::find_invalid(line.begin(), line.end()); 171 | // get bytes numbers 172 | line = std::string(line.begin(), str_end); 173 | std::string::iterator it = line.begin(); 174 | while(it != line.end()) 175 | { 176 | //std::string str = kstr.substr(i,1).get_bytes("utf-8"); 177 | uint32_t code = utf8::next(it, line.end()); 178 | std::string str; 179 | utf8::append(code, std::back_inserter(str)); 180 | lens.push_back(str.length()); 181 | if (cumu_lens.size() > 0) 182 | cumu_lens.push_back(str.length()+cumu_lens.back()); 183 | else cumu_lens.push_back(str.length()); 184 | } 185 | 186 | std::size_t key_pos=0; 187 | for (std::size_t j = 0; j =0) 205 | { 206 | T v = T();if ((uint32_t)last_state < values_.size())v = values_[last_state]; 207 | r.push_back(std::make_pair(std::string(line.c_str()+cumu_lens[jj]-lens[jj], line.c_str()+cumu_lens[last_j]), v)); 208 | } 209 | else 210 | r.push_back(std::make_pair(std::string(line.c_str()+cumu_lens[jj]-lens[jj], 
line.c_str()+cumu_lens[jj]), min())); 211 | j = last_j; 212 | key_pos = cumu_lens[j]; 213 | } 214 | } 215 | 216 | void subtokens(const std::vector >& tks, 217 | std::vector >& subs)const 218 | { 219 | subs.clear(); 220 | for (uint32_t i=0; i > ss; 231 | do{ 232 | std::vector > s; 233 | fmm(line, s, false); 234 | if (ss.size() == 0)line = tks[i].first; 235 | ss.push_back(s[0]); 236 | line = line.substr(s[0].first.length()); 237 | }while(line.length()); 238 | 239 | if (ss.size() >= kstr.length()) 240 | subs.push_back(tks[i]); 241 | else 242 | subs.insert(subs.end(), ss.begin(), ss.end()); 243 | } 244 | } 245 | }; 246 | 247 | template 248 | inline void store_values_(const std::vector& v, KDictionary& dict) 249 | { 250 | dict.values_.resize(v.size(), 0); 251 | for (uint32_t i=0;i 256 | inline void save_values_(const std::string& nm, KDictionary& dict) 257 | { 258 | FILE* f = fopen(nm.c_str(), "w+"); 259 | if (!f) 260 | return; 261 | uint32_t s = dict.values_.size(); 262 | assert(fwrite(&s, sizeof(s), 1, f) == 1); 263 | assert(fwrite(&dict.values_[0], dict.values_.size()*sizeof(T), 1, f) == 1); 264 | fclose(f); 265 | } 266 | 267 | template 268 | inline void load_values_(const std::string& nm, KDictionary& dict) 269 | { 270 | FILE* f = fopen(nm.c_str(), "r"); 271 | if (!f) 272 | return; 273 | uint32_t s = 0; 274 | assert(fread(&s, sizeof(s), 1, f) == 1); 275 | dict.values_.resize(s); 276 | assert(fread(&dict.values_[0], dict.values_.size()*sizeof(T), 1, f) == 1); 277 | fclose(f); 278 | } 279 | 280 | template<> 281 | inline void store_values_(const std::vector& v, KDictionary& dict) 282 | { 283 | dict.values_.resize(v.size(), NULL); 284 | for (uint32_t i=0;i 294 | inline void store_values_(const std::vector& v, KDictionary& dict) 295 | { 296 | dict.values_.resize(v.size(), 0); 297 | for (uint32_t i=0;i 302 | inline void store_values_(const std::vector& v, KDictionary& dict) 303 | { 304 | dict.values_.resize(v.size(), 0); 305 | for (uint32_t i=0;i 310 | inline void 
// True when `uchar` is an ASCII letter, 'a'..'z' or 'A'..'Z'.
bool isAlpha(uint16_t uchar){
    const bool lower = uchar >= 'a' && uchar <= 'z';
    const bool upper = uchar >= 'A' && uchar <= 'Z';
    return lower || upper;
}
std::string& term){ 76 | std::string nstr = boost::replace_all_copy(term, ".", ""); 77 | std::string::size_type i; 78 | for(i = 0; i < nstr.size(); ++i){ 79 | if(isDigital(nstr[i]) || isAlpha(nstr[i])) 80 | continue; 81 | return false; 82 | } 83 | return true; 84 | } 85 | 86 | bool loadSpecialWords(const std::string& filename){ 87 | if(filename.empty()) 88 | return false; 89 | std::ifstream ifs(filename.c_str()); 90 | std::string line; 91 | while(getline(ifs, line)){ 92 | if(line.empty()) 93 | continue; 94 | boost::algorithm::trim(line); 95 | stopWords_.insert(line); 96 | } 97 | ifs.close(); 98 | assert(stopWords_.size()); 99 | return true; 100 | } 101 | 102 | // Clean rules 103 | bool isNeedClean_(const std::string& term){ 104 | if(term.empty()) 105 | return true; 106 | if(term.length() < 4 || term.length() > 31) 107 | return true; 108 | if(isAlphaNumberic(term)) 109 | return true; 110 | return false; 111 | } 112 | 113 | bool isNeedClean(const std::string& token){ 114 | if(stopWords_.end() != stopWords_.find(token) || isNeedClean_(token)) 115 | return true; 116 | return false; 117 | } 118 | 119 | // dedup: it's true means you want to remove the deduplicate words afterm segmentation 120 | void segment(const std::string& line, std::vector& token, bool dedup=true){ 121 | token.clear(); 122 | std::vector > tmp; 123 | try{ 124 | tok_->tokenize(line, tmp); 125 | }catch(...){ 126 | tmp.clear(); 127 | } 128 | token.resize(tmp.size()); 129 | for(uint32_t i = 0; i < tmp.size(); ++i) 130 | token[i] = tmp[i].first; 131 | if(dedup){ 132 | std::set set_(token.begin(), token.end()); 133 | token.clear(); 134 | std::copy(set_.begin(), set_.end(), std::back_inserter(token)); 135 | } 136 | } 137 | 138 | // Segmentation and store tokens 139 | std::vector segment(const std::string& title){ 140 | std::vector tokens; 141 | tokens.clear(); 142 | if(title.empty()) 143 | return tokens; 144 | // Lock 145 | // bool isLock; 146 | // ScopedWriteBoolLock lock(mutex_, isLock); 147 | 148 | // 
std::vector tokens; 149 | subSegment(title, tokens, false); 150 | bigramModel(tokens); 151 | return tokens; 152 | /* boost::unordered_map >::iterator it; 153 | if(it != itemID_tokens_.end()) 154 | it->second = tokens; 155 | else{ 156 | itemID_tokens_[itemID] = tokens; 157 | itemIDs_.push_back(itemID); 158 | }*/ 159 | } 160 | 161 | // Given itemid ,return title token set 162 | /* void getTokens(int itemID, std::vector& token){ 163 | bool isLock; 164 | ScopedReadBoolLock lock(mutex_, isLock); 165 | token.clear(); 166 | boost::unordered_map >::iterator it; 167 | it = itemID_tokens_.find(itemID); 168 | if(it != itemID_tokens_.end()) 169 | token = it->second; 170 | } 171 | 172 | std::vector getItemIDs(){ 173 | bool isLock; 174 | ScopedReadBoolLock lock(mutex_, isLock); 175 | return itemIDs_; 176 | }*/ 177 | // sub tokenize 178 | void subSegment(const std::vector& token, std::vector& subtoken){ 179 | subtoken.clear(); 180 | if(token.empty()) 181 | return; 182 | std::vector > tmp, subtmp; 183 | tmp.resize(token.size()); 184 | for(uint32_t i = 0; i < token.size(); ++i) 185 | tmp[i] = std::make_pair(token[i], 1.0); 186 | tok_->subtokenize(tmp,subtmp); 187 | subtoken.resize(subtmp.size()); 188 | for(uint32_t j = 0; j < subtmp.size(); ++j) 189 | subtoken[j] = subtmp[j].first; 190 | } 191 | 192 | // dedup: it's true means you want to remove the deduplicate words afterm segmentation 193 | void subSegment(const std::string& line, std::vector& token, bool dedup=true){ 194 | token.clear(); 195 | std::vector > tmp, subtmp, subtoken; 196 | try{ 197 | tok_->tokenize(line, tmp); 198 | }catch(...){ 199 | tmp.clear(); 200 | } 201 | for(uint32_t i = 0; i < tmp.size(); ++i) 202 | { 203 | if(tmp[i].first.length() > 9){ 204 | tok_->subtokenize(tmp[i].first, subtoken); 205 | subtmp.push_back(tmp[i]); 206 | for(uint32_t j = 0; j < subtoken.size(); ++j) 207 | token.push_back(subtoken[j].first); 208 | } 209 | else 210 | token.push_back(tmp[i].first); 211 | } 212 | if(dedup){ 213 | std::set 
set_(token.begin(), token.end()); 214 | token.clear(); 215 | std::copy(set_.begin(), set_.end(), std::back_inserter(token)); 216 | } 217 | } 218 | 219 | // Extend terms based on bigram 220 | // before extension: t1, t2, t3 221 | // after extension: t1,t2,t3, t1_t2, t2_t3 222 | void bigramModel(std::vector& token){ 223 | // clean tokens 224 | std::vector::iterator it; 225 | for(it = token.begin(); it != token.end(); ++it){ 226 | if((*it).empty() || isNeedClean(*it)){ 227 | token.erase(it); 228 | it--; 229 | } 230 | } 231 | std::vector tmp; 232 | tmp.swap(token); 233 | std::size_t i, j, size = tmp.size(); 234 | std::string bigram; 235 | for(i = 0; i < size; ++i){ 236 | token.push_back(tmp[i]); 237 | j = i + 1; 238 | if(j < size){ 239 | bigram = tmp[i] + tmp[j]; 240 | token.push_back(bigram); 241 | } 242 | } 243 | } 244 | 245 | // Find the intersection between token1 and token2 246 | void intersect(const std::vector& token1 247 | ,const std::vector& token2 248 | ,std::vector& result){ 249 | result.clear(); 250 | if(token1.empty() || token2.empty()) 251 | return; 252 | boost::unordered_map token; 253 | for(uint32_t i = 0; i < token1.size(); ++i) 254 | token.insert(std::make_pair(token1[i], 1)); 255 | for(uint32_t j = 0; j < token2.size(); ++j) 256 | if(token.end() != token.find(token2[j])) 257 | result.push_back(token2[j]); 258 | } 259 | 260 | // Compute content similarity between t1 and t2, we choose the simplest way 261 | // sim = (intersection-size / token1.size()) * (intersection-size / token2.size()) 262 | // we don't consider the semantic similarity, so most of the similarity maybe zero. 
263 | double computeContentSim2(const std::string& t1, const std::string& t2){ 264 | double sim = 0.0; 265 | if(t1.empty() || t2.empty()) 266 | return sim; 267 | std::vector token1, token2,result; 268 | subSegment(t1, token1, false); 269 | bigramModel(token1); 270 | subSegment(t2, token2, false); 271 | bigramModel(token2); 272 | intersect(token1, token2, result); 273 | // To ensure the size of token is big than zero 274 | if(token1.empty() || token2.empty()) 275 | return sim; 276 | sim = ((double)result.size()/token1.size()) * ((double)result.size()/token2.size()); 277 | return sim; 278 | } 279 | 280 | // Through ItemID find it's token set and compute it's similarity. 281 | /* double computeContentSim(int itemID1, int itemID2){ 282 | 283 | bool isLock; 284 | ScopedReadBoolLock lock(mutex_, isLock); 285 | 286 | double sim = 0.0; 287 | std::vector token1, token2,result; 288 | boost::unordered_map >::iterator it; 289 | it = itemID_tokens_.find(itemID1); 290 | if(it == itemID_tokens_.end() || it->second.empty()) 291 | return sim; 292 | token1 = it->second; 293 | it = itemID_tokens_.find(itemID2); 294 | if(it == itemID_tokens_.end() || it->second.empty()) 295 | return sim; 296 | token2 = it->second; 297 | 298 | intersect(token1, token2, result); 299 | sim = ((double)result.size()/token1.size()) * ((double)result.size()/token2.size()); 300 | return sim; 301 | }*/ 302 | 303 | double computeContentSim(const std::vector& token1 304 | ,const std::vector& token2){ 305 | double sim = 0.0; 306 | if(token1.empty() || token2.empty()) 307 | return sim; 308 | std::vector result; 309 | intersect(token1, token2, result); 310 | sim = ((double)result.size()/token1.size()) * ((double)result.size()/token2.size()); 311 | return sim; 312 | } 313 | }; 314 | 315 | 316 | #endif // segmentWrapper.h 317 | -------------------------------------------------------------------------------- /include/suggestion.hpp: -------------------------------------------------------------------------------- 1 | 
/************************************************************************* 2 | @ File Name: suggestion.hpp 3 | @ Method: 4 | @ Author: Jerry Shi 5 | @ Mail: jerryshi0110@gmail.com 6 | @ Created Time: Tue 26 Jul 2016 02:32:52 PM CST 7 | ************************************************************************/ 8 | // Query suggestion module is used to create index for data module 9 | // and the output interfaces for query suggestion,which contains data update 10 | // strategy and the final results. 11 | 12 | #ifndef SUGGESTION_ENGINE_HPP 13 | #define SUGGESTION_ENGINE_HPP 14 | 15 | #include 16 | #include "buildEngine.hpp" 17 | 18 | // static global resource directory in build engine 19 | std::string BuildEngine::res_dir_ = ""; 20 | // Suggestion class to used for getting the final result and data updating. 21 | class Suggestion { 22 | private: 23 | boost::shared_ptr pBuild_; // data building pointer 24 | 25 | KeyTermIDsType key_termids_; // prefix and corresponding term ids 26 | TermInfoType termsInfo_; // all the term's information(tf, reserve field,stored based on their ids) 27 | 28 | std::string res_dir_; // resource directory 29 | 30 | public: 31 | Suggestion(const std::string& res_dir) 32 | : res_dir_(res_dir) { 33 | if (!boost::filesystem::exists(res_dir)) { 34 | std::cout << "Resource directory " << res_dir << " not exists!\n"; 35 | std::cout << "The resource directory may like \"../resource/\"\n"; 36 | return; 37 | } 38 | BuildEngine::res_dir_ = res_dir_; 39 | pBuild_.reset(new BuildEngine()); 40 | } 41 | 42 | ~Suggestion() { 43 | key_termids_.clear(); 44 | termsInfo_.clear(); 45 | } 46 | 47 | // get final suggestion results 48 | // @key: user input string 49 | // @JsonRes: suggestion results in the form of json 50 | void GetJsonResult(const std::string& key, std::string& JsonRes) { 51 | 52 | std::vector terms, attrs; 53 | GetSuggestion(key, terms, attrs); 54 | 55 | 56 | JsonRes = "["; 57 | if (terms.empty()) { 58 | JsonRes += "]"; 59 | return; 60 | } 61 | 62 
| for (uint32_t idx = 0; idx < terms.size(); ++idx) { 63 | if (terms[idx].empty()) 64 | continue; 65 | JsonRes += "{\"term\":\"" + terms[idx] + "\""; 66 | JsonRes += ",\"total_count\":" + attrs[idx] + "},"; 67 | } 68 | 69 | // delete the last ',' 70 | if (*JsonRes.rbegin() == ',') 71 | JsonRes.erase(JsonRes.size() - 1); 72 | JsonRes += "]"; 73 | } 74 | 75 | // remove more space only reserve one space 76 | // @input: the input string 77 | // @return: return the lowercase 78 | std::string RemoveSpace(const std::string& input) { 79 | uint32_t state = 0, start = 0, end = input.length(); 80 | // from start to find the first element which is not space or tab 81 | for (; start < input.size(); ++start) { 82 | if (input[start] != ' ' && input[start] != '\t') break; 83 | } 84 | 85 | // from end to find the first element which is not space or tab 86 | for (; end > 0;--end) { 87 | if (input[end-1] != ' ' && input[end - 1 ] != '\t') break; 88 | } 89 | 90 | // remove extra spaces keep only one in string 91 | std::string res(""); 92 | for (; start < end; ++start) { 93 | if (input[start] == ' ' || input[start] == '\t') 94 | ++state; 95 | else 96 | state = 0; 97 | 98 | if (state == 0) 99 | res += input[start]; 100 | else if (state == 1) 101 | res += ' '; 102 | } 103 | 104 | boost::to_lower(res); 105 | return res; 106 | } 107 | 108 | // get query suggestion according input key 109 | // @terms: candidate terms 110 | // @attrs: candidate term attributes 111 | // TODO: 112 | // needs to be locked when read the data 113 | bool GetSuggestion(const std::string& key 114 | ,std::vector& terms 115 | ,std::vector& attrs) { 116 | terms.clear(); 117 | attrs.clear(); 118 | 119 | std::string nkey = RemoveSpace(key); 120 | 121 | // get results if exists 122 | KeyTermIDsType::iterator keyIter; 123 | keyIter = key_termids_.find(key); 124 | if (keyIter == key_termids_.end()) 125 | return false; 126 | 127 | std::vector& termIds = keyIter->second; 128 | uint32_t size = termIds.size(); 129 | 
terms.resize(size); 130 | attrs.resize(size); 131 | for (uint32_t i = 0; i < size; ++i) { 132 | if (termIds[i] > termsInfo_.size()) 133 | continue; 134 | terms[i] = termsInfo_[termIds[i]].first; // term 135 | try { // term result number 136 | terms[i] = boost::lexical_cast(termsInfo_[termIds[i]].second.second); 137 | } catch(...) { 138 | terms[i] = "12"; 139 | } 140 | } 141 | if (terms.size() != attrs.size()) 142 | return false; 143 | 144 | return true; 145 | } 146 | 147 | // get data building module 148 | void GetDataModule(TermInfoType& termsInfo, KeyTermIDsType& key_termids) { 149 | 150 | pBuild_->GetDataModule(termsInfo, key_termids); 151 | } 152 | 153 | // Rebuilding the module and update data 154 | // TODO: 155 | // may consider data intergrity and validity so will be use write and read lock 156 | void ModuleUpate() { 157 | } 158 | 159 | // building data module with specified term file 160 | // @filename: term files the structure like [term '\t' tf '\t' result_num or other attribute(uint32_t)] 161 | void Build(const std::string& filename) { 162 | 163 | pBuild_->Build(filename); 164 | GetDataModule(termsInfo_, key_termids_); 165 | } 166 | 167 | }; 168 | 169 | #endif // suggestion.hpp 170 | -------------------------------------------------------------------------------- /include/util/mtrie.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | @ File Name: mtrie.h 3 | @ Method: 4 | @ Author: Jerry Shi 5 | @ Mail: jerryshi0110@gmail.com 6 | @ Created Time: Wed 16 Mar 2016 11:17:11 AM CST 7 | ************************************************************************/ 8 | // This data structure was created based on boost unordered_map, which is 9 | // a simpler version of memory trie, the performance may need to be optimized. 
10 | 11 | #ifdef MTRIE_H 12 | #define MTRIE_H 13 | 14 | #ifndef WIN32 15 | #include 16 | #include 17 | #include 18 | #else 19 | #include 20 | #include 21 | 22 | typedef signed char int8_t; 23 | typedef short int16_t; 24 | typedef long int16_t; 25 | typedef __int64 int64_t; 26 | typedef unsigned char uint8_t; 27 | typedef unsigned short uint16_t; 28 | typedef unsigned long uint32_t; 29 | typedef unsigned __int64 uint64_t; 30 | 31 | #endif // end of WIN32 32 | 33 | #include 34 | #include 35 | 36 | #include 37 | 38 | template 39 | class MTrie 40 | { 41 | private: 42 | TrieType trie_; 43 | DataStorageType data_; 44 | NodeIDType key_; 45 | 46 | /** 47 | * @brief To insert str into mtrie structure 48 | */ 49 | void Add_(const StringType& str, NodeIDType& id) 50 | { 51 | id = GetRootID(); 52 | if(str.empty()) return; 53 | for(std::size_t i = 0; i < str.size(); ++i) 54 | { 55 | bool last = (i == str.size() - 1); 56 | std::pair key_pair(id, str[i]); 57 | typename TrieType::iterator it = trie_.find(key_pair); 58 | if(it != trie_.end()) 59 | { 60 | std::pair& value_pair = it->second; 61 | id = value_pair.first; 62 | if(last && !value_pair.second) 63 | { 64 | value_pair.second = true; 65 | } 66 | } 67 | else 68 | { 69 | id = key_; 70 | key_++; 71 | std::pair value_pair(id, last); 72 | trie_.insert(std::make_pair(key_pair, value_pair)); 73 | } 74 | } 75 | } 76 | 77 | 78 | public: 79 | typedef typename StringType::value_type CharType; 80 | typedef boost::unordered_map, 81 | std::pair > TrieType; 82 | typedef boost::unordered_map DataStorageType; 83 | 84 | MTrie():key_(1) 85 | { 86 | } 87 | 88 | /** 89 | * @brief insert string to mtrie structure 90 | */ 91 | void Add(cosnt StringType& str) 92 | { 93 | NodeIDType id = GetRootID(); 94 | Add_(str, id); 95 | } 96 | 97 | /** 98 | * @brief add given str into data storage. 
99 | */ 100 | void Add(const StringType& str, const DataType& data) 101 | { 102 | NodeIDType id = GetRootID(); 103 | Add_(str, id); 104 | data_.insert(std::make_pair(id, data)); 105 | } 106 | 107 | /** 108 | * @brief get the root id it always zero. 109 | */ 110 | NodeIDType GetRootID() const 111 | { 112 | return 0; 113 | } 114 | 115 | /** 116 | * @brief to find the given parameter c and it's parent node id, 117 | * if it exists, it will return , and sotre the child node 118 | * id in parameter childNID. 119 | */ 120 | std::pair Find(const CharType& c, const NodeIDType& parentNID, 121 | NodeIDType& childNID) 122 | { 123 | std::pair result(false, false); 124 | std::pair key_pair(parentNID, c); 125 | typename TrieType::iterator it = trie_.find(key_pair); 126 | std::pair value_pair(0, false); 127 | if(it != trie_.end()) 128 | { 129 | value_pair = it->second; 130 | result.first = true; 131 | result.second = value_pair.second; 132 | childNID = value_pair.first; 133 | } 134 | 135 | return result; 136 | } 137 | 138 | /** 139 | * @brief get data from the given id, if the id exists, return ture and 140 | * the data will copy to parameter data, else it will return false. 
141 | */ 142 | bool GetData(cosnt NodeIDType& id, DataType& data) 143 | { 144 | typename DataStorageType::iterator it = data_.find(id); 145 | if(it != data_.end()) 146 | { 147 | data = it->second; 148 | return true; 149 | } 150 | return false; 151 | } 152 | }; 153 | 154 | #endif // mtrie.h 155 | -------------------------------------------------------------------------------- /include/util/normalize.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | @ File Name: normalize.h 3 | @ Method: 4 | @ Author: Jerry Shi 5 | @ Mail: jerryshi0110@gmail.com 6 | @ Created Time: Tue 12 Jul 2016 02:02:37 PM CST 7 | ************************************************************************/ 8 | // Normalization module contains several functions to convert one string 9 | // to a normalized string,like utf8 encoding,lower case,etc. And the encoding 10 | // was based on a open source "utf8-cpp". 11 | 12 | #ifndef UTIL_NORMALIZE_H 13 | #define UTIL_NORMALIZE_H 14 | 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include "utf8.h" 20 | 21 | typedef uint16_t UnicodeType; 22 | 23 | class Normalize { 24 | public: 25 | Normalize(){ 26 | } 27 | 28 | // IsDigital 29 | static bool IsDigit(char c) { 30 | if (c >= '0' && c <= '9') 31 | return true; 32 | return false; 33 | } 34 | 35 | // IsAlpha 36 | static bool IsAlpha(char c) { 37 | if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')) 38 | return true; 39 | return false; 40 | } 41 | 42 | // IsConnector 43 | static bool IsConnector(char c) { 44 | if ( c == '.' 
|| c == '-' || c == '+') 45 | return true; 46 | return false; 47 | } 48 | 49 | // IsBreakPunctuation 50 | static bool IsBreakPunct(char c) { 51 | if (!IsPunctuation(c)) 52 | return false; 53 | if (c == '(' || c == ')' || c == '[' || c == ']' || c == '{' || c == '}') 54 | return true; 55 | 56 | return false; 57 | } 58 | 59 | // IsPunctuation 60 | static bool IsPunctuation(char c) { 61 | return ispunct(c); 62 | } 63 | 64 | // Convert a string to utf8 encoding, replace any invalid codes by unicode 65 | static bool ToUTF8(std::string& str) { 66 | if (str.empty()) 67 | return false; 68 | std::string temp; 69 | utf8::replace_invalid(str.begin(), str.end(), std::back_inserter(temp)); 70 | 71 | str = temp; 72 | return true; 73 | } 74 | 75 | // TODO: 76 | // remove invalid utf8 encoding if it exists. 77 | static bool RemoveInvalidUTF8(std::string& str) { 78 | std::string::iterator iter = utf8::find_invalid(str.begin(), str.end()); 79 | if (iter == str.end()) 80 | return false; 81 | std::string temp(str.begin(), iter); 82 | str = temp; 83 | return true; 84 | } 85 | 86 | // Convert a string to lower case 87 | static void ToLower(std::string& str) { 88 | std::string ustr(str); 89 | str = ""; 90 | std::string::size_type idx; 91 | for (idx = 0; idx < ustr.length(); ++idx) { 92 | str += ToLower_(ustr[idx]); 93 | } 94 | } 95 | 96 | // Convert a string to upper case 97 | static void ToUpper(std::string& str) { 98 | std::string ustr(str); 99 | str = ""; 100 | std::string::size_type idx; 101 | for (idx = 0; idx < ustr.length(); ++idx) { 102 | str += ToUpper_(ustr[idx]); 103 | } 104 | } 105 | 106 | // Check a string whether it is a valid utf8 encoding string 107 | static bool IsValidUTF8(const std::string& str) { 108 | if (str.empty()) 109 | return false; 110 | std::string::const_iterator iter = utf8::find_invalid(str.begin(), str.end()); 111 | if (iter != str.end()) { 112 | std::cout << "Invalid code found!" 
<< std::endl; 113 | std::string temp(str.begin(), iter); 114 | std::cout << "This part is fine: " << temp << std::endl; 115 | return false; 116 | } 117 | return true; 118 | } 119 | 120 | // Check string is utf8 encode 121 | static bool GetUS2Char(const std::string& str, std::vector& uChars) { 122 | uChars.clear(); 123 | std::string ustr(str); 124 | // Avoid throwing exceptions 125 | if (!RemoveInvalidUTF8(ustr)) 126 | return false; 127 | std::string::iterator iter = ustr.begin(); 128 | while (iter != ustr.end()) { 129 | uint32_t code = utf8::next(iter, ustr.end()); 130 | uChars.push_back(code); 131 | } 132 | return true; 133 | } 134 | 135 | template 136 | static bool RemoveElement(std::vector& lVec, uint32_t pos 137 | ,std::vector& rVec) { 138 | return true; 139 | } 140 | 141 | // Determine whether a string is a chinese characters 142 | static bool IsChinese(const std::string& str) { 143 | if (!IsValidUTF8(str)) { 144 | // std::cout << "string is a invalid utf8 encoding!\n"; 145 | return false; 146 | } 147 | std::vector unicodes; 148 | utf8::utf8to16(str.begin(), str.end(), std::back_inserter(unicodes)); 149 | //std::cout << "size: " << unicodes.size() << std::endl; 150 | for (uint32_t i = 0; i < unicodes.size(); ++i) { 151 | if (!IsChineseChar_(unicodes[i])) { 152 | return false; 153 | } 154 | } 155 | return true; 156 | } 157 | 158 | // Reload 159 | static bool IsChinese(const UnicodeType& UCS2Char) { 160 | return IsChineseChar_(UCS2Char); 161 | } 162 | 163 | // convert an utf16 encoding(UnicodeType) to a utf8 string 164 | static bool UnicodeToUTF8Str(const std::vector& unicodes, std::string& utf8str) { 165 | utf8str = ""; 166 | if (unicodes.empty()) { 167 | return false; 168 | } 169 | 170 | utf8::utf16to8(unicodes.begin(), unicodes.end(), std::back_inserter(utf8str)); 171 | return true; 172 | } 173 | 174 | // convert an utf16 encoding(UnicodeType) to a utf8 string 175 | static bool UnicodeToUTF8Str(const UnicodeType& unicode, std::string& utf8str) { 176 | utf8str 
= ""; 177 | std::vector unicodes(1, unicode); 178 | if (unicodes.empty()) { 179 | return false; 180 | } 181 | 182 | utf8::utf16to8(unicodes.begin(), unicodes.end(), std::back_inserter(utf8str)); 183 | return true; 184 | } 185 | 186 | // convert a string to unicode encoding, unicode vector 187 | static bool ToUnicode(const std::string& str, std::vector& unicodes) { 188 | unicodes.clear(); 189 | std::string ustr(str); 190 | // Avoid throwing exceptions 191 | RemoveInvalidUTF8(ustr); 192 | utf8::utf8to16(ustr.begin(), ustr.end(), std::back_inserter(unicodes)); 193 | 194 | return true; 195 | } 196 | 197 | // 198 | private: 199 | // Determine whether a UnicodeType char is a chinese character 200 | static bool IsChineseChar_(UnicodeType ucs2char) { 201 | if(((ucs2char >= 0x2E80 && ucs2char <= 0x2EF3) // CJK Radicals 202 | ||(ucs2char >= 0x2F00 && ucs2char <= 0x2FD5) // Kangxi Radicals Range: 0x2F00 - 0X2FDF 203 | ||(ucs2char >= 0x3400 && ucs2char <= 0x4DB5) // CJK Unified Ideographs Extension A 204 | ||(ucs2char >= 0x4E00 && ucs2char <= 0x9FC3) // CJK Unified Ideographs 205 | ||(ucs2char >= 0xF900 && ucs2char <= 0xFAD9))// CJK Compatibility Ideographs 206 | && ucs2char != 12289 // Chinese Punctuaion Unicode encoding, 、 207 | && ucs2char != 12298 //《 208 | && ucs2char != 12290 // 。 209 | && ucs2char != 12299 // 》 210 | && ucs2char != 65292 // , 211 | && ucs2char != 65311 // ? 212 | && ucs2char != 65281 // ! 213 | && ucs2char != 65306 // : 214 | && ucs2char != 65307 // ; 215 | && ucs2char != 8220 // “ 216 | && ucs2char != 8221 // ” 217 | && ucs2char != 12304 // 【 218 | && ucs2char != 12305 // 】 219 | && ucs2char != 65509 // ¥ 220 | && ucs2char != 8230 // … 221 | && ucs2char != 65288 // ( 222 | && ucs2char != 65289 // ) 223 | && ucs2char != 8212 // — 224 | && ucs2char != 20022 )// 、 225 | return true; 226 | 227 | return false; 228 | } 229 | 230 | // Convert a char to lower case 231 | static inline char ToUpper_(char chConv) { 232 | return (chConv >= 'a' && chConv <= 'z') ? 
(chConv & 0xdf) : chConv; 233 | } 234 | static inline wchar_t ToUpper_(wchar_t chConv) { 235 | return (chConv >= L'a' && chConv <= L'z') ? (chConv & 0x00df) : chConv; 236 | } 237 | 238 | // Convert a char to upper case 239 | static inline char ToLower_(char chConv) { 240 | return (chConv >= 'A' && chConv <= 'Z') ? (chConv | 0x20) : chConv; 241 | } 242 | static inline wchar_t ToLower_(wchar_t chConv) { 243 | return (chConv >= L'A' && chConv <= 'Z') ? (chConv | 0x0020) : chConv; 244 | } 245 | }; 246 | 247 | 248 | #endif // util/normalize.h 249 | -------------------------------------------------------------------------------- /include/util/py_types.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | @ File Name: py_tyes.h 3 | @ Method: 4 | @ Author: Jerry Shi 5 | @ Mail: jerryshi0110@gmail.com 6 | @ Created Time: Wed 16 Mar 2016 10:39:19 AM CST 7 | ************************************************************************/ 8 | #ifdef PY_TYPES_H 9 | #define PY_TYPES_H 10 | 11 | #include 12 | #include 13 | #include 14 | 15 | typedef uint16_t UCS2Char; 16 | typedef UCS2Char Unigram; 17 | typedef std::pair Bigram; 18 | typedef std::pair Trigram; 19 | typedef std::vector Ngram; 20 | 21 | 22 | template 23 | struct ScoreItem 24 | { 25 | T value; 26 | double score; 27 | 28 | bool operator<(const ScoreItem& other) cosnt 29 | { 30 | return score > other.score; 31 | } 32 | }; 33 | 34 | typedef ScoreItem CandidateResult; 35 | 36 | struct ViterbiItemT 37 | { 38 | std::string text; 39 | double score; 40 | 41 | const UCS2Char& GetLastChar() const 42 | { 43 | return text[text.length() -1]; 44 | } 45 | }; 46 | 47 | #endif // py_types.h 48 | -------------------------------------------------------------------------------- /include/util/types.h: -------------------------------------------------------------------------------- 1 | 
/************************************************************************* 2 | @ File Name: types.h 3 | @ Method: 4 | @ Author: Jerry Shi 5 | @ Mail: jerryshi0110@gmail.com 6 | @ Created Time: Thu 10 Mar 2016 05:15:30 PM CST 7 | ************************************************************************/ 8 | #ifndef TYPES_H 9 | #define TYPES_H 10 | 11 | #include 12 | 13 | #endif 14 | 15 | -------------------------------------------------------------------------------- /include/util/utf8.h: -------------------------------------------------------------------------------- 1 | // Copyright 2006 Nemanja Trifunovic 2 | 3 | /* 4 | Permission is hereby granted, free of charge, to any person or organization 5 | obtaining a copy of the software and accompanying documentation covered by 6 | this license (the "Software") to use, reproduce, display, distribute, 7 | execute, and transmit the Software, and to prepare derivative works of the 8 | Software, and to permit third-parties to whom the Software is furnished to 9 | do so, all subject to the following: 10 | 11 | The copyright notices in the Software and this entire statement, including 12 | the above license grant, this restriction and the following disclaimer, 13 | must be included in all copies of the Software, in whole or in part, and 14 | all derivative works of the Software, unless such copies or derivative 15 | works are solely in the form of machine-executable object code generated by 16 | a source language processor. 17 | 18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. 
IN NO EVENT 21 | SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE 22 | FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, 23 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 24 | DEALINGS IN THE SOFTWARE. 25 | */ 26 | 27 | 28 | #ifndef UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731 29 | #define UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731 30 | 31 | #include "utf8/checked.h" 32 | #include "utf8/unchecked.h" 33 | 34 | #endif // header guard 35 | -------------------------------------------------------------------------------- /include/util/utf8/checked.h: -------------------------------------------------------------------------------- 1 | // Copyright 2006 Nemanja Trifunovic 2 | 3 | /* 4 | Permission is hereby granted, free of charge, to any person or organization 5 | obtaining a copy of the software and accompanying documentation covered by 6 | this license (the "Software") to use, reproduce, display, distribute, 7 | execute, and transmit the Software, and to prepare derivative works of the 8 | Software, and to permit third-parties to whom the Software is furnished to 9 | do so, all subject to the following: 10 | 11 | The copyright notices in the Software and this entire statement, including 12 | the above license grant, this restriction and the following disclaimer, 13 | must be included in all copies of the Software, in whole or in part, and 14 | all derivative works of the Software, unless such copies or derivative 15 | works are solely in the form of machine-executable object code generated by 16 | a source language processor. 17 | 18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. 
IN NO EVENT 21 | SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE 22 | FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, 23 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 24 | DEALINGS IN THE SOFTWARE. 25 | */ 26 | 27 | 28 | #ifndef UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 29 | #define UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 30 | 31 | #include "core.h" 32 | #include 33 | 34 | namespace utf8 35 | { 36 | // Base for the exceptions that may be thrown from the library 37 | class exception : public ::std::exception { 38 | }; 39 | 40 | // Exceptions that may be thrown from the library functions. 41 | class invalid_code_point : public exception { 42 | uint32_t cp; 43 | public: 44 | invalid_code_point(uint32_t cp) : cp(cp) {} 45 | virtual const char* what() const throw() { return "Invalid code point"; } 46 | uint32_t code_point() const {return cp;} 47 | }; 48 | 49 | class invalid_utf8 : public exception { 50 | uint8_t u8; 51 | public: 52 | invalid_utf8 (uint8_t u) : u8(u) {} 53 | virtual const char* what() const throw() { return "Invalid UTF-8"; } 54 | uint8_t utf8_octet() const {return u8;} 55 | }; 56 | 57 | class invalid_utf16 : public exception { 58 | uint16_t u16; 59 | public: 60 | invalid_utf16 (uint16_t u) : u16(u) {} 61 | virtual const char* what() const throw() { return "Invalid UTF-16"; } 62 | uint16_t utf16_word() const {return u16;} 63 | }; 64 | 65 | class not_enough_room : public exception { 66 | public: 67 | virtual const char* what() const throw() { return "Not enough space"; } 68 | }; 69 | 70 | /// The library API - functions intended to be called by the users 71 | 72 | template 73 | octet_iterator append(uint32_t cp, octet_iterator result) 74 | { 75 | if (!utf8::internal::is_code_point_valid(cp)) 76 | throw invalid_code_point(cp); 77 | 78 | if (cp < 0x80) // one octet 79 | *(result++) = static_cast(cp); 80 | else if (cp < 0x800) { // two 
octets 81 | *(result++) = static_cast((cp >> 6) | 0xc0); 82 | *(result++) = static_cast((cp & 0x3f) | 0x80); 83 | } 84 | else if (cp < 0x10000) { // three octets 85 | *(result++) = static_cast((cp >> 12) | 0xe0); 86 | *(result++) = static_cast(((cp >> 6) & 0x3f) | 0x80); 87 | *(result++) = static_cast((cp & 0x3f) | 0x80); 88 | } 89 | else { // four octets 90 | *(result++) = static_cast((cp >> 18) | 0xf0); 91 | *(result++) = static_cast(((cp >> 12) & 0x3f) | 0x80); 92 | *(result++) = static_cast(((cp >> 6) & 0x3f) | 0x80); 93 | *(result++) = static_cast((cp & 0x3f) | 0x80); 94 | } 95 | return result; 96 | } 97 | 98 | template 99 | output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement) 100 | { 101 | while (start != end) { 102 | octet_iterator sequence_start = start; 103 | internal::utf_error err_code = utf8::internal::validate_next(start, end); 104 | switch (err_code) { 105 | case internal::UTF8_OK : 106 | for (octet_iterator it = sequence_start; it != start; ++it) 107 | *out++ = *it; 108 | break; 109 | case internal::NOT_ENOUGH_ROOM: 110 | throw not_enough_room(); 111 | case internal::INVALID_LEAD: 112 | out = utf8::append (replacement, out); 113 | ++start; 114 | break; 115 | case internal::INCOMPLETE_SEQUENCE: 116 | case internal::OVERLONG_SEQUENCE: 117 | case internal::INVALID_CODE_POINT: 118 | out = utf8::append (replacement, out); 119 | ++start; 120 | // just one replacement mark for the sequence 121 | while (start != end && utf8::internal::is_trail(*start)) 122 | ++start; 123 | break; 124 | } 125 | } 126 | return out; 127 | } 128 | 129 | template 130 | inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out) 131 | { 132 | static const uint32_t replacement_marker = utf8::internal::mask16(0xfffd); 133 | return utf8::replace_invalid(start, end, out, replacement_marker); 134 | } 135 | 136 | template 137 | uint32_t next(octet_iterator& it, octet_iterator end) 
138 | { 139 | uint32_t cp = 0; 140 | internal::utf_error err_code = utf8::internal::validate_next(it, end, cp); 141 | switch (err_code) { 142 | case internal::UTF8_OK : 143 | break; 144 | case internal::NOT_ENOUGH_ROOM : 145 | throw not_enough_room(); 146 | case internal::INVALID_LEAD : 147 | case internal::INCOMPLETE_SEQUENCE : 148 | case internal::OVERLONG_SEQUENCE : 149 | throw invalid_utf8(*it); 150 | case internal::INVALID_CODE_POINT : 151 | throw invalid_code_point(cp); 152 | } 153 | return cp; 154 | } 155 | 156 | template 157 | uint32_t peek_next(octet_iterator it, octet_iterator end) 158 | { 159 | return utf8::next(it, end); 160 | } 161 | 162 | template 163 | uint32_t prior(octet_iterator& it, octet_iterator start) 164 | { 165 | // can't do much if it == start 166 | if (it == start) 167 | throw not_enough_room(); 168 | 169 | octet_iterator end = it; 170 | // Go back until we hit either a lead octet or start 171 | while (utf8::internal::is_trail(*(--it))) 172 | if (it == start) 173 | throw invalid_utf8(*it); // error - no lead byte in the sequence 174 | return utf8::peek_next(it, end); 175 | } 176 | 177 | /// Deprecated in versions that include "prior" 178 | template 179 | uint32_t previous(octet_iterator& it, octet_iterator pass_start) 180 | { 181 | octet_iterator end = it; 182 | while (utf8::internal::is_trail(*(--it))) 183 | if (it == pass_start) 184 | throw invalid_utf8(*it); // error - no lead byte in the sequence 185 | octet_iterator temp = it; 186 | return utf8::next(temp, end); 187 | } 188 | 189 | template 190 | void advance (octet_iterator& it, distance_type n, octet_iterator end) 191 | { 192 | for (distance_type i = 0; i < n; ++i) 193 | utf8::next(it, end); 194 | } 195 | 196 | template 197 | typename std::iterator_traits::difference_type 198 | distance (octet_iterator first, octet_iterator last) 199 | { 200 | typename std::iterator_traits::difference_type dist; 201 | for (dist = 0; first < last; ++dist) 202 | utf8::next(first, last); 203 | return 
dist; 204 | } 205 | 206 | template 207 | octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result) 208 | { 209 | while (start != end) { 210 | uint32_t cp = utf8::internal::mask16(*start++); 211 | // Take care of surrogate pairs first 212 | if (utf8::internal::is_lead_surrogate(cp)) { 213 | if (start != end) { 214 | uint32_t trail_surrogate = utf8::internal::mask16(*start++); 215 | if (utf8::internal::is_trail_surrogate(trail_surrogate)) 216 | cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET; 217 | else 218 | throw invalid_utf16(static_cast(trail_surrogate)); 219 | } 220 | else 221 | throw invalid_utf16(static_cast(cp)); 222 | 223 | } 224 | // Lone trail surrogate 225 | else if (utf8::internal::is_trail_surrogate(cp)) 226 | throw invalid_utf16(static_cast(cp)); 227 | 228 | result = utf8::append(cp, result); 229 | } 230 | return result; 231 | } 232 | 233 | template 234 | u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result) 235 | { 236 | while (start < end) { 237 | uint32_t cp = utf8::next(start, end); 238 | if (cp > 0xffff) { //make a surrogate pair 239 | *result++ = static_cast((cp >> 10) + internal::LEAD_OFFSET); 240 | *result++ = static_cast((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN); 241 | } 242 | else 243 | *result++ = static_cast(cp); 244 | } 245 | return result; 246 | } 247 | 248 | template 249 | octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result) 250 | { 251 | while (start != end) 252 | result = utf8::append(*(start++), result); 253 | 254 | return result; 255 | } 256 | 257 | template 258 | u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result) 259 | { 260 | while (start < end) 261 | (*result++) = utf8::next(start, end); 262 | 263 | return result; 264 | } 265 | 266 | // The iterator class 267 | template 268 | class iterator : public std::iterator { 269 | octet_iterator it; 270 | octet_iterator 
range_start; 271 | octet_iterator range_end; 272 | public: 273 | iterator () {} 274 | explicit iterator (const octet_iterator& octet_it, 275 | const octet_iterator& range_start, 276 | const octet_iterator& range_end) : 277 | it(octet_it), range_start(range_start), range_end(range_end) 278 | { 279 | if (it < range_start || it > range_end) 280 | throw std::out_of_range("Invalid utf-8 iterator position"); 281 | } 282 | // the default "big three" are OK 283 | octet_iterator base () const { return it; } 284 | uint32_t operator * () const 285 | { 286 | octet_iterator temp = it; 287 | return utf8::next(temp, range_end); 288 | } 289 | bool operator == (const iterator& rhs) const 290 | { 291 | if (range_start != rhs.range_start || range_end != rhs.range_end) 292 | throw std::logic_error("Comparing utf-8 iterators defined with different ranges"); 293 | return (it == rhs.it); 294 | } 295 | bool operator != (const iterator& rhs) const 296 | { 297 | return !(operator == (rhs)); 298 | } 299 | iterator& operator ++ () 300 | { 301 | utf8::next(it, range_end); 302 | return *this; 303 | } 304 | iterator operator ++ (int) 305 | { 306 | iterator temp = *this; 307 | utf8::next(it, range_end); 308 | return temp; 309 | } 310 | iterator& operator -- () 311 | { 312 | utf8::prior(it, range_start); 313 | return *this; 314 | } 315 | iterator operator -- (int) 316 | { 317 | iterator temp = *this; 318 | utf8::prior(it, range_start); 319 | return temp; 320 | } 321 | }; // class iterator 322 | 323 | } // namespace utf8 324 | 325 | #endif //header guard 326 | 327 | 328 | -------------------------------------------------------------------------------- /include/util/utf8/core.h: -------------------------------------------------------------------------------- 1 | // Copyright 2006 Nemanja Trifunovic 2 | 3 | /* 4 | Permission is hereby granted, free of charge, to any person or organization 5 | obtaining a copy of the software and accompanying documentation covered by 6 | this license (the "Software") 
to use, reproduce, display, distribute, 7 | execute, and transmit the Software, and to prepare derivative works of the 8 | Software, and to permit third-parties to whom the Software is furnished to 9 | do so, all subject to the following: 10 | 11 | The copyright notices in the Software and this entire statement, including 12 | the above license grant, this restriction and the following disclaimer, 13 | must be included in all copies of the Software, in whole or in part, and 14 | all derivative works of the Software, unless such copies or derivative 15 | works are solely in the form of machine-executable object code generated by 16 | a source language processor. 17 | 18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT 21 | SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE 22 | FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, 23 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 24 | DEALINGS IN THE SOFTWARE. 25 | */ 26 | 27 | 28 | #ifndef UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 29 | #define UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 30 | 31 | #include 32 | 33 | namespace utf8 34 | { 35 | // The typedefs for 8-bit, 16-bit and 32-bit unsigned integers 36 | // You may need to change them to match your system. 37 | // These typedefs have the same names as ones from cstdint, or boost/cstdint 38 | typedef unsigned char uint8_t; 39 | typedef unsigned short uint16_t; 40 | typedef unsigned int uint32_t; 41 | 42 | // Helper code - not intended to be directly called by the library users. 
May be changed at any time 43 | namespace internal 44 | { 45 | // Unicode constants 46 | // Leading (high) surrogates: 0xd800 - 0xdbff 47 | // Trailing (low) surrogates: 0xdc00 - 0xdfff 48 | const uint16_t LEAD_SURROGATE_MIN = 0xd800u; 49 | const uint16_t LEAD_SURROGATE_MAX = 0xdbffu; 50 | const uint16_t TRAIL_SURROGATE_MIN = 0xdc00u; 51 | const uint16_t TRAIL_SURROGATE_MAX = 0xdfffu; 52 | const uint16_t LEAD_OFFSET = LEAD_SURROGATE_MIN - (0x10000 >> 10); 53 | const uint32_t SURROGATE_OFFSET = 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN; 54 | 55 | // Maximum valid value for a Unicode code point 56 | const uint32_t CODE_POINT_MAX = 0x0010ffffu; 57 | 58 | template 59 | inline uint8_t mask8(octet_type oc) 60 | { 61 | return static_cast(0xff & oc); 62 | } 63 | template 64 | inline uint16_t mask16(u16_type oc) 65 | { 66 | return static_cast(0xffff & oc); 67 | } 68 | template 69 | inline bool is_trail(octet_type oc) 70 | { 71 | return ((utf8::internal::mask8(oc) >> 6) == 0x2); 72 | } 73 | 74 | template 75 | inline bool is_lead_surrogate(u16 cp) 76 | { 77 | return (cp >= LEAD_SURROGATE_MIN && cp <= LEAD_SURROGATE_MAX); 78 | } 79 | 80 | template 81 | inline bool is_trail_surrogate(u16 cp) 82 | { 83 | return (cp >= TRAIL_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX); 84 | } 85 | 86 | template 87 | inline bool is_surrogate(u16 cp) 88 | { 89 | return (cp >= LEAD_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX); 90 | } 91 | 92 | template 93 | inline bool is_code_point_valid(u32 cp) 94 | { 95 | return (cp <= CODE_POINT_MAX && !utf8::internal::is_surrogate(cp)); 96 | } 97 | 98 | template 99 | inline typename std::iterator_traits::difference_type 100 | sequence_length(octet_iterator lead_it) 101 | { 102 | uint8_t lead = utf8::internal::mask8(*lead_it); 103 | if (lead < 0x80) 104 | return 1; 105 | else if ((lead >> 5) == 0x6) 106 | return 2; 107 | else if ((lead >> 4) == 0xe) 108 | return 3; 109 | else if ((lead >> 3) == 0x1e) 110 | return 4; 111 | else 112 | return 0; 
113 | } 114 | 115 | template 116 | inline bool is_overlong_sequence(uint32_t cp, octet_difference_type length) 117 | { 118 | if (cp < 0x80) { 119 | if (length != 1) 120 | return true; 121 | } 122 | else if (cp < 0x800) { 123 | if (length != 2) 124 | return true; 125 | } 126 | else if (cp < 0x10000) { 127 | if (length != 3) 128 | return true; 129 | } 130 | 131 | return false; 132 | } 133 | 134 | enum utf_error {UTF8_OK, NOT_ENOUGH_ROOM, INVALID_LEAD, INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT}; 135 | 136 | /// Helper for get_sequence_x 137 | template 138 | utf_error increase_safely(octet_iterator& it, octet_iterator end) 139 | { 140 | if (++it == end) 141 | return NOT_ENOUGH_ROOM; 142 | 143 | if (!utf8::internal::is_trail(*it)) 144 | return INCOMPLETE_SEQUENCE; 145 | 146 | return UTF8_OK; 147 | } 148 | 149 | #define UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(IT, END) {utf_error ret = increase_safely(IT, END); if (ret != UTF8_OK) return ret;} 150 | 151 | /// get_sequence_x functions decode utf-8 sequences of the length x 152 | template 153 | utf_error get_sequence_1(octet_iterator& it, octet_iterator end, uint32_t& code_point) 154 | { 155 | if (it == end) 156 | return NOT_ENOUGH_ROOM; 157 | 158 | code_point = utf8::internal::mask8(*it); 159 | 160 | return UTF8_OK; 161 | } 162 | 163 | template 164 | utf_error get_sequence_2(octet_iterator& it, octet_iterator end, uint32_t& code_point) 165 | { 166 | if (it == end) 167 | return NOT_ENOUGH_ROOM; 168 | 169 | code_point = utf8::internal::mask8(*it); 170 | 171 | UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) 172 | 173 | code_point = ((code_point << 6) & 0x7ff) + ((*it) & 0x3f); 174 | 175 | return UTF8_OK; 176 | } 177 | 178 | template 179 | utf_error get_sequence_3(octet_iterator& it, octet_iterator end, uint32_t& code_point) 180 | { 181 | if (it == end) 182 | return NOT_ENOUGH_ROOM; 183 | 184 | code_point = utf8::internal::mask8(*it); 185 | 186 | UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) 187 | 188 | 
code_point = ((code_point << 12) & 0xffff) + ((utf8::internal::mask8(*it) << 6) & 0xfff); 189 | 190 | UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) 191 | 192 | code_point += (*it) & 0x3f; 193 | 194 | return UTF8_OK; 195 | } 196 | 197 | template 198 | utf_error get_sequence_4(octet_iterator& it, octet_iterator end, uint32_t& code_point) 199 | { 200 | if (it == end) 201 | return NOT_ENOUGH_ROOM; 202 | 203 | code_point = utf8::internal::mask8(*it); 204 | 205 | UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) 206 | 207 | code_point = ((code_point << 18) & 0x1fffff) + ((utf8::internal::mask8(*it) << 12) & 0x3ffff); 208 | 209 | UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) 210 | 211 | code_point += (utf8::internal::mask8(*it) << 6) & 0xfff; 212 | 213 | UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) 214 | 215 | code_point += (*it) & 0x3f; 216 | 217 | return UTF8_OK; 218 | } 219 | 220 | #undef UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR 221 | 222 | template 223 | utf_error validate_next(octet_iterator& it, octet_iterator end, uint32_t& code_point) 224 | { 225 | // Save the original value of it so we can go back in case of failure 226 | // Of course, it does not make much sense with i.e. 
stream iterators 227 | octet_iterator original_it = it; 228 | 229 | uint32_t cp = 0; 230 | // Determine the sequence length based on the lead octet 231 | typedef typename std::iterator_traits::difference_type octet_difference_type; 232 | const octet_difference_type length = utf8::internal::sequence_length(it); 233 | 234 | // Get trail octets and calculate the code point 235 | utf_error err = UTF8_OK; 236 | switch (length) { 237 | case 0: 238 | return INVALID_LEAD; 239 | case 1: 240 | err = utf8::internal::get_sequence_1(it, end, cp); 241 | break; 242 | case 2: 243 | err = utf8::internal::get_sequence_2(it, end, cp); 244 | break; 245 | case 3: 246 | err = utf8::internal::get_sequence_3(it, end, cp); 247 | break; 248 | case 4: 249 | err = utf8::internal::get_sequence_4(it, end, cp); 250 | break; 251 | } 252 | 253 | if (err == UTF8_OK) { 254 | // Decoding succeeded. Now, security checks... 255 | if (utf8::internal::is_code_point_valid(cp)) { 256 | if (!utf8::internal::is_overlong_sequence(cp, length)){ 257 | // Passed! Return here. 
258 | code_point = cp; 259 | ++it; 260 | return UTF8_OK; 261 | } 262 | else 263 | err = OVERLONG_SEQUENCE; 264 | } 265 | else 266 | err = INVALID_CODE_POINT; 267 | } 268 | 269 | // Failure branch - restore the original value of the iterator 270 | it = original_it; 271 | return err; 272 | } 273 | 274 | template 275 | inline utf_error validate_next(octet_iterator& it, octet_iterator end) { 276 | uint32_t ignored; 277 | return utf8::internal::validate_next(it, end, ignored); 278 | } 279 | 280 | } // namespace internal 281 | 282 | /// The library API - functions intended to be called by the users 283 | 284 | // Byte order mark 285 | const uint8_t bom[] = {0xef, 0xbb, 0xbf}; 286 | 287 | template 288 | octet_iterator find_invalid(octet_iterator start, octet_iterator end) 289 | { 290 | octet_iterator result = start; 291 | while (result != end) { 292 | utf8::internal::utf_error err_code = utf8::internal::validate_next(result, end); 293 | if (err_code != internal::UTF8_OK) 294 | return result; 295 | } 296 | return result; 297 | } 298 | 299 | template 300 | inline bool is_valid(octet_iterator start, octet_iterator end) 301 | { 302 | return (utf8::find_invalid(start, end) == end); 303 | } 304 | 305 | template 306 | inline bool starts_with_bom (octet_iterator it, octet_iterator end) 307 | { 308 | return ( 309 | ((it != end) && (utf8::internal::mask8(*it++)) == bom[0]) && 310 | ((it != end) && (utf8::internal::mask8(*it++)) == bom[1]) && 311 | ((it != end) && (utf8::internal::mask8(*it)) == bom[2]) 312 | ); 313 | } 314 | 315 | //Deprecated in release 2.3 316 | template 317 | inline bool is_bom (octet_iterator it) 318 | { 319 | return ( 320 | (utf8::internal::mask8(*it++)) == bom[0] && 321 | (utf8::internal::mask8(*it++)) == bom[1] && 322 | (utf8::internal::mask8(*it)) == bom[2] 323 | ); 324 | } 325 | } // namespace utf8 326 | 327 | #endif // header guard 328 | 329 | 330 | -------------------------------------------------------------------------------- 
/include/util/utf8/unchecked.h: -------------------------------------------------------------------------------- 1 | // Copyright 2006 Nemanja Trifunovic 2 | 3 | /* 4 | Permission is hereby granted, free of charge, to any person or organization 5 | obtaining a copy of the software and accompanying documentation covered by 6 | this license (the "Software") to use, reproduce, display, distribute, 7 | execute, and transmit the Software, and to prepare derivative works of the 8 | Software, and to permit third-parties to whom the Software is furnished to 9 | do so, all subject to the following: 10 | 11 | The copyright notices in the Software and this entire statement, including 12 | the above license grant, this restriction and the following disclaimer, 13 | must be included in all copies of the Software, in whole or in part, and 14 | all derivative works of the Software, unless such copies or derivative 15 | works are solely in the form of machine-executable object code generated by 16 | a source language processor. 17 | 18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT 21 | SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE 22 | FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, 23 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 24 | DEALINGS IN THE SOFTWARE. 
25 | */ 26 | 27 | 28 | #ifndef UTF8_FOR_CPP_UNCHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 29 | #define UTF8_FOR_CPP_UNCHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 30 | 31 | #include "core.h" 32 | 33 | namespace utf8 34 | { 35 | namespace unchecked 36 | { 37 | template 38 | octet_iterator append(uint32_t cp, octet_iterator result) 39 | { 40 | if (cp < 0x80) // one octet 41 | *(result++) = static_cast(cp); 42 | else if (cp < 0x800) { // two octets 43 | *(result++) = static_cast((cp >> 6) | 0xc0); 44 | *(result++) = static_cast((cp & 0x3f) | 0x80); 45 | } 46 | else if (cp < 0x10000) { // three octets 47 | *(result++) = static_cast((cp >> 12) | 0xe0); 48 | *(result++) = static_cast(((cp >> 6) & 0x3f) | 0x80); 49 | *(result++) = static_cast((cp & 0x3f) | 0x80); 50 | } 51 | else { // four octets 52 | *(result++) = static_cast((cp >> 18) | 0xf0); 53 | *(result++) = static_cast(((cp >> 12) & 0x3f)| 0x80); 54 | *(result++) = static_cast(((cp >> 6) & 0x3f) | 0x80); 55 | *(result++) = static_cast((cp & 0x3f) | 0x80); 56 | } 57 | return result; 58 | } 59 | 60 | template 61 | uint32_t next(octet_iterator& it) 62 | { 63 | uint32_t cp = utf8::internal::mask8(*it); 64 | typename std::iterator_traits::difference_type length = utf8::internal::sequence_length(it); 65 | switch (length) { 66 | case 1: 67 | break; 68 | case 2: 69 | it++; 70 | cp = ((cp << 6) & 0x7ff) + ((*it) & 0x3f); 71 | break; 72 | case 3: 73 | ++it; 74 | cp = ((cp << 12) & 0xffff) + ((utf8::internal::mask8(*it) << 6) & 0xfff); 75 | ++it; 76 | cp += (*it) & 0x3f; 77 | break; 78 | case 4: 79 | ++it; 80 | cp = ((cp << 18) & 0x1fffff) + ((utf8::internal::mask8(*it) << 12) & 0x3ffff); 81 | ++it; 82 | cp += (utf8::internal::mask8(*it) << 6) & 0xfff; 83 | ++it; 84 | cp += (*it) & 0x3f; 85 | break; 86 | } 87 | ++it; 88 | return cp; 89 | } 90 | 91 | template 92 | uint32_t peek_next(octet_iterator it) 93 | { 94 | return utf8::unchecked::next(it); 95 | } 96 | 97 | template 98 | uint32_t prior(octet_iterator& it) 99 | { 
100 | while (utf8::internal::is_trail(*(--it))) ; 101 | octet_iterator temp = it; 102 | return utf8::unchecked::next(temp); 103 | } 104 | 105 | // Deprecated in versions that include prior, but only for the sake of consistency (see utf8::previous) 106 | template 107 | inline uint32_t previous(octet_iterator& it) 108 | { 109 | return utf8::unchecked::prior(it); 110 | } 111 | 112 | template 113 | void advance (octet_iterator& it, distance_type n) 114 | { 115 | for (distance_type i = 0; i < n; ++i) 116 | utf8::unchecked::next(it); 117 | } 118 | 119 | template 120 | typename std::iterator_traits::difference_type 121 | distance (octet_iterator first, octet_iterator last) 122 | { 123 | typename std::iterator_traits::difference_type dist; 124 | for (dist = 0; first < last; ++dist) 125 | utf8::unchecked::next(first); 126 | return dist; 127 | } 128 | 129 | template 130 | octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result) 131 | { 132 | while (start != end) { 133 | uint32_t cp = utf8::internal::mask16(*start++); 134 | // Take care of surrogate pairs first 135 | if (utf8::internal::is_lead_surrogate(cp)) { 136 | uint32_t trail_surrogate = utf8::internal::mask16(*start++); 137 | cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET; 138 | } 139 | result = utf8::unchecked::append(cp, result); 140 | } 141 | return result; 142 | } 143 | 144 | template 145 | u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result) 146 | { 147 | while (start < end) { 148 | uint32_t cp = utf8::unchecked::next(start); 149 | if (cp > 0xffff) { //make a surrogate pair 150 | *result++ = static_cast((cp >> 10) + internal::LEAD_OFFSET); 151 | *result++ = static_cast((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN); 152 | } 153 | else 154 | *result++ = static_cast(cp); 155 | } 156 | return result; 157 | } 158 | 159 | template 160 | octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result) 161 
| { 162 | while (start != end) 163 | result = utf8::unchecked::append(*(start++), result); 164 | 165 | return result; 166 | } 167 | 168 | template 169 | u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result) 170 | { 171 | while (start < end) 172 | (*result++) = utf8::unchecked::next(start); 173 | 174 | return result; 175 | } 176 | 177 | // The iterator class 178 | template 179 | class iterator : public std::iterator { 180 | octet_iterator it; 181 | public: 182 | iterator () {} 183 | explicit iterator (const octet_iterator& octet_it): it(octet_it) {} 184 | // the default "big three" are OK 185 | octet_iterator base () const { return it; } 186 | uint32_t operator * () const 187 | { 188 | octet_iterator temp = it; 189 | return utf8::unchecked::next(temp); 190 | } 191 | bool operator == (const iterator& rhs) const 192 | { 193 | return (it == rhs.it); 194 | } 195 | bool operator != (const iterator& rhs) const 196 | { 197 | return !(operator == (rhs)); 198 | } 199 | iterator& operator ++ () 200 | { 201 | ::std::advance(it, utf8::internal::sequence_length(it)); 202 | return *this; 203 | } 204 | iterator operator ++ (int) 205 | { 206 | iterator temp = *this; 207 | ::std::advance(it, utf8::internal::sequence_length(it)); 208 | return temp; 209 | } 210 | iterator& operator -- () 211 | { 212 | utf8::unchecked::prior(it); 213 | return *this; 214 | } 215 | iterator operator -- (int) 216 | { 217 | iterator temp = *this; 218 | utf8::unchecked::prior(it); 219 | return temp; 220 | } 221 | }; // class iterator 222 | 223 | } // namespace utf8::unchecked 224 | } // namespace utf8 225 | 226 | 227 | #endif // header guard 228 | 229 | -------------------------------------------------------------------------------- /test/Makefile: -------------------------------------------------------------------------------- 1 | 2 | PUB_DIR:=../include/ 3 | SRC_INC:= ./ 4 | 5 | PROG1 = t_normalize_unit 6 | PROG2 = t_dictionary_unit 7 | PROG3 = t_segment_unit 8 | 
PROG4 = t_build_unit
PROG5 = t_suggestion_unit

CFLAGS = -W -Wall -I../

CC=g++

# Programs that currently have an enabled build rule. `all` and `run` both use
# this list, so `run` never tries to execute a binary that was not built.
# Append $(PROG1) .. $(PROG4) here when their rules below are re-enabled.
PROGS = $(PROG5)

#all: $(PROG1) $(PROG2) $(PROG3) $(PROG4) $(PROG5)
all: $(PROGS)

# None of these targets produces a file of the same name.
.PHONY: all run clean

LDFLAGS=-lboost_system -lboost_serialization -lboost_filesystem -lboost_unit_test_framework


# normalize unit test
#$(PROG1): $(PROG1).cc
#	$(CC) -g -o $(PROG1) $(PROG1).cc $(CFLAGS) $(LDFLAGS) -I $(SRC_INC) -I $(PUB_DIR)

# dictionary unit test
#$(PROG2): $(PROG2).cc
#	$(CC) -g -o $(PROG2) $(PROG2).cc $(CFLAGS) $(LDFLAGS) -I $(SRC_INC) -I $(PUB_DIR)

# segment unit test
#$(PROG3): $(PROG3).cc
#	$(CC) -g -o $(PROG3) $(PROG3).cc $(CFLAGS) $(LDFLAGS) -I $(SRC_INC) -I $(PUB_DIR)

# build engine unit test
#$(PROG4): $(PROG4).cc
#	$(CC) -g -o $(PROG4) $(PROG4).cc $(CFLAGS) $(LDFLAGS) -I $(SRC_INC) -I $(PUB_DIR)

# suggestion unit test
$(PROG5): $(PROG5).cc
	$(CC) -g -o $(PROG5) $(PROG5).cc $(CFLAGS) $(LDFLAGS) -I $(SRC_INC) -I $(PUB_DIR)

# Build first, then run every program in PROGS; stop at the first failure.
run: all
	for p in $(PROGS); do ./$$p || exit 1; done

# `*.o` replaces the original `.*o`, which matched hidden files ending in "o"
# rather than object files. `.*.txt` is kept: the tests write hidden .txt files.
clean:
	rm -rf $(PROG1) $(PROG2) $(PROG3) $(PROG4) $(PROG5) *.exe *.dSYM *.obj *.exp *.o *.lib .*.txt
-------------------------------------------------------------------------------- /test/t_build_unit.cc: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | @ File Name: t_build_unit.cc 3 | @ Method: 4 | @ Author: Jerry Shi 5 | @ Mail: jerryshi0110@gmail.com 6 | @ Created Time: Thu 21 Jul 2016 04:21:35 PM CST 7 | ************************************************************************/ 8 | #define BOOST_TEST_DYN_LINK 9 | #define BOOST_TEST_MODULE DataBuildEngineTest 10 | 11 | #include 12 | #include 13 | #include "unit_test.h" 14 | #include "buildEngine.hpp" 15 | 16 | 17 | 18 | std::string BuildEngine::res_dir_("../resource/"); //
resource dir 19 | BOOST_AUTO_TEST_SUITE (BuildEngineTest) 20 | 21 | //std::string BuildEngine::res_dir_("../resource/"); // resource dir 22 | boost::shared_ptr pBuild(new BuildEngine()); 23 | 24 | 25 | // construct 26 | //pBuild.reset(new BuildEngine()); 27 | 28 | 29 | // helper 30 | void print_vector(const std::vector& vec) { 31 | for (uint32_t i = 0; i < vec.size(); ++i) { 32 | std::cout << vec[i] << ","; 33 | } 34 | std::cout << "\n"; 35 | } 36 | 37 | 38 | void genByPrefix(const std::string& str) { 39 | std::vector chars, words, keys; 40 | pBuild->Parse(str, chars, words); 41 | 42 | pBuild->GenerateByPrefix(chars, keys, 10); 43 | std::cout << "==========GenerateByPrefix==========\nInput: " << str << std::endl; 44 | print_vector(keys); 45 | } 46 | 47 | void genByWordInfix(const std::string& str) { 48 | std::vector chars, words, keys; 49 | pBuild->Parse(str, chars, words); 50 | 51 | pBuild->GenerateByWordInfix(words, keys, 10); 52 | std::cout << "=========GenerateByWordInfix===========\nInput: " << str << std::endl; 53 | print_vector(keys); 54 | } 55 | 56 | 57 | void genByWordSuffix(const std::string& str) { 58 | std::vector chars, words, keys; 59 | pBuild->Parse(str, chars, words); 60 | 61 | pBuild->GenerateByWordSuffix(words, keys, 10); 62 | std::cout << "=========GenerateByWordSuffix===========\nInput: " << str << std::endl; 63 | print_vector(keys); 64 | } 65 | 66 | 67 | void genByPinYinPrefix(const std::string& str) { 68 | std::vector keys; 69 | 70 | pBuild->GenerateByPinYinPrefix(str, keys, 10); 71 | std::cout << "=========GenerateByPinYinPrefix===========\nInput: " << str << std::endl; 72 | print_vector(keys); 73 | } 74 | 75 | void genByShengMuPrefix(const std::string& str) { 76 | std::vector keys; 77 | 78 | pBuild->GenerateByShengMuPrefix(str, keys, 10); 79 | std::cout << "=========GenerateByPinYinPrefix===========\nInput: " << str << std::endl; 80 | print_vector(keys); 81 | } 82 | 83 | 84 | // test BuildEngine::parse 85 | void test_parse(const std::string& 
str) { 86 | std::vector chars, words; 87 | 88 | pBuild->Parse(str, chars, words); 89 | std::cout << "==========Parse==========\nInput: " << str << std::endl; 90 | std::cout << "Chars: "; 91 | print_vector(chars); 92 | std::cout << "Words: "; 93 | print_vector(words); 94 | std::cout << "====================\n"; 95 | } 96 | 97 | // generate test terms 98 | void gendata() { 99 | 100 | // construct data 101 | std::ofstream ofs(".terms.txt"); 102 | if (!ofs) { 103 | std::cout << "Open file .terms.txt file error!\n"; 104 | } 105 | // write test data 106 | ofs << "贝贝德皮诺" << "\t" << 300 << "\t" << 123 << std::endl; 107 | ofs << "bebedepino" << "\t" << 250 << "\t" << 231 << std::endl; 108 | ofs << "背背佳" << "\t" << 130 << "\t" << 42 <Build(".terms.txt"); 119 | 120 | // flush 121 | pBuild->Flush(".term.txt", ".key_terms.txt"); 122 | } 123 | 124 | // get building results 125 | void getDataModule() { 126 | gendata(); 127 | 128 | TermInfoType terms; 129 | KeyTermIDsType key_termids; 130 | pBuild->GetDataModule(terms, key_termids); 131 | 132 | std::cout << "terms size: " << terms.size() << "\tkey_term id size: " << key_termids.size() << std::endl; 133 | } 134 | 135 | // -------------------------------- 136 | 137 | // Case 1, BuildEngine::Parse() 138 | BOOST_AUTO_TEST_CASE (Parse) { 139 | 140 | // 1 141 | test_parse("贝贝德皮诺"); 142 | test_parse("bebedepino"); 143 | test_parse("贝贝德皮诺bebedepino"); 144 | test_parse("贝贝德皮诺》《bebed*)epino"); 145 | } 146 | 147 | // Case 2, BuildEngine::GenerateByPrefix() 148 | BOOST_AUTO_TEST_CASE (GenerateByPrefix) { 149 | genByPrefix("贝贝德=皮诺"); 150 | genByPrefix("bebede)pino"); 151 | genByPrefix("贝贝德皮诺bebedep"); 152 | } 153 | 154 | // Case 3, BuildEngine::GenerateByWordInfix() 155 | BOOST_AUTO_TEST_CASE (GenerateByWordInfix) { 156 | 157 | genByWordInfix("贝贝德皮诺"); 158 | } 159 | 160 | // Case 4, BuildEnginie::GenerateByWordSuffix() 161 | BOOST_AUTO_TEST_CASE (GenByWordSuffix) { 162 | 163 | genByWordSuffix("贝贝德皮诺"); 164 | } 165 | 166 | // Case 5, 
BuildEngine::GenerateByPinYinPrefix() 167 | BOOST_AUTO_TEST_CASE (GenByPinYinPrefix) { 168 | 169 | genByPinYinPrefix("贝贝德皮诺"); 170 | } 171 | 172 | // Case 6, BuildEngine::GenerateByShengMuPrefix() 173 | BOOST_AUTO_TEST_CASE (GenByShengMuPrefix) { 174 | 175 | genByShengMuPrefix("银行"); 176 | } 177 | 178 | // Case 7, BuildEngine::Build() and BuildEngine::Flush() 179 | BOOST_AUTO_TEST_CASE (BuildingAndFlush) { 180 | 181 | build(); 182 | } 183 | 184 | // Case 8, BuildEngine::GetDataModule() 185 | BOOST_AUTO_TEST_CASE (GetDataModule) { 186 | 187 | getDataModule(); 188 | } 189 | 190 | BOOST_AUTO_TEST_SUITE_END() 191 | 192 | -------------------------------------------------------------------------------- /test/t_dictionary_unit.cc: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | @ File Name: t_dictionary_unit.cc 3 | @ Method: 4 | @ Author: Jerry Shi 5 | @ Mail: jerryshi0110@gmail.com 6 | @ Created Time: Fri 15 Jul 2016 10:50:56 AM CST 7 | ************************************************************************/ 8 | #define BOOST_TEST_DYN_LINK 9 | #define BOOST_TEST_MODULE DictionaryTest 10 | #include 11 | #include "unit_test.h" 12 | #include "dictionary.hpp" 13 | 14 | // name of the test suite is DictionaryTest 15 | BOOST_AUTO_TEST_SUITE (DictionaryTest) 16 | 17 | // Dictionary object 18 | std::string dir("../resource/cn"); 19 | Dictionary seg(dir); 20 | 21 | // show segmentation results 22 | void display_tokens(const std::string& input) { 23 | std::vector vec; 24 | seg.Segment(input, vec); 25 | std::cout << "Input:" << input << std::endl; 26 | for (uint32_t i = 0; i < vec.size(); ++i) { 27 | std::cout << "Result: " << vec[i] << ","; 28 | } 29 | std::cout << std::endl; 30 | } 31 | 32 | // show pinyin conversion to chinese results 33 | void display_convert_cn(const std::string& input) { 34 | std::vector vec; 35 | if (!seg.GetChar(input, vec)) 36 | return; 37 | 
38 | std::cout << "Input PinYin:" << input << std::endl; 39 | for (uint32_t i = 0; i < vec.size(); ++i) { 40 | std::cout << "Result: " << vec[i] << ","; 41 | } 42 | std::cout << std::endl; 43 | } 44 | 45 | // show chinese conversion to pinyin results 46 | void display_convert_py(const std::string& input) { 47 | std::vector vec; 48 | if (!seg.GetPinYin(input, vec)) 49 | return; 50 | 51 | std::cout << "Input Cn:" << input << std::endl; 52 | for (uint32_t i = 0; i < vec.size(); ++i) { 53 | std::cout << "Result: " << vec[i] << ","; 54 | } 55 | std::cout << std::endl; 56 | } 57 | 58 | // show single chinese conversion to pinyin results 59 | void display_cn_convert_py(const std::string& input) { 60 | std::vector vec; 61 | if (!seg.GetPinYinTerm(input, vec)) 62 | return; 63 | 64 | std::cout << "Input Cn:" << input << std::endl; 65 | for (uint32_t i = 0; i < vec.size(); ++i) { 66 | std::cout << "Result: " << vec[i] << ","; 67 | } 68 | std::cout << std::endl; 69 | } 70 | 71 | // Case 1 , pinyin segmentation 72 | BOOST_AUTO_TEST_CASE(PinYinSegment) { 73 | display_tokens("nanaodeye"); 74 | display_tokens("mangzuoni"); 75 | display_tokens("woyaochitang"); 76 | display_tokens("woyaochitan"); 77 | display_tokens("congxin"); 78 | 79 | display_tokens("yinhangjiaapple"); 80 | display_tokens("gongsicompany"); 81 | display_tokens("shangshi123"); 82 | 83 | display_tokens("qingguangxu15nian"); 84 | display_tokens("sanxingSUMSONG"); 85 | 86 | } 87 | 88 | // Case 2, pinyin convert to chinese characters 89 | BOOST_AUTO_TEST_CASE(PinYin2Cn) { 90 | display_convert_cn("zhen"); 91 | display_convert_cn("zi"); 92 | } 93 | 94 | // Case 3, single chinese words conver to pinyin 95 | BOOST_AUTO_TEST_CASE(Cn2PinYin) { 96 | display_cn_convert_py("白"); 97 | display_cn_convert_py("爱"); 98 | display_cn_convert_py("鬼"); 99 | } 100 | 101 | // Case 4, chinese string convers to pinyin 102 | BOOST_AUTO_TEST_CASE(CnStr2PinYin) { 103 | display_convert_py("我们的爱"); 104 | display_convert_py("你在哪儿"); 105 | 
display_convert_py("中过"); 106 | display_convert_py("哈哈"); 107 | 108 | display_convert_py("银行apple"); 109 | display_convert_py("女王大人1234"); 110 | display_convert_py("女王520么么哒"); 111 | display_convert_py("尹汝杰541帮五买"); 112 | } 113 | 114 | BOOST_AUTO_TEST_SUITE_END() 115 | -------------------------------------------------------------------------------- /test/t_normalize_unit.cc: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | @ File Name: t_normalize_unit.cc 3 | @ Method: 4 | @ Author: Jerry Shi 5 | @ Mail: jerryshi0110@gmail.com 6 | @ Created Time: Tue 12 Jul 2016 05:30:00 PM CST 7 | ************************************************************************/ 8 | #define BOOST_TEST_DYN_LINK 9 | #define BOOST_TEST_MODULE normalizeTest 10 | //#include 11 | #include 12 | #include "unit_test.h" 13 | #include "util/normalize.h" 14 | 15 | // name of the test suite is normalizeTest 16 | BOOST_AUTO_TEST_SUITE (normalizeTest) 17 | 18 | // 1 Normalize::ToLower() 19 | // Convert string to lower case 20 | BOOST_AUTO_TEST_CASE (ToLower) { 21 | std::string str("ABCD-HK.3SG"); 22 | Normalize::ToLower(str); 23 | BOOST_CHECK_MESSAGE(str == "abcd-hk.3sg", "ToLower result: " << str); 24 | } 25 | 26 | // 2 Normalize::ToUpper() 27 | // Convert string to upper case 28 | BOOST_AUTO_TEST_CASE (ToUpper) { 29 | std::string str("i love you 小红!"); 30 | Normalize::ToUpper(str); 31 | BOOST_CHECK_MESSAGE(str == "I LOVE YOU 小红!", "ToUpper result: " << str); 32 | } 33 | 34 | // 3 Normalize::ToUTF8() 35 | // Convert to utf8 encoding 36 | BOOST_AUTO_TEST_CASE (UTF8Encoding) { 37 | std::string str("a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z"); 38 | Normalize::ToUTF8(str); 39 | BOOST_CHECK_MESSAGE(str == "a����z", "To utf8 result:" << str); 40 | str = "把"; 41 | // Normalize::ToUTF8(str); 42 | // std::cout << "T: " << str << "\t T[0]:" << tt << std::endl; 43 | } 44 | 45 | // 4 Normalize::IsValidUTF8() 
46 | // Check a string is a valid utf8 encoding 47 | BOOST_AUTO_TEST_CASE (IsUTF8Encoding) { 48 | // unknown encoding 日ш 49 | std::string str("\xe6\x97\xa5\xd1\x88\xfa"); 50 | bool flag = Normalize::IsValidUTF8(str); 51 | BOOST_CHECK_MESSAGE(flag == false, "IsUTF8Encoding result: " << flag); 52 | } 53 | 54 | // 5 Normalize::IsChinese() 55 | // Determine whether a string is a chinese characters 56 | BOOST_AUTO_TEST_CASE (IsChinese) { 57 | // not chinese as it contains "》" 58 | std::string str("我爱中国》"); 59 | bool flag = Normalize::IsChinese(str); 60 | BOOST_CHECK_MESSAGE( flag == false, str << "IsChinese result: " << flag); 61 | str = "青青原上草"; 62 | flag = Normalize::IsChinese(str); 63 | BOOST_CHECK_MESSAGE( flag == true, str << " IsChinese result: " << flag); 64 | } 65 | 66 | // 6 Normalize::ToUnicode() 67 | // Get utf16 encoding arrary of a string 68 | BOOST_AUTO_TEST_CASE (ToUnicode) { 69 | std::string str("大智若愚"); 70 | std::vector unicodes; 71 | Normalize::ToUnicode(str, unicodes); 72 | BOOST_CHECK_MESSAGE( unicodes.size() == 4, str << "To utf16 size: " << unicodes.size()); 73 | 74 | str = "银行abc"; 75 | Normalize::ToUnicode(str, unicodes); 76 | BOOST_CHECK_MESSAGE( unicodes.size() == 5, str << "To utf16 size: " << unicodes.size()); 77 | } 78 | 79 | // 7 Normalize::UnicodeToUTF8Str() 80 | // Convert a chinese character with utf16 encoding(uint16_t) to a utf8 string 81 | BOOST_AUTO_TEST_CASE (UnicodesToUTF8Str) { 82 | std::string str("中华人民共和国"); 83 | std::vector unicodes; 84 | Normalize::ToUnicode(str, unicodes); 85 | std::string utf8str; 86 | Normalize::UnicodeToUTF8Str(unicodes, utf8str); 87 | BOOST_CHECK_MESSAGE( str == utf8str, str << "After utf16 encoding and decoding: " << utf8str); 88 | 89 | str = "连衣裙Love"; 90 | Normalize::ToUnicode(str, unicodes); 91 | Normalize::UnicodeToUTF8Str(unicodes, utf8str); 92 | BOOST_CHECK_MESSAGE( str == utf8str, str << "After utf16 encoding and decoding: " << utf8str); 93 | } 94 | 95 | // 8 Normalize::UnicodeToUTF8Str() 96 | // 
Convert a chinese character with utf16 encoding(uint16_t) to a utf8 string 97 | BOOST_AUTO_TEST_CASE (UnicodeToUTF8Str) { 98 | std::string str("爱"); 99 | std::vector unicodes; 100 | Normalize::ToUnicode(str, unicodes); 101 | std::string utf8str; 102 | Normalize::UnicodeToUTF8Str(unicodes[0], utf8str); 103 | BOOST_CHECK_MESSAGE( str == utf8str, str << "After utf16 encoding and decoding: " << utf8str); 104 | } 105 | 106 | // 9 Normalize::IsDigit() 107 | BOOST_AUTO_TEST_CASE (IsDigit) { 108 | bool flag = false; 109 | flag = Normalize::IsDigit('2'); 110 | BOOST_CHECK_MESSAGE( flag == true, "'s' IsDigit result: " << flag); 111 | 112 | flag = Normalize::IsDigit('w'); 113 | BOOST_CHECK_MESSAGE(flag == false, "'w' IsDigit result: " << flag); 114 | } 115 | 116 | // 10 Normalize::IsAlpha() 117 | BOOST_AUTO_TEST_CASE (IsAlpha) { 118 | bool flag = false; 119 | flag = Normalize::IsAlpha('z'); 120 | BOOST_CHECK_MESSAGE( flag == true, "'z' IsDigit result: " << flag); 121 | 122 | flag = Normalize::IsAlpha('3'); 123 | BOOST_CHECK_MESSAGE(flag == false, "'3' IsDigit result: " << flag); 124 | } 125 | 126 | // 11 Normalize::IsConnector() 127 | BOOST_AUTO_TEST_CASE (IsConnector) { 128 | bool flag = false; 129 | flag = Normalize::IsConnector('-'); 130 | BOOST_CHECK_MESSAGE( flag == true, "'-' IsDigit result: " << flag); 131 | 132 | flag = Normalize::IsConnector('.'); 133 | BOOST_CHECK_MESSAGE(flag == true, "'.' 
IsDigit result: " << flag); 134 | 135 | flag = Normalize::IsConnector('+'); 136 | BOOST_CHECK_MESSAGE(flag == true, "'+' IsDigit result: " << flag); 137 | 138 | flag = Normalize::IsConnector('='); 139 | BOOST_CHECK_MESSAGE(flag == false, "'=' IsDigit result: " << flag); 140 | } 141 | 142 | // 12 Normalize::IsBreakPunct() 143 | BOOST_AUTO_TEST_CASE (IsPunct) { 144 | bool flag = false; 145 | flag = Normalize::IsBreakPunct('['); 146 | BOOST_CHECK_MESSAGE( flag == true, "'[' IsDigit result: " << flag); 147 | 148 | flag = Normalize::IsBreakPunct(']'); 149 | BOOST_CHECK_MESSAGE(flag == true, "']' IsDigit result: " << flag); 150 | 151 | flag = Normalize::IsBreakPunct('('); 152 | BOOST_CHECK_MESSAGE(flag == true, "'(' IsDigit result: " << flag); 153 | 154 | flag = Normalize::IsBreakPunct(')'); 155 | BOOST_CHECK_MESSAGE(flag == true, "')' IsDigit result: " << flag); 156 | 157 | flag = Normalize::IsBreakPunct('{'); 158 | BOOST_CHECK_MESSAGE(flag == true, "'{' IsDigit result: " << flag); 159 | 160 | flag = Normalize::IsBreakPunct('}'); 161 | BOOST_CHECK_MESSAGE(flag == true, "'}' IsDigit result: " << flag); 162 | 163 | flag = Normalize::IsBreakPunct('*'); 164 | BOOST_CHECK_MESSAGE(flag == false, "'*' IsDigit result: " << flag); 165 | } 166 | 167 | // 13 Normalize::IsPunctuation() 168 | BOOST_AUTO_TEST_CASE (IsPunctuation) { 169 | bool flag = false; 170 | flag = Normalize::IsPunctuation('['); 171 | BOOST_CHECK_MESSAGE( flag == true, "'[' IsDigit result: " << flag); 172 | 173 | flag = Normalize::IsPunctuation(','); 174 | BOOST_CHECK_MESSAGE( flag == true, "',' IsDigit result: " << flag); 175 | 176 | flag = Normalize::IsPunctuation('?'); 177 | BOOST_CHECK_MESSAGE( flag == true, "'?' 
IsDigit result: " << flag); 178 | } 179 | 180 | BOOST_AUTO_TEST_SUITE_END() 181 | 182 | 183 | -------------------------------------------------------------------------------- /test/t_segment_unit.cc: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | @ File Name: t_segment_unit.cc 3 | @ Method: 4 | @ Author: Jerry Shi 5 | @ Mail: jerryshi0110@gmail.com 6 | @ Created Time: Wed 20 Jul 2016 02:54:25 PM CST 7 | ************************************************************************/ 8 | #define BOOST_TEST_DYN_LINK 9 | #define BOOST_TEST_MODULE SegmentWrapper 10 | 11 | #include 12 | #include "unit_test.h" 13 | #include "segmentWrapper.h" 14 | 15 | BOOST_AUTO_TEST_SUITE (segment) 16 | 17 | std::string resDir("../resource/dict"); 18 | std::auto_ptr segWrapper_(new SegmentWrapper(resDir)); 19 | 20 | 21 | // show results 22 | void show_tokens(const std::string& str) { 23 | std::vector tokens; 24 | segWrapper_->segment(str, tokens, false); 25 | 26 | std::cout <<"Input:" < 13 | #include "suggestion.hpp" 14 | 15 | boost::shared_ptr pSuggest(new Suggestion("../resource/")); 16 | 17 | BOOST_AUTO_TEST_SUITE (SuggestionTest) 18 | 19 | // generate test terms 20 | void gendata() { 21 | 22 | // construct data 23 | std::ofstream ofs(".terms.txt"); 24 | if (!ofs) { 25 | std::cout << "Open file .terms.txt file error!\n"; 26 | } 27 | // write test data 28 | ofs << "贝贝德皮诺" << "\t" << 300 << "\t" << 123 << std::endl; 29 | ofs << "bebedepino" << "\t" << 250 << "\t" << 231 << std::endl; 30 | ofs << "背背佳" << "\t" << 130 << "\t" << 42 <RemoveSpace(str) << std::endl; 39 | } 40 | 41 | // Case 1, Suggestion::RemoveSpace() 42 | BOOST_AUTO_TEST_CASE (removeSpace) { 43 | 44 | print_str("be bedi pino"); 45 | } 46 | 47 | BOOST_AUTO_TEST_SUITE_END() 48 | -------------------------------------------------------------------------------- /test/unit_test.h: 
-------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | @ File Name: unit_test.h 3 | @ Method: 4 | @ Author: Jerry Shi 5 | @ Mail: jerryshi0110@gmail.com 6 | @ Created Time: Wed 13 Jul 2016 01:22:10 PM CST 7 | ************************************************************************/ 8 | #ifndef UNIT_TEST_H 9 | #define UNIT_TEST_H 10 | 11 | #include 12 | //#include 13 | #include 14 | #endif // unit_test.h 15 | 16 | --------------------------------------------------------------------------------