├── .gitmodules ├── CMakeLists.txt ├── README.md ├── export.py ├── include └── tts.hpp ├── resource ├── jieba │ ├── hmm_model.utf8 │ ├── idf.utf8 │ ├── jieba.dict.utf8 │ ├── pos_dict │ │ ├── char_state_tab.utf8 │ │ ├── prob_emit.utf8 │ │ ├── prob_start.utf8 │ │ └── prob_trans.utf8 │ ├── stop_words.utf8 │ └── user.dict.utf8 └── pinyin │ └── mandarin │ ├── phrases_dict.txt │ ├── phrases_map.txt │ ├── trans_word.txt │ ├── user_dict.txt │ └── word.txt └── src ├── 3rd_include ├── cpp-pinyin │ ├── CanTone.h │ ├── ChineseG2p.h │ ├── ChineseG2p_p.h │ ├── DictUtil.h │ ├── G2pglobal.h │ ├── Jyutping.h │ ├── ManTone.h │ ├── ManToneUtil.h │ ├── Pinyin.h │ ├── PinyinGlobal.h │ ├── PinyinRes.h │ ├── ToFinal.hpp │ ├── ToneConverter.h │ └── U16Str.h ├── cppjieba │ ├── DictTrie.hpp │ ├── FullSegment.hpp │ ├── HMMModel.hpp │ ├── HMMSegment.hpp │ ├── Jieba.hpp │ ├── KeywordExtractor.hpp │ ├── MPSegment.hpp │ ├── MixSegment.hpp │ ├── PosTagger.hpp │ ├── PreFilter.hpp │ ├── QuerySegment.hpp │ ├── SegmentBase.hpp │ ├── SegmentTagged.hpp │ ├── TextRankExtractor.hpp │ ├── Trie.hpp │ └── Unicode.hpp ├── limonp │ ├── ArgvContext.hpp │ ├── Closure.hpp │ ├── Colors.hpp │ ├── Condition.hpp │ ├── Config.hpp │ ├── ForcePublic.hpp │ ├── LocalVector.hpp │ ├── Logging.hpp │ ├── NonCopyable.hpp │ ├── StdExtension.hpp │ └── StringUtil.hpp └── zh_normalization │ ├── TextNormalizer.h │ ├── chinese_converter.h │ ├── chronology.h │ ├── constants.h │ ├── num.h │ ├── phonecode.h │ └── quantifier.h ├── cpp-pinyin ├── CanTone.cpp ├── ChineseG2p.cpp ├── DictUtil.cpp ├── G2pglobal.cpp ├── Jyutping.cpp ├── ManTone.cpp ├── ManToneUtil.cpp ├── Pinyin.cpp ├── PinyinRes.cpp ├── ToneConverter.cpp └── U16Str.cpp ├── tokenizer.cpp ├── tokenizer.hpp ├── tts.cpp ├── tts_demo.cpp ├── ttsconfig.hpp ├── zh_normalization ├── TextNormalizer.cpp ├── chinese_converter.cpp ├── chronology.cpp ├── constants.cpp ├── num.cpp ├── phonecode.cpp └── quantifier.cpp ├── zhg2p.cpp └── zhg2p.hpp /.gitmodules: 
-------------------------------------------------------------------------------- 1 | [submodule "MNN"] 2 | path = MNN 3 | url = https://github.com/alibaba/MNN.git 4 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.5) 2 | project(mnn-tts) 3 | 4 | if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang") 5 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -stdlib=libc++") 6 | endif() 7 | 8 | if (MSVC) 9 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /std:c++17") 10 | add_compile_options("$<$:/source-charset:utf-8>") 11 | else() 12 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17") 13 | endif() 14 | 15 | set(MNN_LOW_MEMORY ON CACHE BOOL "Open MNN_LOW_MEMORY" FORCE) 16 | set(MNN_SUPPORT_TRANSFORMER_FUSE ON CACHE BOOL "Open MNN_SUPPORT_TRANSFORMER_FUSE" FORCE) 17 | set(MNN_BUILD_AUDIO ON CACHE BOOL "Open MNN_BUILD_AUDIO" FORCE) 18 | add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/MNN) 19 | 20 | # include dir 21 | include_directories(${CMAKE_CURRENT_LIST_DIR}/include/ 22 | ${CMAKE_CURRENT_LIST_DIR}/src/3rd_include/ 23 | ${CMAKE_CURRENT_LIST_DIR}/MNN/include/ 24 | ${CMAKE_CURRENT_LIST_DIR}/MNN/3rd_party/ 25 | ${CMAKE_CURRENT_LIST_DIR}/MNN/tools/audio/include/ 26 | ) 27 | 28 | # source files 29 | FILE(GLOB SRCS ${CMAKE_CURRENT_LIST_DIR}/src/*.cpp 30 | ${CMAKE_CURRENT_LIST_DIR}/src/cpp-pinyin/*.cpp 31 | ${CMAKE_CURRENT_LIST_DIR}/src/zh_normalization/*.cpp) 32 | add_executable(tts_demo ${SRCS}) 33 | 34 | target_link_libraries(tts_demo MNN MNN_Express MNNAudio) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # mnn-tts 2 | 3 | 目前仅支持[Kokoro-82M](https://huggingface.co/hexgrad/Kokoro-82M)。 4 | 5 | 6 | ## 模型导出 7 | 8 | ``` 9 | cd mnn-tts 10 | huggingface-cli download --resume-download 
onnx-community/Kokoro-82M-v1.0-ONNX --local-dir Kokoro-82M-v1.0-ONNX 11 | python export.py --path ./Kokoro-82M-v1.0-ONNX 12 | ``` 13 | 14 | ## 模型测试 15 | 16 | ``` 17 | mkdir build 18 | cd build 19 | cmake .. && make -j32 20 | ./tts_demo ../model/config.json 你好 21 | open output.wav 22 | ``` 23 | 24 | ## 参考项目 25 | - [kokoro](https://github.com/hexgrad/kokoro) 26 | - [misaki](https://pypi.org/project/misaki/) 27 | - [cppjieba](https://github.com/yanyiwu/cppjieba) 28 | - [cpp-pinyin](https://github.com/wolfgitpr/cpp-pinyin) -------------------------------------------------------------------------------- /export.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import json 4 | import glob 5 | import base64 6 | import argparse 7 | import numpy as np 8 | import MNN.expr as expr 9 | from MNN.tools import mnnconvert 10 | 11 | RESET = "\033[0m" 12 | GREEN = "\033[32;1m" 13 | YELLOW = "\033[33;4m" 14 | 15 | class Kokoro: 16 | 17 | def __init__(self, args): 18 | self.model_path = args.path 19 | self.dst_path = args.dst_path 20 | if not os.path.exists(self.dst_path): 21 | os.makedirs(self.dst_path) 22 | if os.path.exists(args.mnnconvert): 23 | self.mnnconvert = args.mnnconvert 24 | else: 25 | self.mnnconvert = None 26 | 27 | def convert(self, onnx_path, mnn_path): 28 | convert_args = [ 29 | '', 30 | '-f', 31 | 'ONNX', 32 | '--modelFile', 33 | str(onnx_path), 34 | '--MNNModel', 35 | str(mnn_path), 36 | '--weightQuantBits', 37 | '8', 38 | #'--weightQuantBlock', 39 | #'128' 40 | ] 41 | sfd = os.dup(1) 42 | log_fp = open('./.export.log', "a") 43 | log_fd = log_fp.fileno() 44 | # mnnconvert ... 
> .export.log 45 | os.dup2(log_fd, 1) 46 | try: 47 | sys.argv = convert_args 48 | sys.argc = len(convert_args) 49 | if self.mnnconvert is None: 50 | mnnconvert.main() 51 | else: 52 | convert_args[0] = self.mnnconvert 53 | cmd = ' '.join(convert_args) 54 | message = os.popen(cmd).read() 55 | print(message) 56 | sys.argv = [] 57 | finally: 58 | os.dup2(sfd, 1) 59 | os.close(log_fd) 60 | 61 | def export_model(self): 62 | onnx_file = os.path.join(self.model_path, "onnx", "model.onnx") 63 | self.convert(onnx_file, f'{self.dst_path}/tts.mnn') 64 | 65 | def export_voice(self): 66 | voices = [] 67 | self.styles = [] 68 | for voice_file in glob.glob(os.path.join(self.model_path, "voices", "*.bin")): 69 | style = os.path.basename(voice_file).split('.')[0] 70 | if style != 'zf_xiaoxiao': continue 71 | voice = np.fromfile(voice_file, dtype=np.float32).reshape(-1, 1, 256) 72 | voices.append(expr.const(voice, voice.shape, expr.NCHW, expr.float)) 73 | expr.save(voices, f'{self.dst_path}/voices.mnn') 74 | 75 | def export_config(self): 76 | tts_config = {} 77 | tts_config['styles'] = self.styles 78 | with open(f'{self.dst_path}/tts_config.json', 'w', encoding='utf-8') as f: 79 | json.dump(tts_config, f, ensure_ascii=False, indent=4) 80 | with open(f'{self.dst_path}/config.json', 'w', encoding='utf-8') as f: 81 | config = { 82 | "tts_model": "tts.mnn", 83 | "voices": "voices.mnn", 84 | "tokenizer_file": "tokenizer.txt", 85 | "backend_type": "cpu", 86 | "thread_num": 4, 87 | "precision": "low", 88 | "memory": "low", 89 | } 90 | json.dump(config, f, ensure_ascii=False, indent=4) 91 | 92 | def export_tokenizer(self): 93 | # TOKENIZER MAGIC NUMBER 94 | MAGIC_NUMBER = 430 95 | # TOKENIZER TYPE 96 | SENTENCEPIECE = 0; TIKTOIKEN = 1; BERT = 2; HUGGINGFACE = 3 97 | def write_line(fp, *args): 98 | for arg in args: 99 | for token in arg: 100 | fp.write(str(token) + ' ') 101 | fp.write('\n') 102 | def write_header(fp, type, stop_ids, speicals, prefix = []): 103 | fp.write(f'{MAGIC_NUMBER} 
{type}\n') 104 | fp.write(f'{len(speicals)} {len(stop_ids)} {len(prefix)}\n') 105 | write_line(fp, speicals, stop_ids, prefix) 106 | 107 | tokenizer_file = os.path.join(self.model_path, "tokenizer.json") 108 | with open(tokenizer_file, "r", encoding="utf-8") as f: 109 | vocab_dict = json.load(f)['model']['vocab'] 110 | vocab_size = 0 111 | for k, v in vocab_dict.items(): 112 | vocab_size = max(vocab_size, v) 113 | vocab_list = ['' for i in range(vocab_size + 1)] 114 | for k, v in vocab_dict.items(): 115 | vocab_list[v] = k 116 | file_path = os.path.join(self.dst_path, "tokenizer.txt") 117 | with open(file_path, "w", encoding="utf8") as fp: 118 | write_header(fp, TIKTOIKEN, [], [], []) 119 | fp.write(f'{len(vocab_list)}\n') 120 | for v in vocab_list: 121 | line = base64.b64encode(v.encode('utf-8')).decode("utf8") + "\n" 122 | fp.write(line) 123 | 124 | def export(self): 125 | self.export_model() 126 | self.export_voice() 127 | self.export_tokenizer() 128 | self.export_config() 129 | print(f'{GREEN}[SUCCESS]{RESET} export model to {YELLOW}{self.dst_path}{RESET}') 130 | 131 | 132 | if __name__ == '__main__': 133 | parser = argparse.ArgumentParser(description='tts_exporter', formatter_class=argparse.RawTextHelpFormatter) 134 | parser.add_argument('--path', type=str, required=True, help='path of model.') 135 | parser.add_argument('--dst_path', type=str, default='./model', help='export onnx/mnn model to path, defaut is `./model`.') 136 | parser.add_argument('--mnnconvert', type=str, default='../../../build/MNNConvert', help='local mnnconvert path, if invalid, using pymnn.') 137 | args = parser.parse_args() 138 | kokoro = Kokoro(args) 139 | kokoro.export() -------------------------------------------------------------------------------- /include/tts.hpp: -------------------------------------------------------------------------------- 1 | // 2 | // tts.hpp 3 | // 4 | // Created by MNN on 2025/2/20. 
5 | // ZhaodeWang 6 | // 7 | 8 | #ifndef TTS_hpp 9 | #define TTS_hpp 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | 21 | #include 22 | #include 23 | #include 24 | #include 25 | 26 | namespace MNN { 27 | namespace Transformer { 28 | 29 | class TtsConfig; 30 | class Tokenizer; 31 | class Zhg2p; 32 | 33 | class MNN_PUBLIC Tts { 34 | public: 35 | static Tts* createTTS(const std::string& config_path); 36 | static void save(const std::string& file, Express::VARP wavform); 37 | Tts(std::shared_ptr config) : config_(config) {} 38 | virtual ~Tts(); 39 | void load(); 40 | Express::VARP generate(const std::string& text, float speed = 1.0f); 41 | private: 42 | std::shared_ptr config_; 43 | std::shared_ptr tokenizer_; 44 | std::shared_ptr runtime_manager_; 45 | std::shared_ptr module_; 46 | std::vector voices_; 47 | std::shared_ptr g2p_; 48 | }; 49 | 50 | } 51 | } 52 | 53 | #endif // TTS_hpp 54 | -------------------------------------------------------------------------------- /resource/jieba/pos_dict/prob_start.utf8: -------------------------------------------------------------------------------- 1 | #初始状态的概率 2 | #格式 3 | #状态:概率 4 | B,a:-4.7623052146 5 | B,ad:-6.68006603678 6 | B,ag:-3.14e+100 7 | B,an:-8.69708322302 8 | B,b:-5.01837436211 9 | B,bg:-3.14e+100 10 | B,c:-3.42388018495 11 | B,d:-3.97504752976 12 | B,df:-8.88897423083 13 | B,dg:-3.14e+100 14 | B,e:-8.56355183039 15 | B,en:-3.14e+100 16 | B,f:-5.49163041848 17 | B,g:-3.14e+100 18 | B,h:-13.53336513 19 | B,i:-6.11578472756 20 | B,in:-3.14e+100 21 | B,j:-5.05761912847 22 | B,jn:-3.14e+100 23 | B,k:-3.14e+100 24 | B,l:-4.90588358466 25 | B,ln:-3.14e+100 26 | B,m:-3.6524299819 27 | B,mg:-3.14e+100 28 | B,mq:-6.7869530014 29 | B,n:-1.69662577975 30 | B,ng:-3.14e+100 31 | B,nr:-2.23104959138 32 | B,nrfg:-5.87372217541 33 | B,nrt:-4.98564273352 34 | B,ns:-2.8228438315 35 | B,nt:-4.84609166818 36 | B,nz:-3.94698846058 37 | 
B,o:-8.43349870215 38 | B,p:-4.20098413209 39 | B,q:-6.99812385896 40 | B,qe:-3.14e+100 41 | B,qg:-3.14e+100 42 | B,r:-3.40981877908 43 | B,rg:-3.14e+100 44 | B,rr:-12.4347528413 45 | B,rz:-7.94611647157 46 | B,s:-5.52267359084 47 | B,t:-3.36474790945 48 | B,tg:-3.14e+100 49 | B,u:-9.1639172775 50 | B,ud:-3.14e+100 51 | B,ug:-3.14e+100 52 | B,uj:-3.14e+100 53 | B,ul:-3.14e+100 54 | B,uv:-3.14e+100 55 | B,uz:-3.14e+100 56 | B,v:-2.67405848743 57 | B,vd:-9.04472876024 58 | B,vg:-3.14e+100 59 | B,vi:-12.4347528413 60 | B,vn:-4.33156108902 61 | B,vq:-12.1470707689 62 | B,w:-3.14e+100 63 | B,x:-3.14e+100 64 | B,y:-9.84448567586 65 | B,yg:-3.14e+100 66 | B,z:-7.04568111149 67 | B,zg:-3.14e+100 68 | E,a:-3.14e+100 69 | E,ad:-3.14e+100 70 | E,ag:-3.14e+100 71 | E,an:-3.14e+100 72 | E,b:-3.14e+100 73 | E,bg:-3.14e+100 74 | E,c:-3.14e+100 75 | E,d:-3.14e+100 76 | E,df:-3.14e+100 77 | E,dg:-3.14e+100 78 | E,e:-3.14e+100 79 | E,en:-3.14e+100 80 | E,f:-3.14e+100 81 | E,g:-3.14e+100 82 | E,h:-3.14e+100 83 | E,i:-3.14e+100 84 | E,in:-3.14e+100 85 | E,j:-3.14e+100 86 | E,jn:-3.14e+100 87 | E,k:-3.14e+100 88 | E,l:-3.14e+100 89 | E,ln:-3.14e+100 90 | E,m:-3.14e+100 91 | E,mg:-3.14e+100 92 | E,mq:-3.14e+100 93 | E,n:-3.14e+100 94 | E,ng:-3.14e+100 95 | E,nr:-3.14e+100 96 | E,nrfg:-3.14e+100 97 | E,nrt:-3.14e+100 98 | E,ns:-3.14e+100 99 | E,nt:-3.14e+100 100 | E,nz:-3.14e+100 101 | E,o:-3.14e+100 102 | E,p:-3.14e+100 103 | E,q:-3.14e+100 104 | E,qe:-3.14e+100 105 | E,qg:-3.14e+100 106 | E,r:-3.14e+100 107 | E,rg:-3.14e+100 108 | E,rr:-3.14e+100 109 | E,rz:-3.14e+100 110 | E,s:-3.14e+100 111 | E,t:-3.14e+100 112 | E,tg:-3.14e+100 113 | E,u:-3.14e+100 114 | E,ud:-3.14e+100 115 | E,ug:-3.14e+100 116 | E,uj:-3.14e+100 117 | E,ul:-3.14e+100 118 | E,uv:-3.14e+100 119 | E,uz:-3.14e+100 120 | E,v:-3.14e+100 121 | E,vd:-3.14e+100 122 | E,vg:-3.14e+100 123 | E,vi:-3.14e+100 124 | E,vn:-3.14e+100 125 | E,vq:-3.14e+100 126 | E,w:-3.14e+100 127 | E,x:-3.14e+100 128 | E,y:-3.14e+100 129 | 
E,yg:-3.14e+100 130 | E,z:-3.14e+100 131 | E,zg:-3.14e+100 132 | M,a:-3.14e+100 133 | M,ad:-3.14e+100 134 | M,ag:-3.14e+100 135 | M,an:-3.14e+100 136 | M,b:-3.14e+100 137 | M,bg:-3.14e+100 138 | M,c:-3.14e+100 139 | M,d:-3.14e+100 140 | M,df:-3.14e+100 141 | M,dg:-3.14e+100 142 | M,e:-3.14e+100 143 | M,en:-3.14e+100 144 | M,f:-3.14e+100 145 | M,g:-3.14e+100 146 | M,h:-3.14e+100 147 | M,i:-3.14e+100 148 | M,in:-3.14e+100 149 | M,j:-3.14e+100 150 | M,jn:-3.14e+100 151 | M,k:-3.14e+100 152 | M,l:-3.14e+100 153 | M,ln:-3.14e+100 154 | M,m:-3.14e+100 155 | M,mg:-3.14e+100 156 | M,mq:-3.14e+100 157 | M,n:-3.14e+100 158 | M,ng:-3.14e+100 159 | M,nr:-3.14e+100 160 | M,nrfg:-3.14e+100 161 | M,nrt:-3.14e+100 162 | M,ns:-3.14e+100 163 | M,nt:-3.14e+100 164 | M,nz:-3.14e+100 165 | M,o:-3.14e+100 166 | M,p:-3.14e+100 167 | M,q:-3.14e+100 168 | M,qe:-3.14e+100 169 | M,qg:-3.14e+100 170 | M,r:-3.14e+100 171 | M,rg:-3.14e+100 172 | M,rr:-3.14e+100 173 | M,rz:-3.14e+100 174 | M,s:-3.14e+100 175 | M,t:-3.14e+100 176 | M,tg:-3.14e+100 177 | M,u:-3.14e+100 178 | M,ud:-3.14e+100 179 | M,ug:-3.14e+100 180 | M,uj:-3.14e+100 181 | M,ul:-3.14e+100 182 | M,uv:-3.14e+100 183 | M,uz:-3.14e+100 184 | M,v:-3.14e+100 185 | M,vd:-3.14e+100 186 | M,vg:-3.14e+100 187 | M,vi:-3.14e+100 188 | M,vn:-3.14e+100 189 | M,vq:-3.14e+100 190 | M,w:-3.14e+100 191 | M,x:-3.14e+100 192 | M,y:-3.14e+100 193 | M,yg:-3.14e+100 194 | M,z:-3.14e+100 195 | M,zg:-3.14e+100 196 | S,a:-3.90253968313 197 | S,ad:-11.0484584802 198 | S,ag:-6.95411391796 199 | S,an:-12.8402179494 200 | S,b:-6.47288876397 201 | S,bg:-3.14e+100 202 | S,c:-4.78696679586 203 | S,d:-3.90391976418 204 | S,df:-3.14e+100 205 | S,dg:-8.9483976513 206 | S,e:-5.94251300628 207 | S,en:-3.14e+100 208 | S,f:-5.19482024998 209 | S,g:-6.50782681533 210 | S,h:-8.65056320738 211 | S,i:-3.14e+100 212 | S,in:-3.14e+100 213 | S,j:-4.91199211964 214 | S,jn:-3.14e+100 215 | S,k:-6.94032059583 216 | S,l:-3.14e+100 217 | S,ln:-3.14e+100 218 | S,m:-3.26920065212 219 
| S,mg:-10.8253149289 220 | S,mq:-3.14e+100 221 | S,n:-3.85514838976 222 | S,ng:-4.9134348611 223 | S,nr:-4.48366310396 224 | S,nrfg:-3.14e+100 225 | S,nrt:-3.14e+100 226 | S,ns:-3.14e+100 227 | S,nt:-12.1470707689 228 | S,nz:-3.14e+100 229 | S,o:-8.46446092775 230 | S,p:-2.98684018136 231 | S,q:-4.88865861826 232 | S,qe:-3.14e+100 233 | S,qg:-3.14e+100 234 | S,r:-2.76353367841 235 | S,rg:-10.2752685919 236 | S,rr:-3.14e+100 237 | S,rz:-3.14e+100 238 | S,s:-3.14e+100 239 | S,t:-3.14e+100 240 | S,tg:-6.27284253188 241 | S,u:-6.94032059583 242 | S,ud:-7.72823016105 243 | S,ug:-7.53940370266 244 | S,uj:-6.85251045118 245 | S,ul:-8.41537131755 246 | S,uv:-8.15808672229 247 | S,uz:-9.29925862537 248 | S,v:-3.05329230341 249 | S,vd:-3.14e+100 250 | S,vg:-5.94301818437 251 | S,vi:-3.14e+100 252 | S,vn:-11.4539235883 253 | S,vq:-3.14e+100 254 | S,w:-3.14e+100 255 | S,x:-8.42741965607 256 | S,y:-6.19707946995 257 | S,yg:-13.53336513 258 | S,z:-3.14e+100 259 | S,zg:-3.14e+100 260 | -------------------------------------------------------------------------------- /resource/jieba/user.dict.utf8: -------------------------------------------------------------------------------- 1 | 云计算 2 | 韩玉鉴赏 3 | 蓝翔 nz 4 | 区块链 10 nz 5 | -------------------------------------------------------------------------------- /resource/pinyin/mandarin/user_dict.txt: -------------------------------------------------------------------------------- 1 | 陟罚臧否:zhi4 fa2 zang2 pi3 2 | 汤汤:shang1 shang1 3 | 到了:dao4 le1 4 | 脖颈:bo2 geng3 5 | 破的:po4 de5 6 | 重场:zhong4 chang3 7 | 很重:hen3 zhong4 8 | 跪地:gui4 di4 9 | 都会:dou1 hui4 10 | 乐都:le4 dou1 11 | 花都:hua1 dou1 12 | 中都:zhong1 dou1 13 | 上都:shang4 dou1 14 | 大都:da4 dou1 15 | 曾都:ceng2 dou1 16 | 陪都:pei2 dou1 17 | 京都:jing1 dou1 18 | 国都:guo2 dou1 19 | 成都:cheng2 du1 20 | 莞然:wan3 ran2 21 | 着边:zhe5 bian1 22 | 彷佛:fang3 fu2 23 | 得要:de2 yao4 24 | 吱吱:zhi1 zhi1 25 | 非得:fei1 de2 26 | 美的:mei3 de5 27 | 中的:zhong1 de5 28 | 席地:xi2 di4 29 | 一地:yi2 di4 30 | 之地:zhi1 di4 31 | 今朝:jin2 zhao1 32 | 
本色:ben3 se4 33 | 执著:zhi2 zhuo2 34 | 朝霞:zhao1 xia2 35 | 盛满:cheng2 man3 36 | 着眼:zhe5 yan3 37 | 着地:zhuo2 de5 38 | 泥地:ni2 di4 39 | 雪地:xue3 di4 40 | 地煞:di4 sha4 41 | 地久:di4 jiu3 42 | 地裂:di4 lie4 43 | 一宿:yi4 xiu3 44 | 一觉:yi2 jiao4 45 | 乐经:le4 jing1 46 | 将进酒:qiang1 jin4 jiu3 47 | 天和地:tian1 he2 di4 48 | 天塌地陷:tian1 ta1 di4 xian4 49 | 流血:liu2 xie3 50 | 纶巾:guan1 jin1 51 | 穿着:chuan1 zhe5 52 | 都没:dou1 mei2 53 | 都是:dou1 shi4 54 | 一行:yi4 hang2 55 | 一朝:yi4 zhao1 56 | 两行:liang3 hang2 57 | 面的:mian4 de5 58 | 没入:mo4 ru4 59 | 还重:hai2 zhong4 60 | 情重:qing2 zhong4 61 | 重色:zhong4 se4 62 | 澄清:cheng2 qing1 63 | 两行:liang3 hang2 64 | 几行:ji3 hang2 65 | 重头:chong2 tou2 66 | 好重:hao3 zhong4 67 | 狗血:gou3 xue4 68 | 屏住:bing3 zhu4 69 | 满地:man3 di4 70 | 彷佛:fang3 fu2 71 | 端的:duan1 de5 72 | 了了:liao3 le5 73 | 诗行:shi1 hang2 74 | 传来:chuan2 lai2 75 | 朝暮:zhao1 mu4 76 | 吞没了:tun1 mo4 le5 77 | 淹没了:yan1 mo4 le5 78 | 言重复:yan2 chong2 fu4 79 | 牵着手:qian2 zhe5 shou3 80 | 一行行:yi4 hang2 hang2 81 | 还愿意:hai2 yuan4 yi4 82 | 重感冒:zhong4 gan3 mao4 83 | 地之角:di4 zhi1 jiao3 84 | 相似的:xiang1 si4 de5 85 | 睡不着:shui4 bu4 zhao2 86 | 类似的:lei4 si4 de5 87 | 得知了:de2 zhi1 le5 88 | 没日没夜:mei2 ri4 mei2 ye4 89 | 天昏地暗:tian1 hun1 di4 an4 90 | 装模作样:zhuang1 mu2 zuo4 yang4 91 | 了无音讯:liao3 wu2 yin2 xun4 92 | 心事重重:xin1 shi4 chong2 chong2 93 | 重重关卡:chong2 chong2 guan1 qia3 94 | 情深意重:qing2 shen1 yi4 zhong4 95 | 了无牵挂:liao3 wu2 qian1 gua4 96 | 喜怒哀乐:xi3 nu4 ai1 le4 97 | 天南地北:tian1 nan2 di4 bei3 98 | 天翻地覆:tian1 fan1 di4 fu4 99 | 草长莺飞:cao3 zhang3 ying1 fei1 100 | 随随便便:sui2 sui2 bian4 bian4 101 | 想着想着:xiang3 zhe5 xiang3 zhe5 -------------------------------------------------------------------------------- /src/3rd_include/cpp-pinyin/CanTone.h: -------------------------------------------------------------------------------- 1 | #ifndef CANTONECONVERTER_H 2 | #define CANTONECONVERTER_H 3 | 4 | #include 5 | #include 6 | 7 | namespace Pinyin 8 | { 9 | class CPP_PINYIN_EXPORT CanTone final : public ToneConverter { 10 | public: 11 | enum Style { 12 | // 普通风格,不带声调。如: 
中国 -> ``zung gwok`` 13 | NORMAL = 0, 14 | // 声调风格3,即拼音声调在各个拼音之后,用数字 [1-4] 进行表示。如: 中国 -> ``zung1 gwok3`` 15 | TONE3 = 8 16 | }; 17 | 18 | CanTone() { 19 | m_converts.insert({static_cast(Style::NORMAL), tone3ToNormal}); 20 | }; 21 | ~CanTone() override = default; 22 | 23 | static std::u16string tone3ToNormal(const std::u16string &pinyin, bool v_to_u = false, 24 | bool neutral_tone_with_five = false); 25 | }; 26 | } // Pinyin 27 | 28 | #endif //CANTONECONVERTER_H 29 | -------------------------------------------------------------------------------- /src/3rd_include/cpp-pinyin/ChineseG2p.h: -------------------------------------------------------------------------------- 1 | #ifndef ChineseG2p_H 2 | #define ChineseG2p_H 3 | 4 | #include 5 | #include 6 | 7 | #include 8 | #include 9 | #include 10 | 11 | namespace Pinyin 12 | { 13 | enum CPP_PINYIN_EXPORT Error { 14 | // Keep original characters 15 | Default = 0, 16 | // Ignore this character (do not export) 17 | Ignore = 1 18 | }; 19 | 20 | class ChineseG2pPrivate; 21 | 22 | class CPP_PINYIN_EXPORT ChineseG2p { 23 | public: 24 | explicit ChineseG2p(const std::string &language); 25 | 26 | ~ChineseG2p(); 27 | 28 | bool initialized() const; 29 | 30 | bool loadUserDict(const std::filesystem::path &filePath) const; 31 | 32 | void setToneConverter(const ToneConverter &toneConverter) const; 33 | 34 | std::string tradToSim(const std::string &oneHanzi) const; 35 | 36 | bool isPolyphonic(const std::string &oneHanzi) const; 37 | 38 | protected: 39 | PinyinResVector hanziToPinyin(const std::string &hans, int style = 0, 40 | Error error = Default, bool candidates = true, bool v_to_u = false, 41 | bool neutral_tone_with_five = false) const; 42 | 43 | PinyinResVector hanziToPinyin(const std::vector &hans, 44 | int style = 0, Error error = Default, 45 | bool candidates = true, bool v_to_u = false, 46 | bool neutral_tone_with_five = false) const; 47 | 48 | std::vector getDefaultPinyin(const std::string &oneHanzi, int style = 0, bool v_to_u 
= false, 49 | bool neutral_tone_with_five = false) const; 50 | 51 | std::unique_ptr d_ptr; 52 | 53 | private: 54 | PinyinResVector hanziToPinyin(const std::vector &hans, 55 | int style = 0, Error error = Default, 56 | bool candidates = true, bool v_to_u = false, 57 | bool neutral_tone_with_five = false) const; 58 | PinyinResVector hanziToPinyin(const std::vector &hansList, int style = 0, 59 | Error error = Default, bool candidates = true, bool v_to_u = false, 60 | bool neutral_tone_with_five = false) const; 61 | }; 62 | } 63 | 64 | #endif // ChineseG2p_H 65 | -------------------------------------------------------------------------------- /src/3rd_include/cpp-pinyin/ChineseG2p_p.h: -------------------------------------------------------------------------------- 1 | #ifndef ChineseG2pPRIVATE_H 2 | #define ChineseG2pPRIVATE_H 3 | 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | #include "cpp-pinyin/U16Str.h" 10 | 11 | namespace Pinyin 12 | { 13 | class ChineseG2pPrivate final { 14 | public: 15 | explicit ChineseG2pPrivate(std::string language); 16 | ~ChineseG2pPrivate(); 17 | 18 | void init(); 19 | 20 | bool initialized = false; 21 | 22 | std::unordered_map phrases_map; 23 | std::unordered_map> phrases_dict; 24 | std::unordered_map> word_dict; 25 | std::unordered_map trans_dict; 26 | 27 | std::string m_language; 28 | ToneConverter m_toneConverter; 29 | 30 | inline bool isPolyphonic(const char16_t &oneHanzi) const { 31 | return phrases_map.find(oneHanzi) != phrases_map.end(); 32 | } 33 | 34 | inline char16_t tradToSim(const char16_t &oneHanzi) const { 35 | const auto &it = trans_dict.find(oneHanzi); 36 | return it != trans_dict.end() ? 
it->second : oneHanzi; 37 | } 38 | 39 | inline std::u16string toneConvert(const std::u16string &pinyin, int style, bool v_to_u = false, 40 | bool neutral_tone_with_five = false) const { 41 | return m_toneConverter.convert({pinyin.begin(), pinyin.end()}, style, v_to_u, neutral_tone_with_five); 42 | } 43 | 44 | inline std::vector toneConvert(const std::vector &pinyin, int style, 45 | bool v_to_u = false, 46 | bool neutral_tone_with_five = false) const { 47 | std::vector tonePinyin; 48 | tonePinyin.reserve(pinyin.size()); 49 | for (const std::u16string &p : pinyin) { 50 | tonePinyin.push_back(toneConvert(p, style, v_to_u, neutral_tone_with_five)); 51 | } 52 | return tonePinyin; 53 | } 54 | 55 | std::unordered_set toneSeen; 56 | std::vector toneCandidates; 57 | 58 | inline std::vector getDefaultPinyin(const char16_t &oneHanzi, int style = 0, 59 | bool v_to_u = false, 60 | bool neutral_tone_with_five = false) { 61 | const auto &it = word_dict.find(oneHanzi); 62 | if (it == word_dict.end()) 63 | return {u16strToUtf8str(oneHanzi)}; 64 | 65 | const std::vector &candidates = it->second; 66 | 67 | toneCandidates.clear(); 68 | toneSeen.clear(); 69 | 70 | for (const std::u16string &pinyin : candidates) { 71 | const auto &tarPinyin = u16strToUtf8str(toneConvert(pinyin, style, v_to_u, neutral_tone_with_five)); 72 | if (toneSeen.insert(tarPinyin).second) { 73 | toneCandidates.push_back(tarPinyin); 74 | } 75 | } 76 | 77 | if (toneCandidates.empty()) 78 | return {u16strToUtf8str(oneHanzi)}; 79 | return toneCandidates; 80 | } 81 | 82 | void zhPosition(const std::vector &input, std::vector &res, 83 | std::vector &positions); 84 | }; 85 | } 86 | 87 | #endif // ChineseG2pPRIVATE_H 88 | -------------------------------------------------------------------------------- /src/3rd_include/cpp-pinyin/DictUtil.h: -------------------------------------------------------------------------------- 1 | #ifndef DICTUTIL_H 2 | #define DICTUTIL_H 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | 
#include "ManToneUtil.h" 9 | 10 | namespace Pinyin 11 | { 12 | bool loadDict(const std::filesystem::path &dict_dir, 13 | std::unordered_map &resultMap, const char &sep1 = ':'); 14 | 15 | bool loadDict(const std::filesystem::path &dict_dir, 16 | std::unordered_map &resultMap, const char &sep1 = ':'); 17 | 18 | bool loadDict(const std::filesystem::path &dict_dir, 19 | std::unordered_map> &resultMap, const char &sep1 = ':', 20 | const std::string &sep2 = ","); 21 | 22 | bool loadDict(const std::filesystem::path &dict_dir, 23 | std::unordered_map> &resultMap, const char &sep1 = ':', 24 | const std::string &sep2 = ","); 25 | 26 | bool loadAdditionalDict(const std::filesystem::path &dict_dir, 27 | std::unordered_map> &resultMap, 28 | const char &sep1 = ':', 29 | const std::string &sep2 = " ", 30 | const std::function &converterForDefaultPinyin 31 | = tone3ToTone); 32 | } // Pinyin 33 | 34 | #endif //DICTUTIL_H 35 | -------------------------------------------------------------------------------- /src/3rd_include/cpp-pinyin/G2pglobal.h: -------------------------------------------------------------------------------- 1 | #ifndef G2PGLOBAL_H 2 | #define G2PGLOBAL_H 3 | 4 | #include 5 | 6 | #include 7 | 8 | namespace Pinyin 9 | { 10 | std::filesystem::path CPP_PINYIN_EXPORT dictionaryPath(); 11 | 12 | void CPP_PINYIN_EXPORT setDictionaryPath(const std::filesystem::path &dir); 13 | 14 | bool CPP_PINYIN_EXPORT isLetter(const char16_t &c); 15 | 16 | bool CPP_PINYIN_EXPORT isHanzi(const char16_t &c); 17 | 18 | bool CPP_PINYIN_EXPORT isKana(const char16_t &c); 19 | 20 | bool CPP_PINYIN_EXPORT isDigit(const char16_t &c); 21 | 22 | bool CPP_PINYIN_EXPORT isSpace(const char16_t &c); 23 | 24 | bool CPP_PINYIN_EXPORT isSpecialKana(const char16_t &c); 25 | } 26 | 27 | #endif // G2PGLOBAL_H 28 | -------------------------------------------------------------------------------- /src/3rd_include/cpp-pinyin/Jyutping.h: 
-------------------------------------------------------------------------------- 1 | #ifndef DATASET_TOOLS_CANTONESE_H 2 | #define DATASET_TOOLS_CANTONESE_H 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | namespace Pinyin 9 | { 10 | class CPP_PINYIN_EXPORT Jyutping final : public ChineseG2p { 11 | public: 12 | explicit Jyutping() : 13 | ChineseG2p("cantonese") { 14 | this->setToneConverter(m_toneConverter); 15 | } 16 | 17 | ~Jyutping() = default; 18 | 19 | PinyinResVector hanziToPinyin(const std::string &hans, 20 | CanTone::Style style = CanTone::Style::TONE3, 21 | Error error = Default, bool candidates = true) const; 22 | 23 | PinyinResVector hanziToPinyin(const std::vector &hans, 24 | CanTone::Style style = CanTone::Style::TONE3, 25 | Error error = Default, bool candidates = true) const; 26 | std::vector getDefaultPinyin(const std::string &hanzi, 27 | CanTone::Style style = CanTone::Style::TONE3) const; 28 | 29 | CanTone m_toneConverter; 30 | }; 31 | } 32 | #endif // DATASET_TOOLS_CANTONESE_H 33 | -------------------------------------------------------------------------------- /src/3rd_include/cpp-pinyin/ManTone.h: -------------------------------------------------------------------------------- 1 | #ifndef TONEUTIL_H 2 | #define TONEUTIL_H 3 | 4 | #include 5 | #include 6 | 7 | namespace Pinyin 8 | { 9 | class CPP_PINYIN_EXPORT ManTone final : public ToneConverter { 10 | public: 11 | // https://github.com/mozillazg/python-pinyin/blob/master/pypinyin/constants.py 12 | enum Style { 13 | // 普通风格,不带声调。如: 中国 -> ``zhong guo`` 14 | NORMAL = 0, 15 | // 标准声调风格,拼音声调在韵母第一个字母上(默认风格)。如: 中国 -> ``zhōng guó`` 16 | TONE = 1, 17 | // 声调风格2,即拼音声调在各个韵母之后,用数字 [1-4] 进行表示。如: 中国 -> ``zho1ng guo2`` 18 | TONE2 = 2, 19 | // 声调风格3,即拼音声调在各个拼音之后,用数字 [1-4] 进行表示。如: 中国 -> ``zhong1 guo2`` 20 | TONE3 = 8 21 | }; 22 | 23 | ManTone() { 24 | m_converts.insert({static_cast(Style::NORMAL), toneToNormal}); 25 | m_converts.insert({static_cast(Style::TONE), toneToTone}); 26 | 
m_converts.insert({static_cast(Style::TONE2), toneToTone2}); 27 | m_converts.insert({static_cast(Style::TONE3), toneToTone3}); 28 | }; 29 | ~ManTone() override = default; 30 | 31 | static std::u16string toneToNormal(const std::u16string &pinyin, bool v_to_u = false, 32 | bool neutral_tone_with_five = false); 33 | 34 | static std::u16string toneToTone(const std::u16string &pinyin, bool v_to_u = false, 35 | bool neutral_tone_with_five = false); 36 | 37 | static std::u16string toneToTone2(const std::u16string &pinyin, bool v_to_u = false, 38 | bool neutral_tone_with_five = false); 39 | 40 | static std::u16string toneToTone3(const std::u16string &pinyin, bool v_to_u = false, 41 | bool neutral_tone_with_five = false); 42 | }; 43 | 44 | 45 | } // Pinyin 46 | 47 | #endif //TONEUTIL_H 48 | -------------------------------------------------------------------------------- /src/3rd_include/cpp-pinyin/ManToneUtil.h: -------------------------------------------------------------------------------- 1 | #ifndef MANTONEUTIL_H 2 | #define MANTONEUTIL_H 3 | 4 | #include 5 | 6 | namespace Pinyin 7 | { 8 | std::u16string tone3ToTone(const std::u16string &pinyin); 9 | } // Pinyin 10 | 11 | #endif //MANTONEUTIL_H 12 | -------------------------------------------------------------------------------- /src/3rd_include/cpp-pinyin/Pinyin.h: -------------------------------------------------------------------------------- 1 | #ifndef DATASET_TOOLS_MANDARIN_H 2 | #define DATASET_TOOLS_MANDARIN_H 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | namespace Pinyin 9 | { 10 | class CPP_PINYIN_EXPORT Pinyin final : public ChineseG2p { 11 | public: 12 | explicit Pinyin() : 13 | ChineseG2p("mandarin") { 14 | this->setToneConverter(m_toneConverter); 15 | } 16 | 17 | ~Pinyin() = default; 18 | 19 | PinyinResVector hanziToPinyin(const std::string &hans, 20 | ManTone::Style style = ManTone::Style::TONE, 21 | Error error = Default, bool candidates = true, bool v_to_u = false, 22 | bool neutral_tone_with_five 
= false) const; 23 | 24 | PinyinResVector hanziToPinyin(const std::vector &hans, 25 | ManTone::Style style = ManTone::Style::TONE, 26 | Error error = Default, bool candidates = true, bool v_to_u = false, 27 | bool neutral_tone_with_five = false) const; 28 | 29 | std::vector getDefaultPinyin(const std::string &hanzi, 30 | ManTone::Style style = ManTone::Style::TONE, 31 | bool v_to_u = false, bool neutral_tone_with_five = false) const; 32 | 33 | ManTone m_toneConverter; 34 | }; 35 | } 36 | 37 | #endif // DATASET_TOOLS_MANDARIN_H 38 | -------------------------------------------------------------------------------- /src/3rd_include/cpp-pinyin/PinyinGlobal.h: -------------------------------------------------------------------------------- 1 | #ifndef PINYINGLOBAL_H 2 | #define PINYINGLOBAL_H 3 | 4 | #ifdef _MSC_VER 5 | # define CPP_PINYIN_DECL_EXPORT __declspec(dllexport) 6 | # define CPP_PINYIN_DECL_IMPORT __declspec(dllimport) 7 | #else 8 | # define CPP_PINYIN_DECL_EXPORT __attribute__((visibility("default"))) 9 | # define CPP_PINYIN_DECL_IMPORT __attribute__((visibility("default"))) 10 | #endif 11 | 12 | #ifndef CPP_PINYIN_EXPORT 13 | # ifdef CPP_PINYIN_STATIC 14 | # define CPP_PINYIN_EXPORT 15 | # else 16 | # ifdef CPP_PINYIN_LIBRARY 17 | # define CPP_PINYIN_EXPORT CPP_PINYIN_DECL_EXPORT 18 | # else 19 | # define CPP_PINYIN_EXPORT CPP_PINYIN_DECL_IMPORT 20 | # endif 21 | # endif 22 | #endif 23 | 24 | #endif //PINYINGLOBAL_H 25 | -------------------------------------------------------------------------------- /src/3rd_include/cpp-pinyin/PinyinRes.h: -------------------------------------------------------------------------------- 1 | #ifndef G2PRES_H 2 | #define G2PRES_H 3 | 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | namespace Pinyin 10 | { 11 | struct CPP_PINYIN_EXPORT PinyinRes { 12 | std::string hanzi; 13 | std::string pinyin; 14 | std::vector candidates; // Candidate pinyin of Polyphonic Characters. 15 | bool error = true; // Whether the conversion failed. 
16 | }; 17 | 18 | class CPP_PINYIN_EXPORT PinyinResVector : public std::vector { 19 | public: 20 | // Convert PinyinResVector to std::vector 21 | std::vector toStdVector() const; 22 | 23 | // Convert PinyinResVector to std::string with delimiter 24 | std::string toStdStr(const std::string &delimiter = " ") const; 25 | }; 26 | } 27 | #endif //G2PRES_H 28 | -------------------------------------------------------------------------------- /src/3rd_include/cpp-pinyin/ToFinal.hpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | using namespace std; 10 | 11 | // 编码转换工具 12 | wstring_convert> converter; 13 | 14 | // 韵母表 15 | const unordered_set _FINALS = { 16 | L"i", L"u", L"ü", L"a", L"ia", L"ua", L"o", L"uo", 17 | L"e", L"ie", L"üe", L"ai", L"uai", L"ei", L"uei", L"ao", 18 | L"iao", L"ou", L"iou", L"an", L"ian", L"uan", L"üan", L"en", 19 | L"in", L"uen", L"ün", L"ang", L"iang", L"uang", L"eng", L"ing", 20 | L"ueng", L"ong", L"iong", L"er", L"ê", 21 | }; 22 | 23 | // u -> ü 映射 24 | const map UV_MAP = { 25 | {L"u", L"ü"}, {L"ū", L"ǖ"}, {L"ú", L"ǘ"}, {L"ǔ", L"ǚ"}, {L"ù", L"ǜ"}}; 26 | const set U_TONES = {L"u", L"ū", L"ú", L"ǔ", L"ù"}; 27 | const set I_TONES = {L"i", L"ī", L"í", L"ǐ", L"ì"}; 28 | 29 | // iu -> iou 映射 30 | const map IU_MAP = { 31 | {L"iu", L"iou"}, {L"iū", L"ioū"}, {L"iú", L"ioú"}, {L"iǔ", L"ioǔ"}, {L"iù", L"ioù"}}; 32 | 33 | // ui -> uei 映射 34 | const map UI_MAP = { 35 | {L"ui", L"uei"}, {L"uī", L"ueī"}, {L"uí", L"ueí"}, {L"uǐ", L"ueǐ"}, {L"uì", L"ueì"}}; 36 | 37 | // un -> uen 映射 38 | const map UN_MAP = { 39 | {L"un", L"uen"}, {L"ūn", L"ūen"}, {L"ún", L"úen"}, {L"ǔn", L"ǔen"}, {L"ùn", L"ùen"}}; 40 | 41 | inline wstring convert_zero_consonant(const wstring& pinyin) { 42 | wstring raw = pinyin; 43 | if (!pinyin.empty() && pinyin[0] == L'y') { 44 | wstring no_y = pinyin.substr(1); 45 | if (no_y.empty()) return raw; 46 | 47 | 
wstring first(1, no_y[0]); 48 | if (U_TONES.count(first)) { 49 | wstring replaced = UV_MAP.at(first) + no_y.substr(1); 50 | if (_FINALS.count(replaced)) return replaced; 51 | } else if (I_TONES.count(first)) { 52 | if (_FINALS.count(no_y)) return no_y; 53 | } else { 54 | wstring new_py = L"i" + no_y; 55 | if (_FINALS.count(new_py)) return new_py; 56 | } 57 | return raw; 58 | } 59 | 60 | if (!pinyin.empty() && pinyin[0] == L'w') { 61 | wstring no_w = pinyin.substr(1); 62 | if (no_w.empty()) return raw; 63 | 64 | wstring first(1, no_w[0]); 65 | if (U_TONES.count(first)) { 66 | if (_FINALS.count(no_w)) return no_w; 67 | } else { 68 | wstring new_py = L"u" + no_w; 69 | if (_FINALS.count(new_py)) return new_py; 70 | } 71 | return raw; 72 | } 73 | return pinyin; 74 | } 75 | 76 | inline wstring convert_uv(const wstring& pinyin) { 77 | if (pinyin.length() < 2) return pinyin; 78 | 79 | wchar_t first = pinyin[0]; 80 | if (first != L'j' && first != L'q' && first != L'x') return pinyin; 81 | 82 | wstring second(1, pinyin[1]); 83 | if (UV_MAP.find(second) != UV_MAP.end()) { 84 | return wstring(1, first) + UV_MAP.at(second) + pinyin.substr(2); 85 | } 86 | return pinyin; 87 | } 88 | 89 | inline wstring convert_iou(const wstring& pinyin) { 90 | wregex re(LR"((\w+?)(i[ūúǔù]|iu)$)"); 91 | wsmatch match; 92 | if (regex_match(pinyin, match, re) && match.size() == 3) { 93 | wstring key = match[2].str(); 94 | if (IU_MAP.find(key) != IU_MAP.end()) { 95 | return match[1].str() + IU_MAP.at(key); 96 | } 97 | } 98 | return pinyin; 99 | } 100 | 101 | inline wstring convert_uei(const wstring& pinyin) { 102 | wregex re(LR"((\w+?)(u[īíǐì]|ui)$)"); 103 | wsmatch match; 104 | if (regex_match(pinyin, match, re) && match.size() == 3) { 105 | wstring key = match[2].str(); 106 | if (UI_MAP.find(key) != UI_MAP.end()) { 107 | return match[1].str() + UI_MAP.at(key); 108 | } 109 | } 110 | return pinyin; 111 | } 112 | 113 | inline wstring convert_uen(const wstring& pinyin) { 114 | wregex 
re(LR"(([a-z]+)(ǔn|un|ùn|ūn|ún)$)"); 115 | wsmatch match; 116 | if (regex_match(pinyin, match, re) && match.size() == 3) { 117 | wstring key = match[2].str(); 118 | if (UN_MAP.find(key) != UN_MAP.end()) { 119 | // leading letters captured by group 1, followed by the expanded "uen" form of the matched final 120 | return match[1].str() + UN_MAP.at(key); 121 | } 122 | } 123 | return pinyin; 124 | } 125 | 126 | inline wstring convert_finals(const wstring& pinyin) { 127 | wstring result = convert_zero_consonant(pinyin); 128 | result = convert_uv(result); 129 | result = convert_iou(result); 130 | result = convert_uei(result); 131 | result = convert_uen(result); 132 | return result; 133 | } 134 | 135 | // Public entry point: narrow string in, converted finals out as a narrow string 136 | inline string pinyin_to_finals(const string& pinyin) { 137 | wstring wide_pinyin = converter.from_bytes(pinyin); 138 | wstring result = convert_finals(wide_pinyin); 139 | return converter.to_bytes(result); 140 | } -------------------------------------------------------------------------------- /src/3rd_include/cpp-pinyin/ToneConverter.h: -------------------------------------------------------------------------------- 1 | #ifndef TUNEUTIL_H 2 | #define TUNEUTIL_H 3 | 4 | #include 5 | #include 6 | 7 | #include 8 | #include 9 | 10 | namespace Pinyin 11 | { 12 | class CPP_PINYIN_EXPORT ToneConverter { 13 | public: 14 | enum Style { 15 | // Normal style, without tone marks. e.g. 中国 -> ``zhong guo`` 16 | NORMAL = 0, 17 | // Standard tone style: the tone mark sits on the first letter of the final (default style). e.g. 中国 -> ``zhōng guó`` 18 | TONE = 1 19 | }; 20 | 21 | ToneConverter() {} 22 | virtual ~ToneConverter() {} 23 | 24 | std::u16string convert(std::u16string str, int style, bool v_to_u = false, bool neutral_tone_with_five = false) const; 25 | 26 | protected: 27 | std::unordered_map> 28 | m_converts; 29 | }; 30 | } 31 | 32 | #endif //TUNEUTIL_H 33 | -------------------------------------------------------------------------------- /src/3rd_include/cpp-pinyin/U16Str.h: -------------------------------------------------------------------------------- 1 | #ifndef U16STR_H 2 | #define U16STR_H 3 | 4 | 
#include 5 | 6 | #include 7 | 8 | namespace Pinyin 9 | { 10 | std::string CPP_PINYIN_EXPORT u16strToUtf8str(const char16_t &ch16); 11 | std::string CPP_PINYIN_EXPORT u16strToUtf8str(const std::u16string &u16str); 12 | std::u16string CPP_PINYIN_EXPORT utf8strToU16str(const std::string &utf8str); 13 | } 14 | #endif //U16STR_H 15 | -------------------------------------------------------------------------------- /src/3rd_include/cppjieba/DictTrie.hpp: -------------------------------------------------------------------------------- 1 | #ifndef CPPJIEBA_DICT_TRIE_HPP 2 | #define CPPJIEBA_DICT_TRIE_HPP 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include "limonp/StringUtil.hpp" 14 | #include "limonp/Logging.hpp" 15 | #include "Unicode.hpp" 16 | #include "Trie.hpp" 17 | 18 | namespace cppjieba { 19 | 20 | using namespace limonp; 21 | 22 | const double MIN_DOUBLE = -3.14e+100; 23 | const double MAX_DOUBLE = 3.14e+100; 24 | const size_t DICT_COLUMN_NUM = 3; 25 | const char* const UNKNOWN_TAG = ""; 26 | 27 | class DictTrie { 28 | public: 29 | enum UserWordWeightOption { 30 | WordWeightMin, 31 | WordWeightMedian, 32 | WordWeightMax, 33 | }; // enum UserWordWeightOption 34 | 35 | DictTrie(const string& dict_path, const string& user_dict_paths = "", UserWordWeightOption user_word_weight_opt = WordWeightMedian) { 36 | Init(dict_path, user_dict_paths, user_word_weight_opt); 37 | } 38 | 39 | ~DictTrie() { 40 | delete trie_; 41 | } 42 | 43 | bool InsertUserWord(const string& word, const string& tag = UNKNOWN_TAG) { 44 | DictUnit node_info; 45 | if (!MakeNodeInfo(node_info, word, user_word_default_weight_, tag)) { 46 | return false; 47 | } 48 | active_node_infos_.push_back(node_info); 49 | trie_->InsertNode(node_info.word, &active_node_infos_.back()); 50 | return true; 51 | } 52 | 53 | bool InsertUserWord(const string& word,int freq, const string& tag = UNKNOWN_TAG) { 54 | DictUnit node_info; 55 | 
double weight = freq ? log(1.0 * freq / freq_sum_) : user_word_default_weight_ ; 56 | if (!MakeNodeInfo(node_info, word, weight , tag)) { 57 | return false; 58 | } 59 | active_node_infos_.push_back(node_info); 60 | trie_->InsertNode(node_info.word, &active_node_infos_.back()); 61 | return true; 62 | } 63 | 64 | bool DeleteUserWord(const string& word, const string& tag = UNKNOWN_TAG) { 65 | DictUnit node_info; 66 | if (!MakeNodeInfo(node_info, word, user_word_default_weight_, tag)) { 67 | return false; 68 | } 69 | trie_->DeleteNode(node_info.word, &node_info); 70 | return true; 71 | } 72 | 73 | const DictUnit* Find(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) const { 74 | return trie_->Find(begin, end); 75 | } 76 | 77 | void Find(RuneStrArray::const_iterator begin, 78 | RuneStrArray::const_iterator end, 79 | vector&res, 80 | size_t max_word_len = MAX_WORD_LENGTH) const { 81 | trie_->Find(begin, end, res, max_word_len); 82 | } 83 | 84 | bool Find(const string& word) 85 | { 86 | const DictUnit *tmp = NULL; 87 | RuneStrArray runes; 88 | if (!DecodeUTF8RunesInString(word, runes)) 89 | { 90 | XLOG(ERROR) << "Decode failed."; 91 | } 92 | tmp = Find(runes.begin(), runes.end()); 93 | if (tmp == NULL) 94 | { 95 | return false; 96 | } 97 | else 98 | { 99 | return true; 100 | } 101 | } 102 | 103 | bool IsUserDictSingleChineseWord(const Rune& word) const { 104 | return IsIn(user_dict_single_chinese_word_, word); 105 | } 106 | 107 | double GetMinWeight() const { 108 | return min_weight_; 109 | } 110 | 111 | void InserUserDictNode(const string& line) { 112 | vector buf; 113 | DictUnit node_info; 114 | Split(line, buf, " "); 115 | if(buf.size() == 1){ 116 | MakeNodeInfo(node_info, 117 | buf[0], 118 | user_word_default_weight_, 119 | UNKNOWN_TAG); 120 | } else if (buf.size() == 2) { 121 | MakeNodeInfo(node_info, 122 | buf[0], 123 | user_word_default_weight_, 124 | buf[1]); 125 | } else if (buf.size() == 3) { 126 | int freq = atoi(buf[1].c_str()); 127 | 
assert(freq_sum_ > 0.0); 128 | double weight = log(1.0 * freq / freq_sum_); 129 | MakeNodeInfo(node_info, buf[0], weight, buf[2]); 130 | } 131 | static_node_infos_.push_back(node_info); 132 | if (node_info.word.size() == 1) { 133 | user_dict_single_chinese_word_.insert(node_info.word[0]); 134 | } 135 | } 136 | 137 | void LoadUserDict(const vector& buf) { 138 | for (size_t i = 0; i < buf.size(); i++) { 139 | InserUserDictNode(buf[i]); 140 | } 141 | } 142 | 143 | void LoadUserDict(const set& buf) { 144 | std::set::const_iterator iter; 145 | for (iter = buf.begin(); iter != buf.end(); iter++){ 146 | InserUserDictNode(*iter); 147 | } 148 | } 149 | 150 | void LoadUserDict(const string& filePaths) { 151 | vector files = limonp::Split(filePaths, "|;"); 152 | for (size_t i = 0; i < files.size(); i++) { 153 | ifstream ifs(files[i].c_str()); 154 | XCHECK(ifs.is_open()) << "open " << files[i] << " failed"; 155 | string line; 156 | 157 | while(getline(ifs, line)) { 158 | if (line.size() == 0) { 159 | continue; 160 | } 161 | InserUserDictNode(line); 162 | } 163 | } 164 | } 165 | 166 | 167 | private: 168 | void Init(const string& dict_path, const string& user_dict_paths, UserWordWeightOption user_word_weight_opt) { 169 | LoadDict(dict_path); 170 | freq_sum_ = CalcFreqSum(static_node_infos_); 171 | CalculateWeight(static_node_infos_, freq_sum_); 172 | SetStaticWordWeights(user_word_weight_opt); 173 | 174 | if (user_dict_paths.size()) { 175 | LoadUserDict(user_dict_paths); 176 | } 177 | Shrink(static_node_infos_); 178 | CreateTrie(static_node_infos_); 179 | } 180 | 181 | void CreateTrie(const vector& dictUnits) { 182 | assert(dictUnits.size()); 183 | vector words; 184 | vector valuePointers; 185 | for (size_t i = 0 ; i < dictUnits.size(); i ++) { 186 | words.push_back(dictUnits[i].word); 187 | valuePointers.push_back(&dictUnits[i]); 188 | } 189 | 190 | trie_ = new Trie(words, valuePointers); 191 | } 192 | 193 | 194 | 195 | 196 | bool MakeNodeInfo(DictUnit& node_info, 197 | const 
string& word, 198 | double weight, 199 | const string& tag) { 200 | if (!DecodeUTF8RunesInString(word, node_info.word)) { 201 | XLOG(ERROR) << "UTF-8 decode failed for dict word: " << word; 202 | return false; 203 | } 204 | node_info.weight = weight; 205 | node_info.tag = tag; 206 | return true; 207 | } 208 | 209 | void LoadDict(const string& filePath) { 210 | ifstream ifs(filePath.c_str()); 211 | XCHECK(ifs.is_open()) << "open " << filePath << " failed."; 212 | string line; 213 | vector buf; 214 | 215 | DictUnit node_info; 216 | while (getline(ifs, line)) { 217 | Split(line, buf, " "); 218 | XCHECK(buf.size() == DICT_COLUMN_NUM) << "split result illegal, line:" << line; 219 | MakeNodeInfo(node_info, 220 | buf[0], 221 | atof(buf[1].c_str()), 222 | buf[2]); 223 | static_node_infos_.push_back(node_info); 224 | } 225 | } 226 | 227 | static bool WeightCompare(const DictUnit& lhs, const DictUnit& rhs) { 228 | return lhs.weight < rhs.weight; 229 | } 230 | 231 | void SetStaticWordWeights(UserWordWeightOption option) { 232 | XCHECK(!static_node_infos_.empty()); 233 | vector x = static_node_infos_; 234 | sort(x.begin(), x.end(), WeightCompare); 235 | min_weight_ = x[0].weight; 236 | max_weight_ = x[x.size() - 1].weight; 237 | median_weight_ = x[x.size() / 2].weight; 238 | switch (option) { 239 | case WordWeightMin: 240 | user_word_default_weight_ = min_weight_; 241 | break; 242 | case WordWeightMedian: 243 | user_word_default_weight_ = median_weight_; 244 | break; 245 | default: 246 | user_word_default_weight_ = max_weight_; 247 | break; 248 | } 249 | } 250 | 251 | double CalcFreqSum(const vector& node_infos) const { 252 | double sum = 0.0; 253 | for (size_t i = 0; i < node_infos.size(); i++) { 254 | sum += node_infos[i].weight; 255 | } 256 | return sum; 257 | } 258 | 259 | void CalculateWeight(vector& node_infos, double sum) const { 260 | assert(sum > 0.0); 261 | for (size_t i = 0; i < node_infos.size(); i++) { 262 | DictUnit& node_info = node_infos[i]; 263 | 
assert(node_info.weight > 0.0); 264 | node_info.weight = log(double(node_info.weight)/sum); 265 | } 266 | } 267 | 268 | void Shrink(vector& units) const { 269 | vector(units.begin(), units.end()).swap(units); 270 | } 271 | 272 | vector static_node_infos_; 273 | deque active_node_infos_; // must not be vector 274 | Trie * trie_; 275 | 276 | double freq_sum_; 277 | double min_weight_; 278 | double max_weight_; 279 | double median_weight_; 280 | double user_word_default_weight_; 281 | unordered_set user_dict_single_chinese_word_; 282 | }; 283 | } 284 | 285 | #endif 286 | -------------------------------------------------------------------------------- /src/3rd_include/cppjieba/FullSegment.hpp: -------------------------------------------------------------------------------- 1 | #ifndef CPPJIEBA_FULLSEGMENT_H 2 | #define CPPJIEBA_FULLSEGMENT_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include "limonp/Logging.hpp" 8 | #include "DictTrie.hpp" 9 | #include "SegmentBase.hpp" 10 | #include "Unicode.hpp" 11 | 12 | namespace cppjieba { 13 | class FullSegment: public SegmentBase { 14 | public: 15 | FullSegment(const string& dictPath) { 16 | dictTrie_ = new DictTrie(dictPath); 17 | isNeedDestroy_ = true; 18 | } 19 | FullSegment(const DictTrie* dictTrie) 20 | : dictTrie_(dictTrie), isNeedDestroy_(false) { 21 | assert(dictTrie_); 22 | } 23 | ~FullSegment() { 24 | if (isNeedDestroy_) { 25 | delete dictTrie_; 26 | } 27 | } 28 | void Cut(const string& sentence, 29 | vector& words) const { 30 | vector tmp; 31 | Cut(sentence, tmp); 32 | GetStringsFromWords(tmp, words); 33 | } 34 | void Cut(const string& sentence, 35 | vector& words) const { 36 | PreFilter pre_filter(symbols_, sentence); 37 | PreFilter::Range range; 38 | vector wrs; 39 | wrs.reserve(sentence.size()/2); 40 | while (pre_filter.HasNext()) { 41 | range = pre_filter.Next(); 42 | Cut(range.begin, range.end, wrs); 43 | } 44 | words.clear(); 45 | words.reserve(wrs.size()); 46 | GetWordsFromWordRanges(sentence, wrs, words); 
47 | } 48 | void Cut(RuneStrArray::const_iterator begin, 49 | RuneStrArray::const_iterator end, 50 | vector& res) const { 51 | // result of searching in trie tree 52 | LocalVector > tRes; 53 | 54 | // max index of res's words 55 | size_t maxIdx = 0; 56 | 57 | // always equals to (uItr - begin) 58 | size_t uIdx = 0; 59 | 60 | // tmp variables 61 | size_t wordLen = 0; 62 | assert(dictTrie_); 63 | vector dags; 64 | dictTrie_->Find(begin, end, dags); 65 | for (size_t i = 0; i < dags.size(); i++) { 66 | for (size_t j = 0; j < dags[i].nexts.size(); j++) { 67 | size_t nextoffset = dags[i].nexts[j].first; 68 | assert(nextoffset < dags.size()); 69 | const DictUnit* du = dags[i].nexts[j].second; 70 | if (du == NULL) { 71 | if (dags[i].nexts.size() == 1 && maxIdx <= uIdx) { 72 | WordRange wr(begin + i, begin + nextoffset); 73 | res.push_back(wr); 74 | } 75 | } else { 76 | wordLen = du->word.size(); 77 | if (wordLen >= 2 || (dags[i].nexts.size() == 1 && maxIdx <= uIdx)) { 78 | WordRange wr(begin + i, begin + nextoffset); 79 | res.push_back(wr); 80 | } 81 | } 82 | maxIdx = uIdx + wordLen > maxIdx ? 
uIdx + wordLen : maxIdx; 83 | } 84 | uIdx++; 85 | } 86 | } 87 | private: 88 | const DictTrie* dictTrie_; 89 | bool isNeedDestroy_; 90 | }; 91 | } 92 | 93 | #endif 94 | -------------------------------------------------------------------------------- /src/3rd_include/cppjieba/HMMModel.hpp: -------------------------------------------------------------------------------- 1 | #ifndef CPPJIEBA_HMMMODEL_H 2 | #define CPPJIEBA_HMMMODEL_H 3 | 4 | #include "limonp/StringUtil.hpp" 5 | #include "Trie.hpp" 6 | 7 | namespace cppjieba { 8 | 9 | using namespace limonp; 10 | typedef unordered_map EmitProbMap; 11 | 12 | struct HMMModel { 13 | /* 14 | * STATUS: 15 | * 0: HMMModel::B, 1: HMMModel::E, 2: HMMModel::M, 3:HMMModel::S 16 | * */ 17 | enum {B = 0, E = 1, M = 2, S = 3, STATUS_SUM = 4}; 18 | 19 | HMMModel(const string& modelPath) { 20 | memset(startProb, 0, sizeof(startProb)); 21 | memset(transProb, 0, sizeof(transProb)); 22 | statMap[0] = 'B'; 23 | statMap[1] = 'E'; 24 | statMap[2] = 'M'; 25 | statMap[3] = 'S'; 26 | emitProbVec.push_back(&emitProbB); 27 | emitProbVec.push_back(&emitProbE); 28 | emitProbVec.push_back(&emitProbM); 29 | emitProbVec.push_back(&emitProbS); 30 | LoadModel(modelPath); 31 | } 32 | ~HMMModel() { 33 | } 34 | void LoadModel(const string& filePath) { 35 | ifstream ifile(filePath.c_str()); 36 | XCHECK(ifile.is_open()) << "open " << filePath << " failed"; 37 | string line; 38 | vector tmp; 39 | vector tmp2; 40 | //Load startProb 41 | XCHECK(GetLine(ifile, line)); 42 | Split(line, tmp, " "); 43 | XCHECK(tmp.size() == STATUS_SUM); 44 | for (size_t j = 0; j< tmp.size(); j++) { 45 | startProb[j] = atof(tmp[j].c_str()); 46 | } 47 | 48 | //Load transProb 49 | for (size_t i = 0; i < STATUS_SUM; i++) { 50 | XCHECK(GetLine(ifile, line)); 51 | Split(line, tmp, " "); 52 | XCHECK(tmp.size() == STATUS_SUM); 53 | for (size_t j =0; j < STATUS_SUM; j++) { 54 | transProb[i][j] = atof(tmp[j].c_str()); 55 | } 56 | } 57 | 58 | //Load emitProbB 59 | XCHECK(GetLine(ifile, 
line)); 60 | XCHECK(LoadEmitProb(line, emitProbB)); 61 | 62 | //Load emitProbE 63 | XCHECK(GetLine(ifile, line)); 64 | XCHECK(LoadEmitProb(line, emitProbE)); 65 | 66 | //Load emitProbM 67 | XCHECK(GetLine(ifile, line)); 68 | XCHECK(LoadEmitProb(line, emitProbM)); 69 | 70 | //Load emitProbS 71 | XCHECK(GetLine(ifile, line)); 72 | XCHECK(LoadEmitProb(line, emitProbS)); 73 | } 74 | double GetEmitProb(const EmitProbMap* ptMp, Rune key, 75 | double defVal)const { 76 | EmitProbMap::const_iterator cit = ptMp->find(key); 77 | if (cit == ptMp->end()) { 78 | return defVal; 79 | } 80 | return cit->second; 81 | } 82 | bool GetLine(ifstream& ifile, string& line) { 83 | while (getline(ifile, line)) { 84 | Trim(line); 85 | if (line.empty()) { 86 | continue; 87 | } 88 | if (StartsWith(line, "#")) { 89 | continue; 90 | } 91 | return true; 92 | } 93 | return false; 94 | } 95 | bool LoadEmitProb(const string& line, EmitProbMap& mp) { 96 | if (line.empty()) { 97 | return false; 98 | } 99 | vector tmp, tmp2; 100 | Unicode unicode; 101 | Split(line, tmp, ","); 102 | for (size_t i = 0; i < tmp.size(); i++) { 103 | Split(tmp[i], tmp2, ":"); 104 | if (2 != tmp2.size()) { 105 | XLOG(ERROR) << "emitProb illegal."; 106 | return false; 107 | } 108 | if (!DecodeUTF8RunesInString(tmp2[0], unicode) || unicode.size() != 1) { 109 | XLOG(ERROR) << "TransCode failed."; 110 | return false; 111 | } 112 | mp[unicode[0]] = atof(tmp2[1].c_str()); 113 | } 114 | return true; 115 | } 116 | 117 | char statMap[STATUS_SUM]; 118 | double startProb[STATUS_SUM]; 119 | double transProb[STATUS_SUM][STATUS_SUM]; 120 | EmitProbMap emitProbB; 121 | EmitProbMap emitProbE; 122 | EmitProbMap emitProbM; 123 | EmitProbMap emitProbS; 124 | vector emitProbVec; 125 | }; // struct HMMModel 126 | 127 | } // namespace cppjieba 128 | 129 | #endif 130 | -------------------------------------------------------------------------------- /src/3rd_include/cppjieba/HMMSegment.hpp: 
-------------------------------------------------------------------------------- 1 | #ifndef CPPJIBEA_HMMSEGMENT_H 2 | #define CPPJIBEA_HMMSEGMENT_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "HMMModel.hpp" 9 | #include "SegmentBase.hpp" 10 | 11 | namespace cppjieba { 12 | class HMMSegment: public SegmentBase { 13 | public: 14 | HMMSegment(const string& filePath) 15 | : model_(new HMMModel(filePath)), isNeedDestroy_(true) { 16 | } 17 | HMMSegment(const HMMModel* model) 18 | : model_(model), isNeedDestroy_(false) { 19 | } 20 | ~HMMSegment() { 21 | if (isNeedDestroy_) { 22 | delete model_; 23 | } 24 | } 25 | 26 | void Cut(const string& sentence, 27 | vector& words) const { 28 | vector tmp; 29 | Cut(sentence, tmp); 30 | GetStringsFromWords(tmp, words); 31 | } 32 | void Cut(const string& sentence, 33 | vector& words) const { 34 | PreFilter pre_filter(symbols_, sentence); 35 | PreFilter::Range range; 36 | vector wrs; 37 | wrs.reserve(sentence.size()/2); 38 | while (pre_filter.HasNext()) { 39 | range = pre_filter.Next(); 40 | Cut(range.begin, range.end, wrs); 41 | } 42 | words.clear(); 43 | words.reserve(wrs.size()); 44 | GetWordsFromWordRanges(sentence, wrs, words); 45 | } 46 | void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector& res) const { 47 | RuneStrArray::const_iterator left = begin; 48 | RuneStrArray::const_iterator right = begin; 49 | while (right != end) { 50 | if (right->rune < 0x80) { 51 | if (left != right) { 52 | InternalCut(left, right, res); 53 | } 54 | left = right; 55 | do { 56 | right = SequentialLetterRule(left, end); 57 | if (right != left) { 58 | break; 59 | } 60 | right = NumbersRule(left, end); 61 | if (right != left) { 62 | break; 63 | } 64 | right ++; 65 | } while (false); 66 | WordRange wr(left, right - 1); 67 | res.push_back(wr); 68 | left = right; 69 | } else { 70 | right++; 71 | } 72 | } 73 | if (left != right) { 74 | InternalCut(left, right, res); 75 | } 76 | } 77 | private: 78 | 
// sequential letters rule 79 | RuneStrArray::const_iterator SequentialLetterRule(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) const { 80 | Rune x = begin->rune; 81 | if (('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z')) { 82 | begin ++; 83 | } else { 84 | return begin; 85 | } 86 | while (begin != end) { 87 | x = begin->rune; 88 | if (('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z') || ('0' <= x && x <= '9')) { 89 | begin ++; 90 | } else { 91 | break; 92 | } 93 | } 94 | return begin; 95 | } 96 | // 97 | RuneStrArray::const_iterator NumbersRule(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) const { 98 | Rune x = begin->rune; 99 | if ('0' <= x && x <= '9') { 100 | begin ++; 101 | } else { 102 | return begin; 103 | } 104 | while (begin != end) { 105 | x = begin->rune; 106 | if ( ('0' <= x && x <= '9') || x == '.') { 107 | begin++; 108 | } else { 109 | break; 110 | } 111 | } 112 | return begin; 113 | } 114 | void InternalCut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector& res) const { 115 | vector status; 116 | Viterbi(begin, end, status); 117 | 118 | RuneStrArray::const_iterator left = begin; 119 | RuneStrArray::const_iterator right; 120 | for (size_t i = 0; i < status.size(); i++) { 121 | if (status[i] % 2) { //if (HMMModel::E == status[i] || HMMModel::S == status[i]) 122 | right = begin + i + 1; 123 | WordRange wr(left, right - 1); 124 | res.push_back(wr); 125 | left = right; 126 | } 127 | } 128 | } 129 | 130 | void Viterbi(RuneStrArray::const_iterator begin, 131 | RuneStrArray::const_iterator end, 132 | vector& status) const { 133 | size_t Y = HMMModel::STATUS_SUM; 134 | size_t X = end - begin; 135 | 136 | size_t XYSize = X * Y; 137 | size_t now, old, stat; 138 | double tmp, endE, endS; 139 | 140 | vector path(XYSize); 141 | vector weight(XYSize); 142 | 143 | //start 144 | for (size_t y = 0; y < Y; y++) { 145 | weight[0 + y * X] = model_->startProb[y] + 
model_->GetEmitProb(model_->emitProbVec[y], begin->rune, MIN_DOUBLE); 146 | path[0 + y * X] = -1; 147 | } 148 | 149 | double emitProb; 150 | 151 | for (size_t x = 1; x < X; x++) { 152 | for (size_t y = 0; y < Y; y++) { 153 | now = x + y*X; 154 | weight[now] = MIN_DOUBLE; 155 | path[now] = HMMModel::E; // warning 156 | emitProb = model_->GetEmitProb(model_->emitProbVec[y], (begin+x)->rune, MIN_DOUBLE); 157 | for (size_t preY = 0; preY < Y; preY++) { 158 | old = x - 1 + preY * X; 159 | tmp = weight[old] + model_->transProb[preY][y] + emitProb; 160 | if (tmp > weight[now]) { 161 | weight[now] = tmp; 162 | path[now] = preY; 163 | } 164 | } 165 | } 166 | } 167 | 168 | endE = weight[X-1+HMMModel::E*X]; 169 | endS = weight[X-1+HMMModel::S*X]; 170 | stat = 0; 171 | if (endE >= endS) { 172 | stat = HMMModel::E; 173 | } else { 174 | stat = HMMModel::S; 175 | } 176 | 177 | status.resize(X); 178 | for (int x = X -1 ; x >= 0; x--) { 179 | status[x] = stat; 180 | stat = path[x + stat*X]; 181 | } 182 | } 183 | 184 | const HMMModel* model_; 185 | bool isNeedDestroy_; 186 | }; // class HMMSegment 187 | 188 | } // namespace cppjieba 189 | 190 | #endif 191 | -------------------------------------------------------------------------------- /src/3rd_include/cppjieba/Jieba.hpp: -------------------------------------------------------------------------------- 1 | #ifndef CPPJIEAB_JIEBA_H 2 | #define CPPJIEAB_JIEBA_H 3 | 4 | #include "QuerySegment.hpp" 5 | #include "KeywordExtractor.hpp" 6 | 7 | namespace cppjieba { 8 | 9 | class Jieba { 10 | public: 11 | Jieba(const string& dict_path = "", 12 | const string& model_path = "", 13 | const string& user_dict_path = "", 14 | const string& idf_path = "", 15 | const string& stop_word_path = "") 16 | : dict_trie_(getPath(dict_path, "jieba.dict.utf8"), getPath(user_dict_path, "user.dict.utf8")), 17 | model_(getPath(model_path, "hmm_model.utf8")), 18 | mp_seg_(&dict_trie_), 19 | hmm_seg_(&model_), 20 | mix_seg_(&dict_trie_, &model_), 21 | 
full_seg_(&dict_trie_), 22 | query_seg_(&dict_trie_, &model_), 23 | extractor(&dict_trie_, &model_, 24 | getPath(idf_path, "idf.utf8"), 25 | getPath(stop_word_path, "stop_words.utf8")) { 26 | } 27 | ~Jieba() { 28 | } 29 | 30 | struct LocWord { 31 | string word; 32 | size_t begin; 33 | size_t end; 34 | }; // struct LocWord 35 | 36 | void Cut(const string& sentence, vector& words, bool hmm = true) const { 37 | mix_seg_.Cut(sentence, words, hmm); 38 | } 39 | void Cut(const string& sentence, vector& words, bool hmm = true) const { 40 | mix_seg_.Cut(sentence, words, hmm); 41 | } 42 | void CutAll(const string& sentence, vector& words) const { 43 | full_seg_.Cut(sentence, words); 44 | } 45 | void CutAll(const string& sentence, vector& words) const { 46 | full_seg_.Cut(sentence, words); 47 | } 48 | void CutForSearch(const string& sentence, vector& words, bool hmm = true) const { 49 | query_seg_.Cut(sentence, words, hmm); 50 | } 51 | void CutForSearch(const string& sentence, vector& words, bool hmm = true) const { 52 | query_seg_.Cut(sentence, words, hmm); 53 | } 54 | void CutHMM(const string& sentence, vector& words) const { 55 | hmm_seg_.Cut(sentence, words); 56 | } 57 | void CutHMM(const string& sentence, vector& words) const { 58 | hmm_seg_.Cut(sentence, words); 59 | } 60 | void CutSmall(const string& sentence, vector& words, size_t max_word_len) const { 61 | mp_seg_.Cut(sentence, words, max_word_len); 62 | } 63 | void CutSmall(const string& sentence, vector& words, size_t max_word_len) const { 64 | mp_seg_.Cut(sentence, words, max_word_len); 65 | } 66 | 67 | void Tag(const string& sentence, vector >& words) const { 68 | mix_seg_.Tag(sentence, words); 69 | } 70 | string LookupTag(const string &str) const { 71 | return mix_seg_.LookupTag(str); 72 | } 73 | bool InsertUserWord(const string& word, const string& tag = UNKNOWN_TAG) { 74 | return dict_trie_.InsertUserWord(word, tag); 75 | } 76 | 77 | bool InsertUserWord(const string& word,int freq, const string& tag = 
UNKNOWN_TAG) { 78 | return dict_trie_.InsertUserWord(word,freq, tag); 79 | } 80 | 81 | bool DeleteUserWord(const string& word, const string& tag = UNKNOWN_TAG) { 82 | return dict_trie_.DeleteUserWord(word, tag); 83 | } 84 | 85 | bool Find(const string& word) 86 | { 87 | return dict_trie_.Find(word); 88 | } 89 | 90 | void ResetSeparators(const string& s) { 91 | //TODO 92 | mp_seg_.ResetSeparators(s); 93 | hmm_seg_.ResetSeparators(s); 94 | mix_seg_.ResetSeparators(s); 95 | full_seg_.ResetSeparators(s); 96 | query_seg_.ResetSeparators(s); 97 | } 98 | 99 | const DictTrie* GetDictTrie() const { 100 | return &dict_trie_; 101 | } 102 | 103 | const HMMModel* GetHMMModel() const { 104 | return &model_; 105 | } 106 | 107 | void LoadUserDict(const vector& buf) { 108 | dict_trie_.LoadUserDict(buf); 109 | } 110 | 111 | void LoadUserDict(const set& buf) { 112 | dict_trie_.LoadUserDict(buf); 113 | } 114 | 115 | void LoadUserDict(const string& path) { 116 | dict_trie_.LoadUserDict(path); 117 | } 118 | 119 | private: 120 | static string pathJoin(const string& dir, const string& filename) { 121 | if (dir.empty()) { 122 | return filename; 123 | } 124 | 125 | char last_char = dir[dir.length() - 1]; 126 | if (last_char == '/' || last_char == '\\') { 127 | return dir + filename; 128 | } else { 129 | #ifdef _WIN32 130 | return dir + '\\' + filename; 131 | #else 132 | return dir + '/' + filename; 133 | #endif 134 | } 135 | } 136 | 137 | static string getCurrentDirectory() { 138 | string path(__FILE__); 139 | size_t pos = path.find_last_of("/\\"); 140 | return (pos == string::npos) ? 
"" : path.substr(0, pos); 141 | } 142 | 143 | static string getPath(const string& path, const string& default_file) { 144 | if (path.empty()) { 145 | string current_dir = getCurrentDirectory(); 146 | string parent_dir = current_dir.substr(0, current_dir.find_last_of("/\\")); 147 | string grandparent_dir = parent_dir.substr(0, parent_dir.find_last_of("/\\")); 148 | return pathJoin(pathJoin(grandparent_dir, "dict"), default_file); 149 | } 150 | return path; 151 | } 152 | 153 | DictTrie dict_trie_; 154 | HMMModel model_; 155 | 156 | // They share the same dict trie and model 157 | MPSegment mp_seg_; 158 | HMMSegment hmm_seg_; 159 | MixSegment mix_seg_; 160 | FullSegment full_seg_; 161 | QuerySegment query_seg_; 162 | 163 | public: 164 | KeywordExtractor extractor; 165 | }; // class Jieba 166 | 167 | } // namespace cppjieba 168 | 169 | #endif // CPPJIEAB_JIEBA_H 170 | -------------------------------------------------------------------------------- /src/3rd_include/cppjieba/KeywordExtractor.hpp: -------------------------------------------------------------------------------- 1 | #ifndef CPPJIEBA_KEYWORD_EXTRACTOR_H 2 | #define CPPJIEBA_KEYWORD_EXTRACTOR_H 3 | 4 | #include 5 | #include 6 | #include "MixSegment.hpp" 7 | 8 | namespace cppjieba { 9 | 10 | using namespace limonp; 11 | using namespace std; 12 | 13 | /*utf8*/ 14 | class KeywordExtractor { 15 | public: 16 | struct Word { 17 | string word; 18 | vector offsets; 19 | double weight; 20 | }; // struct Word 21 | 22 | KeywordExtractor(const string& dictPath, 23 | const string& hmmFilePath, 24 | const string& idfPath, 25 | const string& stopWordPath, 26 | const string& userDict = "") 27 | : segment_(dictPath, hmmFilePath, userDict) { 28 | LoadIdfDict(idfPath); 29 | LoadStopWordDict(stopWordPath); 30 | } 31 | KeywordExtractor(const DictTrie* dictTrie, 32 | const HMMModel* model, 33 | const string& idfPath, 34 | const string& stopWordPath) 35 | : segment_(dictTrie, model) { 36 | LoadIdfDict(idfPath); 37 | 
LoadStopWordDict(stopWordPath); 38 | } 39 | ~KeywordExtractor() { 40 | } 41 | 42 | void Extract(const string& sentence, vector& keywords, size_t topN) const { 43 | vector topWords; 44 | Extract(sentence, topWords, topN); 45 | for (size_t i = 0; i < topWords.size(); i++) { 46 | keywords.push_back(topWords[i].word); 47 | } 48 | } 49 | 50 | void Extract(const string& sentence, vector >& keywords, size_t topN) const { 51 | vector topWords; 52 | Extract(sentence, topWords, topN); 53 | for (size_t i = 0; i < topWords.size(); i++) { 54 | keywords.push_back(pair(topWords[i].word, topWords[i].weight)); 55 | } 56 | } 57 | 58 | void Extract(const string& sentence, vector& keywords, size_t topN) const { 59 | vector words; 60 | segment_.Cut(sentence, words); 61 | 62 | map wordmap; 63 | size_t offset = 0; 64 | for (size_t i = 0; i < words.size(); ++i) { 65 | size_t t = offset; 66 | offset += words[i].size(); 67 | if (IsSingleWord(words[i]) || stopWords_.find(words[i]) != stopWords_.end()) { 68 | continue; 69 | } 70 | wordmap[words[i]].offsets.push_back(t); 71 | wordmap[words[i]].weight += 1.0; 72 | } 73 | if (offset != sentence.size()) { 74 | XLOG(ERROR) << "words illegal"; 75 | return; 76 | } 77 | 78 | keywords.clear(); 79 | keywords.reserve(wordmap.size()); 80 | for (map::iterator itr = wordmap.begin(); itr != wordmap.end(); ++itr) { 81 | unordered_map::const_iterator cit = idfMap_.find(itr->first); 82 | if (cit != idfMap_.end()) { 83 | itr->second.weight *= cit->second; 84 | } else { 85 | itr->second.weight *= idfAverage_; 86 | } 87 | itr->second.word = itr->first; 88 | keywords.push_back(itr->second); 89 | } 90 | topN = min(topN, keywords.size()); 91 | partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), Compare); 92 | keywords.resize(topN); 93 | } 94 | private: 95 | void LoadIdfDict(const string& idfPath) { 96 | ifstream ifs(idfPath.c_str()); 97 | XCHECK(ifs.is_open()) << "open " << idfPath << " failed"; 98 | string line ; 99 | vector buf; 100 | double 
idf = 0.0; 101 | double idfSum = 0.0; 102 | size_t lineno = 0; 103 | for (; getline(ifs, line); lineno++) { 104 | buf.clear(); 105 | if (line.empty()) { 106 | XLOG(ERROR) << "lineno: " << lineno << " empty. skipped."; 107 | continue; 108 | } 109 | Split(line, buf, " "); 110 | if (buf.size() != 2) { 111 | XLOG(ERROR) << "line: " << line << ", lineno: " << lineno << " empty. skipped."; 112 | continue; 113 | } 114 | idf = atof(buf[1].c_str()); 115 | idfMap_[buf[0]] = idf; 116 | idfSum += idf; 117 | 118 | } 119 | 120 | assert(lineno); 121 | idfAverage_ = idfSum / lineno; 122 | assert(idfAverage_ > 0.0); 123 | } 124 | void LoadStopWordDict(const string& filePath) { 125 | ifstream ifs(filePath.c_str()); 126 | XCHECK(ifs.is_open()) << "open " << filePath << " failed"; 127 | string line ; 128 | while (getline(ifs, line)) { 129 | stopWords_.insert(line); 130 | } 131 | assert(stopWords_.size()); 132 | } 133 | 134 | static bool Compare(const Word& lhs, const Word& rhs) { 135 | return lhs.weight > rhs.weight; 136 | } 137 | 138 | MixSegment segment_; 139 | unordered_map idfMap_; 140 | double idfAverage_; 141 | 142 | unordered_set stopWords_; 143 | }; // class KeywordExtractor 144 | 145 | inline ostream& operator << (ostream& os, const KeywordExtractor::Word& word) { 146 | return os << "{\"word\": \"" << word.word << "\", \"offset\": " << word.offsets << ", \"weight\": " << word.weight << "}"; 147 | } 148 | 149 | } // namespace cppjieba 150 | 151 | #endif 152 | 153 | 154 | -------------------------------------------------------------------------------- /src/3rd_include/cppjieba/MPSegment.hpp: -------------------------------------------------------------------------------- 1 | #ifndef CPPJIEBA_MPSEGMENT_H 2 | #define CPPJIEBA_MPSEGMENT_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include "limonp/Logging.hpp" 8 | #include "DictTrie.hpp" 9 | #include "SegmentTagged.hpp" 10 | #include "PosTagger.hpp" 11 | 12 | namespace cppjieba { 13 | 14 | class MPSegment: public SegmentTagged 
{ 15 | public: 16 | MPSegment(const string& dictPath, const string& userDictPath = "") 17 | : dictTrie_(new DictTrie(dictPath, userDictPath)), isNeedDestroy_(true) { 18 | } 19 | MPSegment(const DictTrie* dictTrie) 20 | : dictTrie_(dictTrie), isNeedDestroy_(false) { 21 | assert(dictTrie_); 22 | } 23 | ~MPSegment() { 24 | if (isNeedDestroy_) { 25 | delete dictTrie_; 26 | } 27 | } 28 | 29 | void Cut(const string& sentence, vector& words) const { 30 | Cut(sentence, words, MAX_WORD_LENGTH); 31 | } 32 | 33 | void Cut(const string& sentence, 34 | vector& words, 35 | size_t max_word_len) const { 36 | vector tmp; 37 | Cut(sentence, tmp, max_word_len); 38 | GetStringsFromWords(tmp, words); 39 | } 40 | void Cut(const string& sentence, 41 | vector& words, 42 | size_t max_word_len = MAX_WORD_LENGTH) const { 43 | PreFilter pre_filter(symbols_, sentence); 44 | PreFilter::Range range; 45 | vector wrs; 46 | wrs.reserve(sentence.size()/2); 47 | while (pre_filter.HasNext()) { 48 | range = pre_filter.Next(); 49 | Cut(range.begin, range.end, wrs, max_word_len); 50 | } 51 | words.clear(); 52 | words.reserve(wrs.size()); 53 | GetWordsFromWordRanges(sentence, wrs, words); 54 | } 55 | void Cut(RuneStrArray::const_iterator begin, 56 | RuneStrArray::const_iterator end, 57 | vector& words, 58 | size_t max_word_len = MAX_WORD_LENGTH) const { 59 | vector dags; 60 | dictTrie_->Find(begin, 61 | end, 62 | dags, 63 | max_word_len); 64 | CalcDP(dags); 65 | CutByDag(begin, end, dags, words); 66 | } 67 | 68 | const DictTrie* GetDictTrie() const { 69 | return dictTrie_; 70 | } 71 | 72 | bool Tag(const string& src, vector >& res) const { 73 | return tagger_.Tag(src, res, *this); 74 | } 75 | 76 | bool IsUserDictSingleChineseWord(const Rune& value) const { 77 | return dictTrie_->IsUserDictSingleChineseWord(value); 78 | } 79 | private: 80 | void CalcDP(vector& dags) const { 81 | size_t nextPos; 82 | const DictUnit* p; 83 | double val; 84 | 85 | for (vector::reverse_iterator rit = dags.rbegin(); rit != 
dags.rend(); rit++) { 86 | rit->pInfo = NULL; 87 | rit->weight = MIN_DOUBLE; 88 | assert(!rit->nexts.empty()); 89 | for (LocalVector >::const_iterator it = rit->nexts.begin(); it != rit->nexts.end(); it++) { 90 | nextPos = it->first; 91 | p = it->second; 92 | val = 0.0; 93 | if (nextPos + 1 < dags.size()) { 94 | val += dags[nextPos + 1].weight; 95 | } 96 | 97 | if (p) { 98 | val += p->weight; 99 | } else { 100 | val += dictTrie_->GetMinWeight(); 101 | } 102 | if (val > rit->weight) { 103 | rit->pInfo = p; 104 | rit->weight = val; 105 | } 106 | } 107 | } 108 | } 109 | void CutByDag(RuneStrArray::const_iterator begin, 110 | RuneStrArray::const_iterator end, 111 | const vector& dags, 112 | vector& words) const { 113 | size_t i = 0; 114 | while (i < dags.size()) { 115 | const DictUnit* p = dags[i].pInfo; 116 | if (p) { 117 | assert(p->word.size() >= 1); 118 | WordRange wr(begin + i, begin + i + p->word.size() - 1); 119 | words.push_back(wr); 120 | i += p->word.size(); 121 | } else { //single chinese word 122 | WordRange wr(begin + i, begin + i); 123 | words.push_back(wr); 124 | i++; 125 | } 126 | } 127 | } 128 | 129 | const DictTrie* dictTrie_; 130 | bool isNeedDestroy_; 131 | PosTagger tagger_; 132 | 133 | }; // class MPSegment 134 | 135 | } // namespace cppjieba 136 | 137 | #endif 138 | -------------------------------------------------------------------------------- /src/3rd_include/cppjieba/MixSegment.hpp: -------------------------------------------------------------------------------- 1 | #ifndef CPPJIEBA_MIXSEGMENT_H 2 | #define CPPJIEBA_MIXSEGMENT_H 3 | 4 | #include 5 | #include "MPSegment.hpp" 6 | #include "HMMSegment.hpp" 7 | #include "limonp/StringUtil.hpp" 8 | #include "PosTagger.hpp" 9 | 10 | namespace cppjieba { 11 | class MixSegment: public SegmentTagged { 12 | public: 13 | MixSegment(const string& mpSegDict, const string& hmmSegDict, 14 | const string& userDict = "") 15 | : mpSeg_(mpSegDict, userDict), 16 | hmmSeg_(hmmSegDict) { 17 | } 18 | 
MixSegment(const DictTrie* dictTrie, const HMMModel* model) 19 | : mpSeg_(dictTrie), hmmSeg_(model) { 20 | } 21 | ~MixSegment() { 22 | } 23 | 24 | void Cut(const string& sentence, vector& words) const { 25 | Cut(sentence, words, true); 26 | } 27 | void Cut(const string& sentence, vector& words, bool hmm) const { 28 | vector tmp; 29 | Cut(sentence, tmp, hmm); 30 | GetStringsFromWords(tmp, words); 31 | } 32 | void Cut(const string& sentence, vector& words, bool hmm = true) const { 33 | PreFilter pre_filter(symbols_, sentence); 34 | PreFilter::Range range; 35 | vector wrs; 36 | wrs.reserve(sentence.size() / 2); 37 | while (pre_filter.HasNext()) { 38 | range = pre_filter.Next(); 39 | Cut(range.begin, range.end, wrs, hmm); 40 | } 41 | words.clear(); 42 | words.reserve(wrs.size()); 43 | GetWordsFromWordRanges(sentence, wrs, words); 44 | } 45 | 46 | void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector& res, bool hmm) const { 47 | if (!hmm) { 48 | mpSeg_.Cut(begin, end, res); 49 | return; 50 | } 51 | vector words; 52 | assert(end >= begin); 53 | words.reserve(end - begin); 54 | mpSeg_.Cut(begin, end, words); 55 | 56 | vector hmmRes; 57 | hmmRes.reserve(end - begin); 58 | for (size_t i = 0; i < words.size(); i++) { 59 | //if mp Get a word, it's ok, put it into result 60 | if (words[i].left != words[i].right || (words[i].left == words[i].right && mpSeg_.IsUserDictSingleChineseWord(words[i].left->rune))) { 61 | res.push_back(words[i]); 62 | continue; 63 | } 64 | 65 | // if mp Get a single one and it is not in userdict, collect it in sequence 66 | size_t j = i; 67 | while (j < words.size() && words[j].left == words[j].right && !mpSeg_.IsUserDictSingleChineseWord(words[j].left->rune)) { 68 | j++; 69 | } 70 | 71 | // Cut the sequence with hmm 72 | assert(j - 1 >= i); 73 | // TODO 74 | hmmSeg_.Cut(words[i].left, words[j - 1].left + 1, hmmRes); 75 | //put hmm result to result 76 | for (size_t k = 0; k < hmmRes.size(); k++) { 77 | 
res.push_back(hmmRes[k]); 78 | } 79 | 80 | //clear tmp vars 81 | hmmRes.clear(); 82 | 83 | //let i jump over this piece 84 | i = j - 1; 85 | } 86 | } 87 | 88 | const DictTrie* GetDictTrie() const { 89 | return mpSeg_.GetDictTrie(); 90 | } 91 | 92 | bool Tag(const string& src, vector >& res) const { 93 | return tagger_.Tag(src, res, *this); 94 | } 95 | 96 | string LookupTag(const string &str) const { 97 | return tagger_.LookupTag(str, *this); 98 | } 99 | 100 | private: 101 | MPSegment mpSeg_; 102 | HMMSegment hmmSeg_; 103 | PosTagger tagger_; 104 | 105 | }; // class MixSegment 106 | 107 | } // namespace cppjieba 108 | 109 | #endif 110 | -------------------------------------------------------------------------------- /src/3rd_include/cppjieba/PosTagger.hpp: -------------------------------------------------------------------------------- 1 | #ifndef CPPJIEBA_POS_TAGGING_H 2 | #define CPPJIEBA_POS_TAGGING_H 3 | 4 | #include "limonp/StringUtil.hpp" 5 | #include "SegmentTagged.hpp" 6 | #include "DictTrie.hpp" 7 | 8 | namespace cppjieba { 9 | using namespace limonp; 10 | 11 | static const char* const POS_M = "m"; 12 | static const char* const POS_ENG = "eng"; 13 | static const char* const POS_X = "x"; 14 | 15 | class PosTagger { 16 | public: 17 | PosTagger() { 18 | } 19 | ~PosTagger() { 20 | } 21 | 22 | bool Tag(const string& src, vector >& res, const SegmentTagged& segment) const { 23 | vector CutRes; 24 | segment.Cut(src, CutRes); 25 | 26 | for (vector::iterator itr = CutRes.begin(); itr != CutRes.end(); ++itr) { 27 | res.push_back(make_pair(*itr, LookupTag(*itr, segment))); 28 | } 29 | return !res.empty(); 30 | } 31 | 32 | string LookupTag(const string &str, const SegmentTagged& segment) const { 33 | const DictUnit *tmp = NULL; 34 | RuneStrArray runes; 35 | const DictTrie * dict = segment.GetDictTrie(); 36 | assert(dict != NULL); 37 | if (!DecodeUTF8RunesInString(str, runes)) { 38 | XLOG(ERROR) << "UTF-8 decode failed for word: " << str; 39 | return POS_X; 40 | } 41 | 
tmp = dict->Find(runes.begin(), runes.end()); 42 | if (tmp == NULL || tmp->tag.empty()) { 43 | return SpecialRule(runes); 44 | } else { 45 | return tmp->tag; 46 | } 47 | } 48 | 49 | private: 50 | const char* SpecialRule(const RuneStrArray& unicode) const { 51 | size_t m = 0; 52 | size_t eng = 0; 53 | for (size_t i = 0; i < unicode.size() && eng < unicode.size() / 2; i++) { 54 | if (unicode[i].rune < 0x80) { 55 | eng ++; 56 | if ('0' <= unicode[i].rune && unicode[i].rune <= '9') { 57 | m++; 58 | } 59 | } 60 | } 61 | // ascii char is not found 62 | if (eng == 0) { 63 | return POS_X; 64 | } 65 | // all the ascii is number char 66 | if (m == eng) { 67 | return POS_M; 68 | } 69 | // the ascii chars contain english letter 70 | return POS_ENG; 71 | } 72 | 73 | }; // class PosTagger 74 | 75 | } // namespace cppjieba 76 | 77 | #endif 78 | -------------------------------------------------------------------------------- /src/3rd_include/cppjieba/PreFilter.hpp: -------------------------------------------------------------------------------- 1 | #ifndef CPPJIEBA_PRE_FILTER_H 2 | #define CPPJIEBA_PRE_FILTER_H 3 | 4 | #include "Trie.hpp" 5 | #include "limonp/Logging.hpp" 6 | 7 | namespace cppjieba { 8 | 9 | class PreFilter { 10 | public: 11 | //TODO use WordRange instead of Range 12 | struct Range { 13 | RuneStrArray::const_iterator begin; 14 | RuneStrArray::const_iterator end; 15 | }; // struct Range 16 | 17 | PreFilter(const unordered_set& symbols, 18 | const string& sentence) 19 | : symbols_(symbols) { 20 | if (!DecodeUTF8RunesInString(sentence, sentence_)) { 21 | XLOG(ERROR) << "UTF-8 decode failed for input sentence"; 22 | } 23 | cursor_ = sentence_.begin(); 24 | } 25 | ~PreFilter() { 26 | } 27 | bool HasNext() const { 28 | return cursor_ != sentence_.end(); 29 | } 30 | Range Next() { 31 | Range range; 32 | range.begin = cursor_; 33 | while (cursor_ != sentence_.end()) { 34 | if (IsIn(symbols_, cursor_->rune)) { 35 | if (range.begin == cursor_) { 36 | cursor_ ++; 37 | } 38 | 
range.end = cursor_; 39 | return range; 40 | } 41 | cursor_ ++; 42 | } 43 | range.end = sentence_.end(); 44 | return range; 45 | } 46 | private: 47 | RuneStrArray::const_iterator cursor_; 48 | RuneStrArray sentence_; 49 | const unordered_set& symbols_; 50 | }; // class PreFilter 51 | 52 | } // namespace cppjieba 53 | 54 | #endif // CPPJIEBA_PRE_FILTER_H 55 | -------------------------------------------------------------------------------- /src/3rd_include/cppjieba/QuerySegment.hpp: -------------------------------------------------------------------------------- 1 | #ifndef CPPJIEBA_QUERYSEGMENT_H 2 | #define CPPJIEBA_QUERYSEGMENT_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include "limonp/Logging.hpp" 8 | #include "DictTrie.hpp" 9 | #include "SegmentBase.hpp" 10 | #include "FullSegment.hpp" 11 | #include "MixSegment.hpp" 12 | #include "Unicode.hpp" 13 | 14 | namespace cppjieba { 15 | class QuerySegment: public SegmentBase { 16 | public: 17 | QuerySegment(const string& dict, const string& model, const string& userDict = "") 18 | : mixSeg_(dict, model, userDict), 19 | trie_(mixSeg_.GetDictTrie()) { 20 | } 21 | QuerySegment(const DictTrie* dictTrie, const HMMModel* model) 22 | : mixSeg_(dictTrie, model), trie_(dictTrie) { 23 | } 24 | ~QuerySegment() { 25 | } 26 | 27 | void Cut(const string& sentence, vector& words) const { 28 | Cut(sentence, words, true); 29 | } 30 | void Cut(const string& sentence, vector& words, bool hmm) const { 31 | vector tmp; 32 | Cut(sentence, tmp, hmm); 33 | GetStringsFromWords(tmp, words); 34 | } 35 | void Cut(const string& sentence, vector& words, bool hmm = true) const { 36 | PreFilter pre_filter(symbols_, sentence); 37 | PreFilter::Range range; 38 | vector wrs; 39 | wrs.reserve(sentence.size()/2); 40 | while (pre_filter.HasNext()) { 41 | range = pre_filter.Next(); 42 | Cut(range.begin, range.end, wrs, hmm); 43 | } 44 | words.clear(); 45 | words.reserve(wrs.size()); 46 | GetWordsFromWordRanges(sentence, wrs, words); 47 | } 48 | void 
Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector& res, bool hmm) const { 49 | //use mix Cut first 50 | vector mixRes; 51 | mixSeg_.Cut(begin, end, mixRes, hmm); 52 | 53 | vector fullRes; 54 | for (vector::const_iterator mixResItr = mixRes.begin(); mixResItr != mixRes.end(); mixResItr++) { 55 | if (mixResItr->Length() > 2) { 56 | for (size_t i = 0; i + 1 < mixResItr->Length(); i++) { 57 | WordRange wr(mixResItr->left + i, mixResItr->left + i + 1); 58 | if (trie_->Find(wr.left, wr.right + 1) != NULL) { 59 | res.push_back(wr); 60 | } 61 | } 62 | } 63 | if (mixResItr->Length() > 3) { 64 | for (size_t i = 0; i + 2 < mixResItr->Length(); i++) { 65 | WordRange wr(mixResItr->left + i, mixResItr->left + i + 2); 66 | if (trie_->Find(wr.left, wr.right + 1) != NULL) { 67 | res.push_back(wr); 68 | } 69 | } 70 | } 71 | res.push_back(*mixResItr); 72 | } 73 | } 74 | private: 75 | bool IsAllAscii(const Unicode& s) const { 76 | for(size_t i = 0; i < s.size(); i++) { 77 | if (s[i] >= 0x80) { 78 | return false; 79 | } 80 | } 81 | return true; 82 | } 83 | MixSegment mixSeg_; 84 | const DictTrie* trie_; 85 | }; // QuerySegment 86 | 87 | } // namespace cppjieba 88 | 89 | #endif 90 | -------------------------------------------------------------------------------- /src/3rd_include/cppjieba/SegmentBase.hpp: -------------------------------------------------------------------------------- 1 | #ifndef CPPJIEBA_SEGMENTBASE_H 2 | #define CPPJIEBA_SEGMENTBASE_H 3 | 4 | #include "limonp/Logging.hpp" 5 | #include "PreFilter.hpp" 6 | #include 7 | 8 | 9 | namespace cppjieba { 10 | 11 | const char* const SPECIAL_SEPARATORS = " \t\n\xEF\xBC\x8C\xE3\x80\x82"; 12 | 13 | using namespace limonp; 14 | 15 | class SegmentBase { 16 | public: 17 | SegmentBase() { 18 | XCHECK(ResetSeparators(SPECIAL_SEPARATORS)); 19 | } 20 | virtual ~SegmentBase() { 21 | } 22 | 23 | virtual void Cut(const string& sentence, vector& words) const = 0; 24 | 25 | bool ResetSeparators(const string& s) { 
26 | symbols_.clear(); 27 | RuneStrArray runes; 28 | if (!DecodeUTF8RunesInString(s, runes)) { 29 | XLOG(ERROR) << "UTF-8 decode failed for separators: " << s; 30 | return false; 31 | } 32 | for (size_t i = 0; i < runes.size(); i++) { 33 | if (!symbols_.insert(runes[i].rune).second) { 34 | XLOG(ERROR) << s.substr(runes[i].offset, runes[i].len) << " already exists"; 35 | return false; 36 | } 37 | } 38 | return true; 39 | } 40 | protected: 41 | unordered_set symbols_; 42 | }; // class SegmentBase 43 | 44 | } // cppjieba 45 | 46 | #endif 47 | -------------------------------------------------------------------------------- /src/3rd_include/cppjieba/SegmentTagged.hpp: -------------------------------------------------------------------------------- 1 | #ifndef CPPJIEBA_SEGMENTTAGGED_H 2 | #define CPPJIEBA_SEGMENTTAGGED_H 3 | 4 | #include "SegmentBase.hpp" 5 | 6 | namespace cppjieba { 7 | 8 | class SegmentTagged : public SegmentBase{ 9 | public: 10 | SegmentTagged() { 11 | } 12 | virtual ~SegmentTagged() { 13 | } 14 | 15 | virtual bool Tag(const string& src, vector >& res) const = 0; 16 | 17 | virtual const DictTrie* GetDictTrie() const = 0; 18 | 19 | }; // class SegmentTagged 20 | 21 | } // cppjieba 22 | 23 | #endif 24 | -------------------------------------------------------------------------------- /src/3rd_include/cppjieba/TextRankExtractor.hpp: -------------------------------------------------------------------------------- 1 | #ifndef CPPJIEBA_TEXTRANK_EXTRACTOR_H 2 | #define CPPJIEBA_TEXTRANK_EXTRACTOR_H 3 | 4 | #include 5 | #include "Jieba.hpp" 6 | 7 | namespace cppjieba { 8 | using namespace limonp; 9 | using namespace std; 10 | 11 | class TextRankExtractor { 12 | public: 13 | typedef struct _Word {string word;vector offsets;double weight;} Word; // struct Word 14 | private: 15 | typedef std::map WordMap; 16 | 17 | class WordGraph{ 18 | private: 19 | typedef double Score; 20 | typedef string Node; 21 | typedef std::set NodeSet; 22 | 23 | typedef std::map Edges; 
24 | typedef std::map Graph; 25 | //typedef std::unordered_map Edges; 26 | //typedef std::unordered_map Graph; 27 | 28 | double d; 29 | Graph graph; 30 | NodeSet nodeSet; 31 | public: 32 | WordGraph(): d(0.85) {}; 33 | WordGraph(double in_d): d(in_d) {}; 34 | 35 | void addEdge(Node start,Node end,double weight){ 36 | Edges temp; 37 | Edges::iterator gotEdges; 38 | nodeSet.insert(start); 39 | nodeSet.insert(end); 40 | graph[start][end]+=weight; 41 | graph[end][start]+=weight; 42 | } 43 | 44 | void rank(WordMap &ws,size_t rankTime=10){ 45 | WordMap outSum; 46 | Score wsdef, min_rank, max_rank; 47 | 48 | if( graph.size() == 0) 49 | return; 50 | 51 | wsdef = 1.0 / graph.size(); 52 | 53 | for(Graph::iterator edges=graph.begin();edges!=graph.end();++edges){ 54 | // edges->first start节点;edge->first end节点;edge->second 权重 55 | ws[edges->first].word=edges->first; 56 | ws[edges->first].weight=wsdef; 57 | outSum[edges->first].weight=0; 58 | for(Edges::iterator edge=edges->second.begin();edge!=edges->second.end();++edge){ 59 | outSum[edges->first].weight+=edge->second; 60 | } 61 | } 62 | //sort(nodeSet.begin(),nodeSet.end()); 是否需要排序? 
63 | for( size_t i=0; ifirst end节点;edge->second 权重 68 | s += edge->second / outSum[edge->first].weight * ws[edge->first].weight; 69 | ws[*node].weight = (1 - d) + d * s; 70 | } 71 | } 72 | 73 | min_rank=max_rank=ws.begin()->second.weight; 74 | for(WordMap::iterator i = ws.begin(); i != ws.end(); i ++){ 75 | if( i->second.weight < min_rank ){ 76 | min_rank = i->second.weight; 77 | } 78 | if( i->second.weight > max_rank ){ 79 | max_rank = i->second.weight; 80 | } 81 | } 82 | for(WordMap::iterator i = ws.begin(); i != ws.end(); i ++){ 83 | ws[i->first].weight = (i->second.weight - min_rank / 10.0) / (max_rank - min_rank / 10.0); 84 | } 85 | } 86 | }; 87 | 88 | public: 89 | TextRankExtractor(const string& dictPath, 90 | const string& hmmFilePath, 91 | const string& stopWordPath, 92 | const string& userDict = "") 93 | : segment_(dictPath, hmmFilePath, userDict) { 94 | LoadStopWordDict(stopWordPath); 95 | } 96 | TextRankExtractor(const DictTrie* dictTrie, 97 | const HMMModel* model, 98 | const string& stopWordPath) 99 | : segment_(dictTrie, model) { 100 | LoadStopWordDict(stopWordPath); 101 | } 102 | TextRankExtractor(const Jieba& jieba, const string& stopWordPath) : segment_(jieba.GetDictTrie(), jieba.GetHMMModel()) { 103 | LoadStopWordDict(stopWordPath); 104 | } 105 | ~TextRankExtractor() { 106 | } 107 | 108 | void Extract(const string& sentence, vector& keywords, size_t topN) const { 109 | vector topWords; 110 | Extract(sentence, topWords, topN); 111 | for (size_t i = 0; i < topWords.size(); i++) { 112 | keywords.push_back(topWords[i].word); 113 | } 114 | } 115 | 116 | void Extract(const string& sentence, vector >& keywords, size_t topN) const { 117 | vector topWords; 118 | Extract(sentence, topWords, topN); 119 | for (size_t i = 0; i < topWords.size(); i++) { 120 | keywords.push_back(pair(topWords[i].word, topWords[i].weight)); 121 | } 122 | } 123 | 124 | void Extract(const string& sentence, vector& keywords, size_t topN, size_t span=5,size_t rankTime=10) const { 125 
| vector words; 126 | segment_.Cut(sentence, words); 127 | 128 | TextRankExtractor::WordGraph graph; 129 | WordMap wordmap; 130 | size_t offset = 0; 131 | 132 | for(size_t i=0; i < words.size(); i++){ 133 | size_t t = offset; 134 | offset += words[i].size(); 135 | if (IsSingleWord(words[i]) || stopWords_.find(words[i]) != stopWords_.end()) { 136 | continue; 137 | } 138 | for(size_t j=i+1,skip=0;jsecond); 158 | } 159 | 160 | topN = min(topN, keywords.size()); 161 | partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), Compare); 162 | keywords.resize(topN); 163 | } 164 | private: 165 | void LoadStopWordDict(const string& filePath) { 166 | ifstream ifs(filePath.c_str()); 167 | XCHECK(ifs.is_open()) << "open " << filePath << " failed"; 168 | string line ; 169 | while (getline(ifs, line)) { 170 | stopWords_.insert(line); 171 | } 172 | assert(stopWords_.size()); 173 | } 174 | 175 | static bool Compare(const Word &x,const Word &y){ 176 | return x.weight > y.weight; 177 | } 178 | 179 | MixSegment segment_; 180 | unordered_set stopWords_; 181 | }; // class TextRankExtractor 182 | 183 | inline ostream& operator << (ostream& os, const TextRankExtractor::Word& word) { 184 | return os << "{\"word\": \"" << word.word << "\", \"offset\": " << word.offsets << ", \"weight\": " << word.weight << "}"; 185 | } 186 | } // namespace cppjieba 187 | 188 | #endif 189 | 190 | 191 | -------------------------------------------------------------------------------- /src/3rd_include/cppjieba/Trie.hpp: -------------------------------------------------------------------------------- 1 | #ifndef CPPJIEBA_TRIE_HPP 2 | #define CPPJIEBA_TRIE_HPP 3 | 4 | #include 5 | #include 6 | #include "limonp/StdExtension.hpp" 7 | #include "Unicode.hpp" 8 | 9 | namespace cppjieba { 10 | 11 | using namespace std; 12 | 13 | const size_t MAX_WORD_LENGTH = 512; 14 | 15 | struct DictUnit { 16 | Unicode word; 17 | double weight; 18 | string tag; 19 | }; // struct DictUnit 20 | 21 | // for debugging 22 | 
// inline ostream & operator << (ostream& os, const DictUnit& unit) { 23 | // string s; 24 | // s << unit.word; 25 | // return os << StringFormat("%s %s %.3lf", s.c_str(), unit.tag.c_str(), unit.weight); 26 | // } 27 | 28 | struct Dag { 29 | RuneStr runestr; 30 | // [offset, nexts.first] 31 | limonp::LocalVector > nexts; 32 | const DictUnit * pInfo; 33 | double weight; 34 | size_t nextPos; // TODO 35 | Dag():runestr(), pInfo(NULL), weight(0.0), nextPos(0) { 36 | } 37 | }; // struct Dag 38 | 39 | typedef Rune TrieKey; 40 | 41 | class TrieNode { 42 | public : 43 | TrieNode(): next(NULL), ptValue(NULL) { 44 | } 45 | public: 46 | typedef unordered_map NextMap; 47 | NextMap *next; 48 | const DictUnit *ptValue; 49 | }; 50 | 51 | class Trie { 52 | public: 53 | Trie(const vector& keys, const vector& valuePointers) 54 | : root_(new TrieNode) { 55 | CreateTrie(keys, valuePointers); 56 | } 57 | ~Trie() { 58 | DeleteNode(root_); 59 | } 60 | 61 | const DictUnit* Find(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) const { 62 | if (begin == end) { 63 | return NULL; 64 | } 65 | 66 | const TrieNode* ptNode = root_; 67 | TrieNode::NextMap::const_iterator citer; 68 | for (RuneStrArray::const_iterator it = begin; it != end; it++) { 69 | if (NULL == ptNode->next) { 70 | return NULL; 71 | } 72 | citer = ptNode->next->find(it->rune); 73 | if (ptNode->next->end() == citer) { 74 | return NULL; 75 | } 76 | ptNode = citer->second; 77 | } 78 | return ptNode->ptValue; 79 | } 80 | 81 | void Find(RuneStrArray::const_iterator begin, 82 | RuneStrArray::const_iterator end, 83 | vector&res, 84 | size_t max_word_len = MAX_WORD_LENGTH) const { 85 | assert(root_ != NULL); 86 | res.resize(end - begin); 87 | 88 | const TrieNode *ptNode = NULL; 89 | TrieNode::NextMap::const_iterator citer; 90 | for (size_t i = 0; i < size_t(end - begin); i++) { 91 | res[i].runestr = *(begin + i); 92 | 93 | if (root_->next != NULL && root_->next->end() != (citer = 
root_->next->find(res[i].runestr.rune))) { 94 | ptNode = citer->second; 95 | } else { 96 | ptNode = NULL; 97 | } 98 | if (ptNode != NULL) { 99 | res[i].nexts.push_back(pair(i, ptNode->ptValue)); 100 | } else { 101 | res[i].nexts.push_back(pair(i, static_cast(NULL))); 102 | } 103 | 104 | for (size_t j = i + 1; j < size_t(end - begin) && (j - i + 1) <= max_word_len; j++) { 105 | if (ptNode == NULL || ptNode->next == NULL) { 106 | break; 107 | } 108 | citer = ptNode->next->find((begin + j)->rune); 109 | if (ptNode->next->end() == citer) { 110 | break; 111 | } 112 | ptNode = citer->second; 113 | if (NULL != ptNode->ptValue) { 114 | res[i].nexts.push_back(pair(j, ptNode->ptValue)); 115 | } 116 | } 117 | } 118 | } 119 | 120 | void InsertNode(const Unicode& key, const DictUnit* ptValue) { 121 | if (key.begin() == key.end()) { 122 | return; 123 | } 124 | 125 | TrieNode::NextMap::const_iterator kmIter; 126 | TrieNode *ptNode = root_; 127 | for (Unicode::const_iterator citer = key.begin(); citer != key.end(); ++citer) { 128 | if (NULL == ptNode->next) { 129 | ptNode->next = new TrieNode::NextMap; 130 | } 131 | kmIter = ptNode->next->find(*citer); 132 | if (ptNode->next->end() == kmIter) { 133 | TrieNode *nextNode = new TrieNode; 134 | 135 | ptNode->next->insert(make_pair(*citer, nextNode)); 136 | ptNode = nextNode; 137 | } else { 138 | ptNode = kmIter->second; 139 | } 140 | } 141 | assert(ptNode != NULL); 142 | ptNode->ptValue = ptValue; 143 | } 144 | void DeleteNode(const Unicode& key, const DictUnit* ptValue) { 145 | if (key.begin() == key.end()) { 146 | return; 147 | } 148 | //定义一个NextMap迭代器 149 | TrieNode::NextMap::const_iterator kmIter; 150 | //定义一个指向root的TrieNode指针 151 | TrieNode *ptNode = root_; 152 | for (Unicode::const_iterator citer = key.begin(); citer != key.end(); ++citer) { 153 | //链表不存在元素 154 | if (NULL == ptNode->next) { 155 | return; 156 | } 157 | kmIter = ptNode->next->find(*citer); 158 | //如果map中不存在,跳出循环 159 | if (ptNode->next->end() == kmIter) { 160 | 
break; 161 | } 162 | //从unordered_map中擦除该项 163 | ptNode->next->erase(*citer); 164 | //删除该node 165 | ptNode = kmIter->second; 166 | delete ptNode; 167 | break; 168 | } 169 | return; 170 | } 171 | private: 172 | void CreateTrie(const vector& keys, const vector& valuePointers) { 173 | if (valuePointers.empty() || keys.empty()) { 174 | return; 175 | } 176 | assert(keys.size() == valuePointers.size()); 177 | 178 | for (size_t i = 0; i < keys.size(); i++) { 179 | InsertNode(keys[i], valuePointers[i]); 180 | } 181 | } 182 | 183 | void DeleteNode(TrieNode* node) { 184 | if (NULL == node) { 185 | return; 186 | } 187 | if (NULL != node->next) { 188 | for (TrieNode::NextMap::iterator it = node->next->begin(); it != node->next->end(); ++it) { 189 | DeleteNode(it->second); 190 | } 191 | delete node->next; 192 | } 193 | delete node; 194 | } 195 | 196 | TrieNode* root_; 197 | }; // class Trie 198 | } // namespace cppjieba 199 | 200 | #endif // CPPJIEBA_TRIE_HPP 201 | -------------------------------------------------------------------------------- /src/3rd_include/cppjieba/Unicode.hpp: -------------------------------------------------------------------------------- 1 | #ifndef CPPJIEBA_UNICODE_H 2 | #define CPPJIEBA_UNICODE_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include "limonp/LocalVector.hpp" 10 | 11 | namespace cppjieba { 12 | 13 | using std::string; 14 | using std::vector; 15 | 16 | typedef uint32_t Rune; 17 | 18 | struct Word { 19 | string word; 20 | uint32_t offset; 21 | uint32_t unicode_offset; 22 | uint32_t unicode_length; 23 | Word(const string& w, uint32_t o) 24 | : word(w), offset(o) { 25 | } 26 | Word(const string& w, uint32_t o, uint32_t unicode_offset, uint32_t unicode_length) 27 | : word(w), offset(o), unicode_offset(unicode_offset), unicode_length(unicode_length) { 28 | } 29 | }; // struct Word 30 | 31 | inline std::ostream& operator << (std::ostream& os, const Word& w) { 32 | return os << "{\"word\": \"" << w.word << "\", 
// Result of decoding one UTF-8 sequence: the code point and the number of
// bytes it occupied. len == 0 means decoding failed.
struct RuneStrLite {
  uint32_t rune;
  uint32_t len;
  RuneStrLite(): rune(0), len(0) {
  }
  RuneStrLite(uint32_t r, uint32_t l): rune(r), len(l) {
  }
}; // struct RuneStrLite

// Decodes the first UTF-8 sequence in str[0, len).
//
// Returns {rune, byte length} on success and {0, 0} on failure. Decoding
// fails when `str` is NULL, `len` is 0, the first byte is not a valid lead
// byte (0xxxxxxx, 110xxxxx, 1110xxxx or 11110xxx), fewer bytes are available
// than the lead byte announces, or any trailing byte is not of the form
// 10xxxxxx. In particular, a stray continuation byte (0x80-0xBF) in lead
// position is rejected instead of being read as a 2-byte sequence.
//
// Only the byte structure is checked; overlong encodings and surrogate code
// points are not rejected.
inline RuneStrLite DecodeUTF8ToRune(const char* str, size_t len) {
  RuneStrLite rp(0, 0);
  if (str == NULL || len == 0) {
    return rp;
  }

  const uint8_t lead = static_cast<uint8_t>(str[0]);
  uint32_t need = 0;  // total bytes in the sequence
  uint32_t rune = 0;  // payload bits collected so far
  if (lead < 0x80) {                  // 0xxxxxxx
    need = 1;
    rune = lead;
  } else if ((lead & 0xE0) == 0xC0) { // 110xxxxx
    need = 2;
    rune = lead & 0x1F;
  } else if ((lead & 0xF0) == 0xE0) { // 1110xxxx
    need = 3;
    rune = lead & 0x0F;
  } else if ((lead & 0xF8) == 0xF0) { // 11110xxx
    need = 4;
    rune = lead & 0x07;
  } else {
    return rp;  // continuation byte or 0xF8+ in lead position
  }
  if (len < need) {
    return rp;  // truncated sequence
  }

  for (uint32_t i = 1; i < need; ++i) {
    const uint8_t trail = static_cast<uint8_t>(str[i]);
    if ((trail & 0xC0) != 0x80) {
      return rp;  // not a 10xxxxxx continuation byte
    }
    rune = (rune << 6) | (trail & 0x3F);
  }

  rp.rune = rune;
  rp.len = need;
  return rp;
}
(size_t i = 0; i < runes.size(); i++) { 171 | unicode.push_back(runes[i].rune); 172 | } 173 | return true; 174 | } 175 | 176 | inline bool IsSingleWord(const string& str) { 177 | RuneStrLite rp = DecodeUTF8ToRune(str.c_str(), str.size()); 178 | return rp.len == str.size(); 179 | } 180 | 181 | inline bool DecodeUTF8RunesInString(const string& s, Unicode& unicode) { 182 | return DecodeUTF8RunesInString(s.c_str(), s.size(), unicode); 183 | } 184 | 185 | inline Unicode DecodeUTF8RunesInString(const string& s) { 186 | Unicode result; 187 | DecodeUTF8RunesInString(s, result); 188 | return result; 189 | } 190 | 191 | 192 | // [left, right] 193 | inline Word GetWordFromRunes(const string& s, RuneStrArray::const_iterator left, RuneStrArray::const_iterator right) { 194 | assert(right->offset >= left->offset); 195 | uint32_t len = right->offset - left->offset + right->len; 196 | uint32_t unicode_length = right->unicode_offset - left->unicode_offset + right->unicode_length; 197 | return Word(s.substr(left->offset, len), left->offset, left->unicode_offset, unicode_length); 198 | } 199 | 200 | inline string GetStringFromRunes(const string& s, RuneStrArray::const_iterator left, RuneStrArray::const_iterator right) { 201 | assert(right->offset >= left->offset); 202 | uint32_t len = right->offset - left->offset + right->len; 203 | return s.substr(left->offset, len); 204 | } 205 | 206 | inline void GetWordsFromWordRanges(const string& s, const vector& wrs, vector& words) { 207 | for (size_t i = 0; i < wrs.size(); i++) { 208 | words.push_back(GetWordFromRunes(s, wrs[i].left, wrs[i].right)); 209 | } 210 | } 211 | 212 | inline vector GetWordsFromWordRanges(const string& s, const vector& wrs) { 213 | vector result; 214 | GetWordsFromWordRanges(s, wrs, result); 215 | return result; 216 | } 217 | 218 | inline void GetStringsFromWords(const vector& words, vector& strs) { 219 | strs.resize(words.size()); 220 | for (size_t i = 0; i < words.size(); ++i) { 221 | strs[i] = words[i].word; 222 | } 
223 | } 224 | 225 | } // namespace cppjieba 226 | 227 | #endif // CPPJIEBA_UNICODE_H 228 | -------------------------------------------------------------------------------- /src/3rd_include/limonp/ArgvContext.hpp: -------------------------------------------------------------------------------- 1 | /************************************ 2 | * file enc : ascii 3 | * author : wuyanyi09@gmail.com 4 | ************************************/ 5 | 6 | #ifndef LIMONP_ARGV_FUNCTS_H 7 | #define LIMONP_ARGV_FUNCTS_H 8 | 9 | #include 10 | #include 11 | #include "StringUtil.hpp" 12 | 13 | namespace limonp { 14 | 15 | using namespace std; 16 | 17 | class ArgvContext { 18 | public : 19 | ArgvContext(int argc, const char* const * argv) { 20 | for(int i = 0; i < argc; i++) { 21 | if(StartsWith(argv[i], "-")) { 22 | if(i + 1 < argc && !StartsWith(argv[i + 1], "-")) { 23 | mpss_[argv[i]] = argv[i+1]; 24 | i++; 25 | } else { 26 | sset_.insert(argv[i]); 27 | } 28 | } else { 29 | args_.push_back(argv[i]); 30 | } 31 | } 32 | } 33 | ~ArgvContext() { 34 | } 35 | 36 | friend ostream& operator << (ostream& os, const ArgvContext& args); 37 | string operator [](size_t i) const { 38 | if(i < args_.size()) { 39 | return args_[i]; 40 | } 41 | return ""; 42 | } 43 | string operator [](const string& key) const { 44 | map::const_iterator it = mpss_.find(key); 45 | if(it != mpss_.end()) { 46 | return it->second; 47 | } 48 | return ""; 49 | } 50 | 51 | bool HasKey(const string& key) const { 52 | if(mpss_.find(key) != mpss_.end() || sset_.find(key) != sset_.end()) { 53 | return true; 54 | } 55 | return false; 56 | } 57 | 58 | private: 59 | vector args_; 60 | map mpss_; 61 | set sset_; 62 | }; // class ArgvContext 63 | 64 | inline ostream& operator << (ostream& os, const ArgvContext& args) { 65 | return os< 14 | class Closure0: public ClosureInterface { 15 | public: 16 | Closure0(Funct fun) { 17 | fun_ = fun; 18 | } 19 | virtual ~Closure0() { 20 | } 21 | virtual void Run() { 22 | (*fun_)(); 23 | } 24 | 
private: 25 | Funct fun_; 26 | }; 27 | 28 | template 29 | class Closure1: public ClosureInterface { 30 | public: 31 | Closure1(Funct fun, Arg1 arg1) { 32 | fun_ = fun; 33 | arg1_ = arg1; 34 | } 35 | virtual ~Closure1() { 36 | } 37 | virtual void Run() { 38 | (*fun_)(arg1_); 39 | } 40 | private: 41 | Funct fun_; 42 | Arg1 arg1_; 43 | }; 44 | 45 | template 46 | class Closure2: public ClosureInterface { 47 | public: 48 | Closure2(Funct fun, Arg1 arg1, Arg2 arg2) { 49 | fun_ = fun; 50 | arg1_ = arg1; 51 | arg2_ = arg2; 52 | } 53 | virtual ~Closure2() { 54 | } 55 | virtual void Run() { 56 | (*fun_)(arg1_, arg2_); 57 | } 58 | private: 59 | Funct fun_; 60 | Arg1 arg1_; 61 | Arg2 arg2_; 62 | }; 63 | 64 | template 65 | class Closure3: public ClosureInterface { 66 | public: 67 | Closure3(Funct fun, Arg1 arg1, Arg2 arg2, Arg3 arg3) { 68 | fun_ = fun; 69 | arg1_ = arg1; 70 | arg2_ = arg2; 71 | arg3_ = arg3; 72 | } 73 | virtual ~Closure3() { 74 | } 75 | virtual void Run() { 76 | (*fun_)(arg1_, arg2_, arg3_); 77 | } 78 | private: 79 | Funct fun_; 80 | Arg1 arg1_; 81 | Arg2 arg2_; 82 | Arg3 arg3_; 83 | }; 84 | 85 | template 86 | class ObjClosure0: public ClosureInterface { 87 | public: 88 | ObjClosure0(Obj* p, Funct fun) { 89 | p_ = p; 90 | fun_ = fun; 91 | } 92 | virtual ~ObjClosure0() { 93 | } 94 | virtual void Run() { 95 | (p_->*fun_)(); 96 | } 97 | private: 98 | Obj* p_; 99 | Funct fun_; 100 | }; 101 | 102 | template 103 | class ObjClosure1: public ClosureInterface { 104 | public: 105 | ObjClosure1(Obj* p, Funct fun, Arg1 arg1) { 106 | p_ = p; 107 | fun_ = fun; 108 | arg1_ = arg1; 109 | } 110 | virtual ~ObjClosure1() { 111 | } 112 | virtual void Run() { 113 | (p_->*fun_)(arg1_); 114 | } 115 | private: 116 | Obj* p_; 117 | Funct fun_; 118 | Arg1 arg1_; 119 | }; 120 | 121 | template 122 | class ObjClosure2: public ClosureInterface { 123 | public: 124 | ObjClosure2(Obj* p, Funct fun, Arg1 arg1, Arg2 arg2) { 125 | p_ = p; 126 | fun_ = fun; 127 | arg1_ = arg1; 128 | arg2_ = arg2; 
129 | } 130 | virtual ~ObjClosure2() { 131 | } 132 | virtual void Run() { 133 | (p_->*fun_)(arg1_, arg2_); 134 | } 135 | private: 136 | Obj* p_; 137 | Funct fun_; 138 | Arg1 arg1_; 139 | Arg2 arg2_; 140 | }; 141 | template 142 | class ObjClosure3: public ClosureInterface { 143 | public: 144 | ObjClosure3(Obj* p, Funct fun, Arg1 arg1, Arg2 arg2, Arg3 arg3) { 145 | p_ = p; 146 | fun_ = fun; 147 | arg1_ = arg1; 148 | arg2_ = arg2; 149 | arg3_ = arg3; 150 | } 151 | virtual ~ObjClosure3() { 152 | } 153 | virtual void Run() { 154 | (p_->*fun_)(arg1_, arg2_, arg3_); 155 | } 156 | private: 157 | Obj* p_; 158 | Funct fun_; 159 | Arg1 arg1_; 160 | Arg2 arg2_; 161 | Arg3 arg3_; 162 | }; 163 | 164 | template 165 | ClosureInterface* NewClosure(R (*fun)()) { 166 | return new Closure0(fun); 167 | } 168 | 169 | template 170 | ClosureInterface* NewClosure(R (*fun)(Arg1), Arg1 arg1) { 171 | return new Closure1(fun, arg1); 172 | } 173 | 174 | template 175 | ClosureInterface* NewClosure(R (*fun)(Arg1, Arg2), Arg1 arg1, Arg2 arg2) { 176 | return new Closure2(fun, arg1, arg2); 177 | } 178 | 179 | template 180 | ClosureInterface* NewClosure(R (*fun)(Arg1, Arg2, Arg3), Arg1 arg1, Arg2 arg2, Arg3 arg3) { 181 | return new Closure3(fun, arg1, arg2, arg3); 182 | } 183 | 184 | template 185 | ClosureInterface* NewClosure(Obj* obj, R (Obj::* fun)()) { 186 | return new ObjClosure0(obj, fun); 187 | } 188 | 189 | template 190 | ClosureInterface* NewClosure(Obj* obj, R (Obj::* fun)(Arg1), Arg1 arg1) { 191 | return new ObjClosure1(obj, fun, arg1); 192 | } 193 | 194 | template 195 | ClosureInterface* NewClosure(Obj* obj, R (Obj::* fun)(Arg1, Arg2), Arg1 arg1, Arg2 arg2) { 196 | return new ObjClosure2(obj, fun, arg1, arg2); 197 | } 198 | 199 | template 200 | ClosureInterface* NewClosure(Obj* obj, R (Obj::* fun)(Arg1, Arg2, Arg3), Arg1 arg1, Arg2 arg2, Arg3 arg3) { 201 | return new ObjClosure3(obj, fun, arg1, arg2, arg3); 202 | } 203 | 204 | } // namespace limonp 205 | 206 | #endif // LIMONP_CLOSURE_HPP 
207 | -------------------------------------------------------------------------------- /src/3rd_include/limonp/Colors.hpp: -------------------------------------------------------------------------------- 1 | #ifndef LIMONP_COLOR_PRINT_HPP 2 | #define LIMONP_COLOR_PRINT_HPP 3 | 4 | #include 5 | #include 6 | 7 | namespace limonp { 8 | 9 | using std::string; 10 | 11 | enum Color { 12 | BLACK = 30, 13 | RED, 14 | GREEN, 15 | YELLOW, 16 | BLUE, 17 | PURPLE 18 | }; // enum Color 19 | 20 | static void ColorPrintln(enum Color color, const char * fmt, ...) { 21 | va_list ap; 22 | printf("\033[0;%dm", color); 23 | va_start(ap, fmt); 24 | vprintf(fmt, ap); 25 | va_end(ap); 26 | printf("\033[0m\n"); // if not \n , in some situation , the next lines will be set the same color unexpectedly 27 | } 28 | 29 | } // namespace limonp 30 | 31 | #endif // LIMONP_COLOR_PRINT_HPP 32 | -------------------------------------------------------------------------------- /src/3rd_include/limonp/Condition.hpp: -------------------------------------------------------------------------------- 1 | #ifndef LIMONP_CONDITION_HPP 2 | #define LIMONP_CONDITION_HPP 3 | 4 | #include "MutexLock.hpp" 5 | 6 | namespace limonp { 7 | 8 | class Condition : NonCopyable { 9 | public: 10 | explicit Condition(MutexLock& mutex) 11 | : mutex_(mutex) { 12 | XCHECK(!pthread_cond_init(&pcond_, NULL)); 13 | } 14 | 15 | ~Condition() { 16 | XCHECK(!pthread_cond_destroy(&pcond_)); 17 | } 18 | 19 | void Wait() { 20 | XCHECK(!pthread_cond_wait(&pcond_, mutex_.GetPthreadMutex())); 21 | } 22 | 23 | void Notify() { 24 | XCHECK(!pthread_cond_signal(&pcond_)); 25 | } 26 | 27 | void NotifyAll() { 28 | XCHECK(!pthread_cond_broadcast(&pcond_)); 29 | } 30 | 31 | private: 32 | MutexLock& mutex_; 33 | pthread_cond_t pcond_; 34 | }; // class Condition 35 | 36 | } // namespace limonp 37 | 38 | #endif // LIMONP_CONDITION_HPP 39 | -------------------------------------------------------------------------------- 
/src/3rd_include/limonp/Config.hpp: -------------------------------------------------------------------------------- 1 | /************************************ 2 | * file enc : utf8 3 | * author : wuyanyi09@gmail.com 4 | ************************************/ 5 | #ifndef LIMONP_CONFIG_H 6 | #define LIMONP_CONFIG_H 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include "StringUtil.hpp" 13 | 14 | namespace limonp { 15 | 16 | using namespace std; 17 | 18 | class Config { 19 | public: 20 | explicit Config(const string& filePath) { 21 | LoadFile(filePath); 22 | } 23 | 24 | operator bool () { 25 | return !map_.empty(); 26 | } 27 | 28 | string Get(const string& key, const string& defaultvalue) const { 29 | map::const_iterator it = map_.find(key); 30 | if(map_.end() != it) { 31 | return it->second; 32 | } 33 | return defaultvalue; 34 | } 35 | int Get(const string& key, int defaultvalue) const { 36 | string str = Get(key, ""); 37 | if("" == str) { 38 | return defaultvalue; 39 | } 40 | return atoi(str.c_str()); 41 | } 42 | const char* operator [] (const char* key) const { 43 | if(NULL == key) { 44 | return NULL; 45 | } 46 | map::const_iterator it = map_.find(key); 47 | if(map_.end() != it) { 48 | return it->second.c_str(); 49 | } 50 | return NULL; 51 | } 52 | 53 | string GetConfigInfo() const { 54 | string res; 55 | res << *this; 56 | return res; 57 | } 58 | 59 | private: 60 | void LoadFile(const string& filePath) { 61 | ifstream ifs(filePath.c_str()); 62 | assert(ifs); 63 | string line; 64 | vector vecBuf; 65 | size_t lineno = 0; 66 | while(getline(ifs, line)) { 67 | lineno ++; 68 | Trim(line); 69 | if(line.empty() || StartsWith(line, "#")) { 70 | continue; 71 | } 72 | vecBuf.clear(); 73 | Split(line, vecBuf, "="); 74 | if(2 != vecBuf.size()) { 75 | fprintf(stderr, "line[%s] illegal.\n", line.c_str()); 76 | assert(false); 77 | continue; 78 | } 79 | string& key = vecBuf[0]; 80 | string& value = vecBuf[1]; 81 | Trim(key); 82 | Trim(value); 83 | 
if(!map_.insert(make_pair(key, value)).second) { 84 | fprintf(stderr, "key[%s] already exits.\n", key.c_str()); 85 | assert(false); 86 | continue; 87 | } 88 | } 89 | ifs.close(); 90 | } 91 | 92 | friend ostream& operator << (ostream& os, const Config& config); 93 | 94 | map map_; 95 | }; // class Config 96 | 97 | inline ostream& operator << (ostream& os, const Config& config) { 98 | return os << config.map_; 99 | } 100 | 101 | } // namespace limonp 102 | 103 | #endif // LIMONP_CONFIG_H 104 | -------------------------------------------------------------------------------- /src/3rd_include/limonp/ForcePublic.hpp: -------------------------------------------------------------------------------- 1 | #ifndef LIMONP_FORCE_PUBLIC_H 2 | #define LIMONP_FORCE_PUBLIC_H 3 | 4 | #define private public 5 | #define protected public 6 | 7 | #endif // LIMONP_FORCE_PUBLIC_H 8 | -------------------------------------------------------------------------------- /src/3rd_include/limonp/LocalVector.hpp: -------------------------------------------------------------------------------- 1 | #ifndef LIMONP_LOCAL_VECTOR_HPP 2 | #define LIMONP_LOCAL_VECTOR_HPP 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | namespace limonp { 10 | using namespace std; 11 | /* 12 | * LocalVector : T must be primitive type (char , int, size_t), if T is struct or class, LocalVector may be dangerous.. 13 | * LocalVector is simple and not well-tested. 
14 | */ 15 | const size_t LOCAL_VECTOR_BUFFER_SIZE = 16; 16 | template 17 | class LocalVector { 18 | public: 19 | typedef const T* const_iterator ; 20 | typedef T value_type; 21 | typedef size_t size_type; 22 | private: 23 | T buffer_[LOCAL_VECTOR_BUFFER_SIZE]; 24 | T * ptr_; 25 | size_t size_; 26 | size_t capacity_; 27 | public: 28 | LocalVector() { 29 | init_(); 30 | }; 31 | LocalVector(const LocalVector& vec) { 32 | init_(); 33 | *this = vec; 34 | } 35 | LocalVector(const_iterator begin, const_iterator end) { // TODO: make it faster 36 | init_(); 37 | while(begin != end) { 38 | push_back(*begin++); 39 | } 40 | } 41 | LocalVector(size_t size, const T& t) { // TODO: make it faster 42 | init_(); 43 | while(size--) { 44 | push_back(t); 45 | } 46 | } 47 | ~LocalVector() { 48 | if(ptr_ != buffer_) { 49 | free(ptr_); 50 | } 51 | }; 52 | public: 53 | LocalVector& operator = (const LocalVector& vec) { 54 | clear(); 55 | size_ = vec.size(); 56 | capacity_ = vec.capacity(); 57 | if(vec.buffer_ == vec.ptr_) { 58 | memcpy(static_cast(buffer_), vec.buffer_, sizeof(T) * size_); 59 | ptr_ = buffer_; 60 | } else { 61 | ptr_ = (T*) malloc(vec.capacity() * sizeof(T)); 62 | assert(ptr_); 63 | memcpy(static_cast(ptr_), vec.ptr_, vec.size() * sizeof(T)); 64 | } 65 | return *this; 66 | } 67 | private: 68 | void init_() { 69 | ptr_ = buffer_; 70 | size_ = 0; 71 | capacity_ = LOCAL_VECTOR_BUFFER_SIZE; 72 | } 73 | public: 74 | T& operator [] (size_t i) { 75 | return ptr_[i]; 76 | } 77 | const T& operator [] (size_t i) const { 78 | return ptr_[i]; 79 | } 80 | void push_back(const T& t) { 81 | if(size_ == capacity_) { 82 | assert(capacity_); 83 | reserve(capacity_ * 2); 84 | } 85 | ptr_[size_ ++ ] = t; 86 | } 87 | void reserve(size_t size) { 88 | if(size <= capacity_) { 89 | return; 90 | } 91 | T * next = (T*)malloc(sizeof(T) * size); 92 | assert(next); 93 | T * old = ptr_; 94 | ptr_ = next; 95 | memcpy(static_cast(ptr_), old, sizeof(T) * capacity_); 96 | capacity_ = size; 97 | if(old != 
buffer_) { 98 | free(old); 99 | } 100 | } 101 | bool empty() const { 102 | return 0 == size(); 103 | } 104 | size_t size() const { 105 | return size_; 106 | } 107 | size_t capacity() const { 108 | return capacity_; 109 | } 110 | const_iterator begin() const { 111 | return ptr_; 112 | } 113 | const_iterator end() const { 114 | return ptr_ + size_; 115 | } 116 | void clear() { 117 | if(ptr_ != buffer_) { 118 | free(ptr_); 119 | } 120 | init_(); 121 | } 122 | }; 123 | 124 | template 125 | ostream & operator << (ostream& os, const LocalVector& vec) { 126 | if(vec.empty()) { 127 | return os << "[]"; 128 | } 129 | os<<"[\""< 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #ifdef XLOG 11 | #error "XLOG has been defined already" 12 | #endif // XLOG 13 | #ifdef XCHECK 14 | #error "XCHECK has been defined already" 15 | #endif // XCHECK 16 | 17 | #define XLOG(level) limonp::Logger(limonp::LL_##level, __FILE__, __LINE__).Stream() 18 | #define XCHECK(exp) if(!(exp)) XLOG(FATAL) << "exp: ["#exp << "] false. 
" 19 | 20 | namespace limonp { 21 | 22 | enum { 23 | LL_DEBUG = 0, 24 | LL_INFO = 1, 25 | LL_WARNING = 2, 26 | LL_ERROR = 3, 27 | LL_FATAL = 4, 28 | }; // enum 29 | 30 | static const char * LOG_LEVEL_ARRAY[] = {"DEBUG","INFO","WARN","ERROR","FATAL"}; 31 | static const char * LOG_TIME_FORMAT = "%Y-%m-%d %H:%M:%S"; 32 | 33 | class Logger { 34 | public: 35 | Logger(size_t level, const char* filename, int lineno) 36 | : level_(level) { 37 | #ifdef LOGGING_LEVEL 38 | if (level_ < LOGGING_LEVEL) { 39 | return; 40 | } 41 | #endif 42 | assert(level_ <= sizeof(LOG_LEVEL_ARRAY)/sizeof(*LOG_LEVEL_ARRAY)); 43 | 44 | char buf[32]; 45 | 46 | time_t timeNow; 47 | time(&timeNow); 48 | 49 | struct tm tmNow; 50 | 51 | #if defined(_WIN32) || defined(_WIN64) 52 | errno_t e = localtime_s(&tmNow, &timeNow); 53 | assert(e == 0); 54 | #else 55 | struct tm * tm_tmp = localtime_r(&timeNow, &tmNow); 56 | assert(tm_tmp != nullptr); 57 | #endif 58 | 59 | strftime(buf, sizeof(buf), LOG_TIME_FORMAT, &tmNow); 60 | 61 | stream_ << buf 62 | << " " << filename 63 | << ":" << lineno 64 | << " " << LOG_LEVEL_ARRAY[level_] 65 | << " "; 66 | } 67 | ~Logger() { 68 | #ifdef LOGGING_LEVEL 69 | if (level_ < LOGGING_LEVEL) { 70 | return; 71 | } 72 | #endif 73 | std::cerr << stream_.str() << std::endl; 74 | if (level_ == LL_FATAL) { 75 | abort(); 76 | } 77 | } 78 | 79 | std::ostream& Stream() { 80 | return stream_; 81 | } 82 | 83 | private: 84 | std::ostringstream stream_; 85 | size_t level_; 86 | }; // class Logger 87 | 88 | } // namespace limonp 89 | 90 | #endif // LIMONP_LOGGING_HPP 91 | -------------------------------------------------------------------------------- /src/3rd_include/limonp/NonCopyable.hpp: -------------------------------------------------------------------------------- 1 | /************************************ 2 | ************************************/ 3 | #ifndef LIMONP_NONCOPYABLE_H 4 | #define LIMONP_NONCOPYABLE_H 5 | 6 | namespace limonp { 7 | 8 | class NonCopyable { 9 | protected: 10 
| NonCopyable() { 11 | } 12 | ~NonCopyable() { 13 | } 14 | private: 15 | NonCopyable(const NonCopyable& ); 16 | const NonCopyable& operator=(const NonCopyable& ); 17 | }; // class NonCopyable 18 | 19 | } // namespace limonp 20 | 21 | #endif // LIMONP_NONCOPYABLE_H 22 | -------------------------------------------------------------------------------- /src/3rd_include/limonp/StdExtension.hpp: -------------------------------------------------------------------------------- 1 | #ifndef LIMONP_STD_EXTEMSION_HPP 2 | #define LIMONP_STD_EXTEMSION_HPP 3 | 4 | #include 5 | 6 | #ifdef __APPLE__ 7 | #include 8 | #include 9 | #elif(__cplusplus >= 201103L) 10 | #include 11 | #include 12 | #elif defined _MSC_VER 13 | #include 14 | #include 15 | #else 16 | #include 17 | #include 18 | namespace std { 19 | using std::tr1::unordered_map; 20 | using std::tr1::unordered_set; 21 | } 22 | 23 | #endif 24 | 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | 32 | namespace std { 33 | 34 | template 35 | ostream& operator << (ostream& os, const vector& v) { 36 | if(v.empty()) { 37 | return os << "[]"; 38 | } 39 | os<<"["< 48 | inline ostream& operator << (ostream& os, const vector& v) { 49 | if(v.empty()) { 50 | return os << "[]"; 51 | } 52 | os<<"[\""< 61 | ostream& operator << (ostream& os, const deque& dq) { 62 | if(dq.empty()) { 63 | return os << "[]"; 64 | } 65 | os<<"[\""< 75 | ostream& operator << (ostream& os, const pair& pr) { 76 | os << pr.first << ":" << pr.second ; 77 | return os; 78 | } 79 | 80 | 81 | template 82 | string& operator << (string& str, const T& obj) { 83 | stringstream ss; 84 | ss << obj; // call ostream& operator << (ostream& os, 85 | return str = ss.str(); 86 | } 87 | 88 | template 89 | ostream& operator << (ostream& os, const map& mp) { 90 | if(mp.empty()) { 91 | os<<"{}"; 92 | return os; 93 | } 94 | os<<'{'; 95 | typename map::const_iterator it = mp.begin(); 96 | os<<*it; 97 | it++; 98 | while(it != mp.end()) { 99 | os<<", 
"<<*it; 100 | it++; 101 | } 102 | os<<'}'; 103 | return os; 104 | } 105 | template 106 | ostream& operator << (ostream& os, const std::unordered_map& mp) { 107 | if(mp.empty()) { 108 | return os << "{}"; 109 | } 110 | os<<'{'; 111 | typename std::unordered_map::const_iterator it = mp.begin(); 112 | os<<*it; 113 | it++; 114 | while(it != mp.end()) { 115 | os<<", "<<*it++; 116 | } 117 | return os<<'}'; 118 | } 119 | 120 | template 121 | ostream& operator << (ostream& os, const set& st) { 122 | if(st.empty()) { 123 | os << "{}"; 124 | return os; 125 | } 126 | os<<'{'; 127 | typename set::const_iterator it = st.begin(); 128 | os<<*it; 129 | it++; 130 | while(it != st.end()) { 131 | os<<", "<<*it; 132 | it++; 133 | } 134 | os<<'}'; 135 | return os; 136 | } 137 | 138 | template 139 | bool IsIn(const ContainType& contain, const KeyType& key) { 140 | return contain.end() != contain.find(key); 141 | } 142 | 143 | template 144 | basic_string & operator << (basic_string & s, ifstream & ifs) { 145 | return s.assign((istreambuf_iterator(ifs)), istreambuf_iterator()); 146 | } 147 | 148 | template 149 | ofstream & operator << (ofstream & ofs, const basic_string& s) { 150 | ostreambuf_iterator itr (ofs); 151 | copy(s.begin(), s.end(), itr); 152 | return ofs; 153 | } 154 | 155 | } // namespace std 156 | 157 | #endif 158 | -------------------------------------------------------------------------------- /src/3rd_include/zh_normalization/TextNormalizer.h: -------------------------------------------------------------------------------- 1 | // 2 | // Created by tao on 25-4-9. 
//

#ifndef TEXTNORMALIZATION_H
#define TEXTNORMALIZATION_H


// NOTE(review): throughout these headers the extraction dropped every
// "<...>" span, i.e. the #include targets and all template arguments. The
// code is left exactly as extracted; restore the missing pieces from the real
// headers before compiling. The names used here suggest <regex>, <string> and
// <vector> for the three includes below -- TODO confirm.
#include 
#include 
#include 

// Entry point of the Chinese text normalisation.
class TextNormalizer {
public:
    TextNormalizer();
    // NOTE(review): the element type of the returned vector was stripped.
    std::vector normalize(const std::string& text);
    static std::string normalize_sentence(const std::string& sentence);

private:
    std::regex SENTENCE_SPLITOR;
    // `lang` defaults to "zh". Element type of the result stripped as above.
    std::vector _split(const std::string& text, const std::string& lang = "zh") const;
    static std::string _post_replace(const std::string& sentence);
};

#endif //TEXTNORMALIZATION_H

// --------------------------------------------------------------------------------
// /src/3rd_include/zh_normalization/chinese_converter.h:
// --------------------------------------------------------------------------------
#ifndef CHINESE_CONVERTER_H
#define CHINESE_CONVERTER_H

#include 
#include 

// Static-only converter between Traditional and Simplified Chinese text.
class ChineseConverter {
public:
    static std::string traditionalToSimplified(const std::string& text);
    static std::string simplifiedToTraditional(const std::string& text);

private:
    // NOTE(review): key/value types of both maps were stripped by the
    // extraction and cannot be told from this header alone.
    static std::unordered_map t2s_dict;
    static std::unordered_map s2t_dict;
    static void initializeDicts();
    static bool is_initialized;
};

#endif // CHINESE_CONVERTER_H
// --------------------------------------------------------------------------------
// /src/3rd_include/zh_normalization/chronology.h:
// --------------------------------------------------------------------------------
#ifndef CHRONOLOGY_H
#define CHRONOLOGY_H

#include 
#include 
#include 

#include "num.h"

// Time and date patterns together with the callbacks that turn a match into text.
class Chronology {
public:
    // Static handler functions
    static std::string time_num2str(const std::string& num_string);
    static std::string replace_time(const std::smatch& match);
    static std::string replace_date(const std::smatch& match);
    static std::string replace_date2(const std::smatch& match);

    // Static regular expressions
    // RE_TIME: H:MM or H:MM:SS. RE_TIME_RANGE: two such times joined by '~' or '-'.
    static inline auto RE_TIME = std::regex(R"(([0-1]?[0-9]|2[0-3]):([0-5][0-9])(:([0-5][0-9]))?)");
    static inline auto RE_TIME_RANGE = std::regex(R"(([0-1]?[0-9]|2[0-3]):([0-5][0-9])(:([0-5][0-9]))?(~|-)([0-1]?[0-9]|2[0-3]):([0-5][0-9])(:([0-5][0-9]))?)");
    // static inline auto RE_DATE = std::regex(R"(((\d{4}|\d{2})年)?((0?[1-9]|1[0-2])月)?(((0?[1-9])|((1|2)[0-9])|30|31)([日号]))?)");
    // NOTE(review): every group of RE_DATE is optional, so the pattern can
    // also match the empty string.
    static inline auto RE_DATE = std::regex(R"(((\d{2,4})年)?(([1-9]|1[0-2])月)?(((0?[1-9])|([12][0-9])|30|31)(日|号))?)");
    // RE_DATE2: YYYY<sep>MM<sep>DD where the separator ('-', '/' or '.') is
    // the same in both places (back-reference \2).
    static inline auto RE_DATE2 = std::regex(R"((\d{4})([-/.])(0[1-9]|1[012])\2(0[1-9]|[12][0-9]|3[01]))");
};

#endif //CHRONOLOGY_H
// --------------------------------------------------------------------------------
// /src/3rd_include/zh_normalization/constants.h:
// --------------------------------------------------------------------------------
#pragma once

#include 
#include 
#include 

// Singleton holding the full-width/half-width character tables and the regex
// used to extract non-standard words.
class Constants {
public:
    // Returns the singleton instance
    static Constants& getInstance();

    // Full-width to half-width conversion
    std::string fullToHalf(const std::string& text);

    // Half-width to full-width conversion
    std::string halfToFull(const std::string& text);

    // Returns the non-Chinese-character parts (NSW: Non-Standard-Word)
    // NOTE(review): the element type of the returned vector was stripped.
    std::vector getNSWs(const std::string& text);

private:
    Constants();  // private constructor
    ~Constants() = default;

    // Copying and assignment are forbidden
    Constants(const Constants&) = delete;
    Constants& operator=(const Constants&) = delete;

    void initializeMaps();

    // Full-width <-> half-width mapping tables
    // NOTE(review): key/value types of all eight maps were stripped.
    std::unordered_map f2h_ascii_letters;
    std::unordered_map h2f_ascii_letters;
    std::unordered_map f2h_digits;
    std::unordered_map h2f_digits;
    std::unordered_map f2h_punctuations;
    std::unordered_map h2f_punctuations;
    std::unordered_map f2h_space;
    std::unordered_map h2f_space;

    // Regular expression used for NSW extraction
    std::regex re_nsw;
    bool is_initialized;

    // Constant definitions
    static const std::string ASCII_LETTERS;
    static const std::string DIGITS;
    static const std::string PUNCTUATIONS;

    // NSW regular-expression pattern (which one is declared depends on SUPPORT_UCS4)
#ifdef SUPPORT_UCS4
    static const std::string NSW_PATTERN;
#else
    static const std::string NSW_PATTERN_NO_UCS4;
#endif
};
// --------------------------------------------------------------------------------
// /src/3rd_include/zh_normalization/num.h:
// --------------------------------------------------------------------------------
//
// Created by tao on 25-4-9.
//

#ifndef NUM_H
#define NUM_H

/**
 * Rules to verbalize numbers into Chinese characters.
 * https://zh.wikipedia.org/wiki/中文数字#現代中文
 */

#include 
#include 
#include 
#include 

class Num {
public:
    // Static member variables
    // NOTE(review): key/value types of both maps were stripped.
    static const std::map UNITS;
    static const std::map DIGITS;
    static const std::string COM_QUANTIFIERS;

    // // General number pattern
    static inline auto RE_NUMBER = std::regex(R"((-?)((\d+)(\.\d+)?)|(\.(\d+)))");
    // // Range pattern
    static inline auto RE_RANGE = std::regex(R"(((-?)((\d+)(\.\d+)?)|(\.(\d+)))[-~]((-?)((\d+)(\.\d+)?)|(\.(\d+))))");
    static inline auto RE_FRAC = std::regex(R"((-?)(\d+)/(\d+))");
    // static inline auto RE_PERCENTAGE = std::regex(R"((-?)(\d+(\.\d+))?%)");
    static inline auto RE_PERCENTAGE = std::regex(R"((-?)(\d+)(\.\d+)?%)");
    static inline auto RE_INTEGER = std::regex(R"((-)(\d+))");
    static inline auto RE_DEFAULT_NUM = std::regex(R"(\d{3}\d*)");
    // Pure decimal numbers (both signed and unsigned)
    static inline auto RE_DECIMAL_NUM = std::regex(R"((-?)((\d+)(\.\d+))|(\.(\d+)))");

    // NOTE(review): TMP_QUANTS is built from COM_QUANTIFIERS during static
    // initialisation. COM_QUANTIFIERS is only declared here; if it is defined
    // (and dynamically initialised) in a .cpp file, nothing visible here
    // guarantees it is ready when this inline variable is initialised -- confirm.
    static inline auto TMP_QUANTS = "(\\d+)([多余几\\+])?" + COM_QUANTIFIERS;
    // Positive numbers with quantifiers
    static inline auto RE_POSITIVE_QUANTIFIERS = std::regex(TMP_QUANTS) ;
    static inline auto RE_DIGITAL = std::regex(R"(\d+)");


    // Helper methods
    // NOTE(review): the element type of the returned vector was stripped.
    static std::vector _get_value(const std::string& value_string, bool use_zero = true);
    static std::string verbalize_cardinal(const std::string& value_string);
    static std::string verbalize_digit(const std::string& value_string, bool alt_one = false);
    // Returns the first run of digits found in num_string, or "" when there is none.
    static std::string get_digit(const std::string& num_string) {
        std::smatch match;
        if (std::regex_search(num_string, match, RE_DIGITAL)) {
            return match.str();
        }

        return "";
    }


    // Public methods
    static std::string num2str(const std::string& value_string);
    static std::string replace_frac(const std::smatch& match);
    static std::string replace_percentage(const std::smatch& match);
    static std::string replace_negative_num(const std::smatch& match);
    static std::string replace_default_num(const std::smatch& match);
    static std::string replace_positive_quantifier(const std::smatch& match);
    static std::string replace_number(const std::smatch& match);
    static std::string replace_range(const std::smatch& match);

};


// Example usage
/*
int main() {
    std::string test = "123.45";
    std::cout << ChineseNumberConverter::num2str(test) << std::endl;
    return 0;
}
*/
#endif //NUM_H

// --------------------------------------------------------------------------------
// /src/3rd_include/zh_normalization/phonecode.h:
// --------------------------------------------------------------------------------
#ifndef PHONE_NORMALIZER_HPP
#define PHONE_NORMALIZER_HPP

#include 
#include 

// Static-only helper: phone-number patterns plus the callbacks that rewrite a match.
class PhoneNormalizer {
public:
    // Converts a phone number into its string representation
    static std::string phone2str(const std::string& phone_string, bool mobile = true);

    // Replaces a landline number
    static std::string replace_phone(const std::smatch& match);

    // Replaces a mobile number
    static std::string replace_mobile(const std::smatch& match);


    // Mobile phone numbers
    // Matches: 13812345678, +8613812345678, +86 13812345678
    static inline auto RE_MOBILE_PHONE = std::regex(
        R"((\+?86 ?)?1([38]\d|5[0-35-9]|7[678]|9[89])\d{8})"
    );

    // Telephone numbers
    // Matches: 010-1234567, 0512-1234567, 12345678
    static inline auto RE_TELEPHONE = std::regex(
        R"((0(10|2[1-3]|[3-9]\d{2})-?)?[1-9]\d{6,7})"
    );

    // National uniform numbers (400 numbers)
    // Matches: 400-123-4567, 4001234567
    static inline auto RE_NATIONAL_UNIFORM_NUMBER = std::regex(
        R"(400(-)?\d{3}(-)?\d{4})"
    );

    // Instantiation is forbidden
    PhoneNormalizer() = delete;
};
#endif // PHONE_NORMALIZER_HPP
// --------------------------------------------------------------------------------
// /src/3rd_include/zh_normalization/quantifier.h:
// --------------------------------------------------------------------------------
#ifndef QUANTIFIER_H
#define QUANTIFIER_H

#include 
#include 
#include 

// Temperature and measurement-unit handling.
class Quantifier {
public:
    static std::string replace_temperature(const std::smatch& match);
    static std::string replace_measure(std::string& sentence);
    // Optional sign, a number with optional fraction, then one of the
    // temperature unit spellings.
    static inline auto RE_TEMPERATURE = std::regex(R"((-?)(\d+(\.\d+)?)(°C|℃|度|摄氏度))");
    static const std::regex RE_PUNCTUATION;

private:
    // NOTE(review): key/value types of the map were stripped.
    static const std::map measure_dict;
};

#endif //QUANTIFIER_H
// --------------------------------------------------------------------------------
// /src/cpp-pinyin/CanTone.cpp:
// --------------------------------------------------------------------------------
// NOTE(review): include target stripped; presumably the CanTone header -- confirm.
#include 

namespace Pinyin
{
    // Returns `pinyin` without its last code unit; the two flags are not used.
    // NOTE(review): for an empty `pinyin`, `end() - 1` lies before `begin()`,
    // which is undefined behaviour -- guard against empty input at the call
    // site or here.
    std::u16string CanTone::tone3ToNormal(const std::u16string &pinyin, bool v_to_u, bool neutral_tone_with_five) {
        return {pinyin.begin(), pinyin.end() - 1};
    }
} // Pinyin

-------------------------------------------------------------------------------- /src/cpp-pinyin/DictUtil.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include "cpp-pinyin/DictUtil.h" 10 | #include "cpp-pinyin/U16Str.h" 11 | 12 | namespace Pinyin 13 | { 14 | // Helper function to read and open file 15 | static std::ifstream openFile(const std::filesystem::path &dict_dir) { 16 | #ifdef _WIN32 17 | const std::wstring wdict_dir = dict_dir.wstring(); 18 | return std::ifstream(wdict_dir.c_str()); 19 | #else 20 | return std::ifstream(dict_dir.c_str()); 21 | #endif 22 | } 23 | 24 | // Helper function to trim whitespace from a string 25 | static void trim(std::string &str) { 26 | str.erase(0, str.find_first_not_of(" \t\r\n")); 27 | str.erase(str.find_last_not_of(" \t\r\n") + 1); 28 | } 29 | 30 | // Common function for reading lines and processing key-value pairs 31 | template 32 | static bool processFile(std::ifstream &file, std::unordered_map &resultMap, 33 | const char &sep1, KeyFunc keyProcessor, ValueFunc valueProcessor) { 34 | if (!file.is_open()) { 35 | std::cerr << "Error: Unable to open file" << std::endl; 36 | return false; 37 | } 38 | 39 | std::string line; 40 | while (std::getline(file, line)) { 41 | trim(line); 42 | std::istringstream iss(line); 43 | std::string key, value; 44 | if (std::getline(iss, key, sep1) && std::getline(iss, value)) { 45 | resultMap[keyProcessor(key)] = valueProcessor(value); 46 | } 47 | } 48 | return true; 49 | } 50 | 51 | static std::vector split(const std::string &s, const std::string &delimiter) { 52 | std::vector tokens; 53 | if (delimiter.empty()) { 54 | for (char c : s) { 55 | tokens.emplace_back(1, c); 56 | } 57 | } else { 58 | std::string::size_type start = 0; 59 | std::string::size_type end = s.find(delimiter); 60 | while (end != std::string::npos) { 61 | tokens.push_back(s.substr(start, 
end - start)); 62 | start = end + delimiter.size(); 63 | end = s.find(delimiter, start); 64 | } 65 | tokens.push_back(s.substr(start)); 66 | } 67 | return tokens; 68 | } 69 | 70 | bool loadDict(const std::filesystem::path &dict_dir, 71 | std::unordered_map &resultMap, const char &sep1) { 72 | std::ifstream file = openFile(dict_dir); 73 | return processFile(file, resultMap, sep1, 74 | [](const std::string &key) { return utf8strToU16str(key)[0]; }, 75 | [](const std::string &value) { return utf8strToU16str(value)[0]; }); 76 | } 77 | 78 | bool loadDict(const std::filesystem::path &dict_dir, 79 | std::unordered_map &resultMap, const char &sep1) { 80 | std::ifstream file = openFile(dict_dir); 81 | return processFile(file, resultMap, sep1, 82 | [](const std::string &key) { return utf8strToU16str(key)[0]; }, 83 | [](const std::string &value) { return utf8strToU16str(value); }); 84 | } 85 | 86 | bool loadDict(const std::filesystem::path &dict_dir, 87 | std::unordered_map> &resultMap, const char &sep1, 88 | const std::string &sep2) { 89 | std::ifstream file = openFile(dict_dir); 90 | return processFile(file, resultMap, sep1, 91 | [](const std::string &key) { return utf8strToU16str(key)[0]; }, 92 | [&sep2](const std::string &value) 93 | { 94 | std::vector u8strlist; 95 | for (const auto &str : split(value, sep2)) { 96 | if (!str.empty()) 97 | u8strlist.emplace_back(utf8strToU16str(str)); 98 | } 99 | return u8strlist; 100 | }); 101 | } 102 | 103 | bool loadDict(const std::filesystem::path &dict_dir, 104 | std::unordered_map> &resultMap, const char &sep1, 105 | const std::string &sep2) { 106 | std::ifstream file = openFile(dict_dir); 107 | return processFile(file, resultMap, sep1, 108 | [](const std::string &key) { return utf8strToU16str(key); }, 109 | [&sep2](const std::string &value) 110 | { 111 | std::vector u8strlist; 112 | for (const auto &str : split(value, sep2)) { 113 | if (!str.empty()) 114 | u8strlist.emplace_back(utf8strToU16str(str)); 115 | } 116 | return 
u8strlist; 117 | }); 118 | } 119 | 120 | bool loadAdditionalDict(const std::filesystem::path &dict_dir, 121 | std::unordered_map> &resultMap, 122 | const char &sep1, 123 | const std::string &sep2, 124 | const std::function & 125 | converterForDefaultPinyin) { 126 | std::ifstream file = openFile(dict_dir); 127 | return processFile(file, resultMap, sep1, 128 | [](const std::string &key) { return utf8strToU16str(key); }, 129 | [&sep2, &converterForDefaultPinyin](const std::string &value) 130 | { 131 | std::vector u8strlist; 132 | for (const auto &str : split(value, sep2)) { 133 | if (!str.empty()) 134 | u8strlist.emplace_back(converterForDefaultPinyin(utf8strToU16str(str))); 135 | } 136 | return u8strlist; 137 | }); 138 | } 139 | } // namespace Pinyin 140 | -------------------------------------------------------------------------------- /src/cpp-pinyin/G2pglobal.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | #include 5 | 6 | namespace Pinyin 7 | { 8 | class G2pGlobal { 9 | public: 10 | std::filesystem::path path; 11 | }; 12 | 13 | auto m_global = std::make_unique(); 14 | 15 | std::filesystem::path dictionaryPath() { 16 | return m_global->path; 17 | } 18 | 19 | void setDictionaryPath(const std::filesystem::path &dir) { 20 | m_global->path = dir; 21 | } 22 | 23 | bool isLetter(const char16_t &c) { 24 | return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'); 25 | } 26 | 27 | bool isHanzi(const char16_t &c) { 28 | return c >= 0x4e00 && c <= 0x9fa5; 29 | } 30 | 31 | bool isKana(const char16_t &c) { 32 | return (c >= 0x3040 && c <= 0x309F) || (c >= 0x30A0 && c <= 0x30FF); 33 | } 34 | 35 | bool isDigit(const char16_t &c) { 36 | return c >= '0' && c <= '9'; 37 | } 38 | 39 | bool isSpace(const char16_t &c) { 40 | return c == ' '; 41 | } 42 | 43 | bool isSpecialKana(const char16_t &c) { 44 | static const std::unordered_set specialKana = { 45 | u'ャ', u'ュ', u'ョ', u'ゃ', u'ゅ', u'ょ', 46 | u'ァ', u'ィ', u'ゥ', 
u'ェ', u'ォ', u'ぁ', u'ぃ', u'ぅ', u'ぇ', u'ぉ' 47 | }; 48 | return specialKana.find(c) != specialKana.end(); 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /src/cpp-pinyin/Jyutping.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | namespace Pinyin 4 | { 5 | PinyinResVector Jyutping::hanziToPinyin(const std::string &hans, CanTone::Style style, Error error, 6 | bool candidates) const { 7 | /* 8 | @param hans : raw utf-8 std::string. 9 | @param ManTone::Style : Preserve the pinyin tone. 10 | @param errorType : Ignore words that have failed conversion. Default: Keep original. 11 | @param candidates : Return all possible pinyin candidates. Default: true. 12 | @return PinyinResVector. 13 | */ 14 | return ChineseG2p::hanziToPinyin(hans, static_cast(style), error, candidates, false, false); 15 | } 16 | 17 | PinyinResVector Jyutping::hanziToPinyin(const std::vector &hans, CanTone::Style style, 18 | Error error, bool candidates) const { 19 | /* 20 | @param hans : raw utf-8 std::string vector, each element of the vector is a character. 21 | @param ManTone::Style : Preserve the pinyin tone. 22 | @param errorType : Ignore words that have failed conversion. Default: Keep original. 23 | @param candidates : Return all possible pinyin candidates. Default: true. 24 | @return PinyinResVector. 25 | */ 26 | return ChineseG2p::hanziToPinyin(hans, static_cast(style), error, candidates, false, false); 27 | } 28 | 29 | // Convert to Simplified Chinese. 
utf-8 std::string 30 | std::vector Jyutping::getDefaultPinyin(const std::string &hanzi, CanTone::Style style) const { 31 | return ChineseG2p::getDefaultPinyin(hanzi, static_cast(style), false, false); 32 | } 33 | } // Pinyin 34 | -------------------------------------------------------------------------------- /src/cpp-pinyin/ManTone.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include 5 | #include 6 | 7 | namespace Pinyin 8 | { 9 | // 映射表,音调符号 -> (无音调元音, 对应的调号) 10 | static const std::unordered_map> toneToNum = { 11 | {u'ā', {u'a', u'1'}}, {u'á', {u'a', u'2'}}, {u'ǎ', {u'a', u'3'}}, {u'à', {u'a', u'4'}}, 12 | {u'ō', {u'o', u'1'}}, {u'ó', {u'o', u'2'}}, {u'ǒ', {u'o', u'3'}}, {u'ò', {u'o', u'4'}}, 13 | {u'ē', {u'e', u'1'}}, {u'é', {u'e', u'2'}}, {u'ě', {u'e', u'3'}}, {u'è', {u'e', u'4'}}, 14 | {u'ī', {u'i', u'1'}}, {u'í', {u'i', u'2'}}, {u'ǐ', {u'i', u'3'}}, {u'ì', {u'i', u'4'}}, 15 | {u'ū', {u'u', u'1'}}, {u'ú', {u'u', u'2'}}, {u'ǔ', {u'u', u'3'}}, {u'ù', {u'u', u'4'}}, 16 | {u'ǖ', {u'v', u'1'}}, {u'ǘ', {u'v', u'2'}}, {u'ǚ', {u'v', u'3'}}, {u'ǜ', {u'v', u'4'}}, 17 | {u'ń', {u'n', u'2'}}, {u'ň', {u'n', u'3'}}, {u'ǹ', {u'n', u'4'}}, 18 | {u'ḿ', {u'm', u'2'}} 19 | }; 20 | 21 | std::u16string ManTone::toneToNormal(const std::u16string &pinyin, bool v_to_u, bool neutral_tone_with_five) { 22 | std::u16string result; 23 | result.reserve(pinyin.size()); 24 | 25 | for (const char16_t &ch : pinyin) { 26 | if (isLetter(ch)) { 27 | result += ch; 28 | } else { 29 | const auto &it = toneToNum.find(ch); 30 | result += it != toneToNum.end() ? 
it->second.first : ch; 31 | } 32 | } 33 | 34 | if (!v_to_u) 35 | std::replace(result.begin(), result.end(), u'ü', u'v'); 36 | 37 | return result; 38 | } 39 | 40 | std::u16string ManTone::toneToTone(const std::u16string &pinyin, bool v_to_u, bool neutral_tone_with_five) { 41 | if (v_to_u) 42 | return pinyin; 43 | 44 | std::u16string result; 45 | result.reserve(pinyin.size()); 46 | 47 | for (const char16_t &ch : pinyin) { 48 | if (isLetter(ch)) { 49 | result += ch; 50 | } else { 51 | result += ch == u'ü' ? u'v' : ch; 52 | } 53 | } 54 | 55 | return result; 56 | } 57 | 58 | std::u16string ManTone::toneToTone2(const std::u16string &pinyin, bool v_to_u, bool neutral_tone_with_five) { 59 | std::u16string result; 60 | result.reserve(pinyin.size() + 1); 61 | 62 | for (const char16_t &ch : pinyin) { 63 | if (isLetter(ch)) { 64 | result += ch; 65 | } else { 66 | const auto &it = toneToNum.find(ch); 67 | if (it != toneToNum.end()) { 68 | result += it->second.first; 69 | const char16_t &toneNumber = it->second.second; 70 | if (!(!neutral_tone_with_five && toneNumber == u'5')) 71 | result += toneNumber; 72 | } else { 73 | if (!v_to_u && ch == u'ü') { 74 | result += u'v'; 75 | continue; 76 | } 77 | result += ch; 78 | } 79 | } 80 | } 81 | return result; 82 | } 83 | 84 | 85 | std::u16string ManTone::toneToTone3(const std::u16string &pinyin, bool v_to_u, bool neutral_tone_with_five) { 86 | std::u16string result; 87 | result.reserve(pinyin.size() + 1); 88 | 89 | char16_t toneNumber = u'5'; 90 | 91 | for (const char16_t &ch : pinyin) { 92 | if (isLetter(ch)) { 93 | result += ch; 94 | } else { 95 | const auto &it = toneToNum.find(ch); 96 | if (it != toneToNum.end()) { 97 | result += it->second.first; 98 | toneNumber = it->second.second; 99 | } else { 100 | if (!v_to_u && ch == u'ü') { 101 | result += u'v'; 102 | continue; 103 | } 104 | result += ch; 105 | } 106 | } 107 | } 108 | 109 | result += toneNumber; 110 | 111 | if (!neutral_tone_with_five && toneNumber == u'5') 112 | result = 
result.substr(0, result.length() - 1); 113 | return result; 114 | } 115 | } // Pinyin 116 | -------------------------------------------------------------------------------- /src/cpp-pinyin/ManToneUtil.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "cpp-pinyin/ManToneUtil.h" 6 | 7 | namespace Pinyin 8 | { 9 | // 定义 phonetic_symbol_reverse 映射表 10 | static const std::unordered_map phoneticSymbolReverse = { 11 | {u"a1", u'ā'}, {u"a2", u'á'}, {u"a3", u'ǎ'}, {u"a4", u'à'}, 12 | {u"e1", u'ē'}, {u"e2", u'é'}, {u"e3", u'ě'}, {u"e4", u'è'}, 13 | {u"i1", u'ī'}, {u"i2", u'í'}, {u"i3", u'ǐ'}, {u"i4", u'ì'}, 14 | {u"o1", u'ō'}, {u"o2", u'ó'}, {u"o3", u'ǒ'}, {u"o4", u'ò'}, 15 | {u"u1", u'ū'}, {u"u2", u'ú'}, {u"u3", u'ǔ'}, {u"u4", u'ù'}, 16 | {u"v1", u'ǖ'}, {u"v2", u'ǘ'}, {u"v3", u'ǚ'}, {u"v4", u'ǜ'}, 17 | }; 18 | 19 | // https://github.com/mozillazg/python-pinyin/blob/master/pypinyin/style/_tone_rule.py 20 | int rightMarkIndex(const std::u16string &pinyin_no_tone) { 21 | // 'iou', 'uei', 'uen': 根据还原前的拼音进行标记 22 | if (pinyin_no_tone.find(u"iou") != std::string::npos) { 23 | return pinyin_no_tone.find('u'); 24 | } 25 | if (pinyin_no_tone.find(u"uei") != std::string::npos) { 26 | return pinyin_no_tone.find('i'); 27 | } 28 | if (pinyin_no_tone.find(u"uen") != std::string::npos) { 29 | return pinyin_no_tone.find('u'); 30 | } 31 | 32 | // 有 'a' 不放过, 没 'a' 找 'o'、'e' 33 | static const std::vector vowels = {u'a', u'o', u'e'}; 34 | for (const char16_t c : vowels) { 35 | const auto pos = pinyin_no_tone.find(c); 36 | if (pos != std::u16string::npos) { 37 | return pos; 38 | } 39 | } 40 | 41 | // 'i'、'u' 若是连在一起,谁在后面就标谁 42 | static const std::vector combos = {u"iu", u"ui"}; 43 | for (const std::u16string &combo : combos) { 44 | const auto pos = pinyin_no_tone.find(combo); 45 | if (pos != std::u16string::npos) { 46 | return pos + 1; 47 | } 48 | } 49 | 50 | // 'i'、'u'、'v'、'ü' 51 | static const 
std::vector other_vowels = {u'i', u'u', u'v', u'ü'}; 52 | for (const char16_t c : other_vowels) { 53 | const auto pos = pinyin_no_tone.find(c); 54 | if (pos != std::u16string::npos) { 55 | return pos; 56 | } 57 | } 58 | 59 | // 'n', 'm', 'ê' 60 | static const std::vector final_chars = {u'n', u'm', u'ê'}; 61 | for (const char16_t c : final_chars) { 62 | const auto pos = pinyin_no_tone.find(c); 63 | if (pos != std::u16string::npos) { 64 | return pos; 65 | } 66 | } 67 | 68 | // 如果没有找到合适的位置,则返回-1表示没有可以标记的位置 69 | return -1; 70 | } 71 | 72 | static bool isToneNumber(const char16_t c) { 73 | return c >= u'1' && c <= u'5'; 74 | } 75 | 76 | static bool isPhoneticSymbol(const char16_t c) { 77 | return std::u16string(u"aeiouüv").find(c) != std::u16string::npos; 78 | } 79 | 80 | static std::u16string toneToTone(const std::u16string &tone2) { 81 | // 替换 "ü" 为 "v" 并去掉 5 和 0 82 | std::u16string string; 83 | for (const char16_t c : tone2) 84 | string += c == u'ü' ? u'v' : c; 85 | 86 | string.erase(std::remove(string.begin(), string.end(), u'5'), string.end()); 87 | string.erase(std::remove(string.begin(), string.end(), u'0'), string.end()); 88 | 89 | std::vector result; 90 | 91 | int pos = 0; 92 | while (pos < string.size()) { 93 | const char16_t ¤tChar = string[pos]; 94 | if (isPhoneticSymbol(currentChar)) { 95 | if (pos + 1 < string.length() && isToneNumber(string[pos + 1])) { 96 | const auto str = string.substr(pos, 2); 97 | const auto it = phoneticSymbolReverse.find(str); 98 | if (it != phoneticSymbolReverse.end()) { 99 | result.emplace_back(it->second); 100 | pos += 2; 101 | } else { 102 | result.emplace_back(currentChar); 103 | pos++; 104 | } 105 | } else { 106 | result.emplace_back(currentChar); 107 | pos++; 108 | } 109 | } else { 110 | result.emplace_back(currentChar); 111 | pos++; 112 | } 113 | } 114 | 115 | std::u16string result_str; 116 | for (const char16_t c : result) 117 | result_str += c == u'ü' ? 
u'v' : c; 118 | 119 | return result_str; 120 | } 121 | 122 | static std::u16string tone3ToTone2(const std::u16string &pinyin) { 123 | const auto no_number_tone3 = pinyin.size() > 1 && isToneNumber(pinyin.back()) 124 | ? pinyin.substr(0, pinyin.size() - 1) 125 | : pinyin; 126 | auto mark_index = rightMarkIndex(no_number_tone3); 127 | if (mark_index == -1) 128 | mark_index = no_number_tone3.size() - 1; 129 | 130 | const std::u16string before = no_number_tone3.substr(0, mark_index + 1); 131 | const std::u16string after = no_number_tone3.substr(mark_index + 1); 132 | const std::u16string number = pinyin.substr(pinyin.size() - 1); 133 | 134 | return before + number + after; 135 | } 136 | 137 | std::u16string tone3ToTone(const std::u16string &pinyin) { 138 | const auto tone2 = tone3ToTone2(pinyin); 139 | return toneToTone(tone2); 140 | } 141 | 142 | 143 | } // Pinyin 144 | -------------------------------------------------------------------------------- /src/cpp-pinyin/Pinyin.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | namespace Pinyin 4 | { 5 | PinyinResVector Pinyin::hanziToPinyin(const std::string &hans, ManTone::Style style, Error error, bool candidates, 6 | bool v_to_u, bool neutral_tone_with_five) const { 7 | /* 8 | @param hans : raw utf-8 std::string. 9 | @param ManTone::Style : Preserve the pinyin tone. 10 | @param errorType : Ignore words that have failed conversion. Default: Keep original. 11 | @param candidates : Return all possible pinyin candidates. Default: true. 12 | @param v_to_u : Convert v to ü. Default: false. 13 | @param neutral_tone_with_five : Use 5 as neutral tone. Default: false. 14 | @return PinyinResVector. 
15 | */ 16 | return ChineseG2p::hanziToPinyin(hans, static_cast(style), error, candidates, v_to_u, 17 | neutral_tone_with_five); 18 | } 19 | 20 | PinyinResVector Pinyin::hanziToPinyin(const std::vector &hans, ManTone::Style style, 21 | Error error, bool candidates, bool v_to_u, 22 | bool neutral_tone_with_five) const { 23 | /* 24 | @param hans : raw utf-8 std::string vector, each element of the vector is a character. 25 | @param ManTone::Style : Preserve the pinyin tone. 26 | @param errorType : Ignore words that have failed conversion. Default: Keep original. 27 | @param candidates : Return all possible pinyin candidates. Default: true. 28 | @param v_to_u : Convert v to ü. Default: false. 29 | @param neutral_tone_with_five : Use 5 as neutral tone. Default: false. 30 | @return PinyinResVector. 31 | */ 32 | return ChineseG2p::hanziToPinyin(hans, static_cast(style), error, candidates, v_to_u, 33 | neutral_tone_with_five); 34 | } 35 | 36 | // Convert to Simplified Chinese. utf-8 std::string 37 | std::vector Pinyin::getDefaultPinyin(const std::string &hanzi, ManTone::Style style, 38 | bool v_to_u, bool neutral_tone_with_five) const { 39 | return ChineseG2p::getDefaultPinyin(hanzi, static_cast(style), v_to_u, neutral_tone_with_five); 40 | } 41 | 42 | } 43 | -------------------------------------------------------------------------------- /src/cpp-pinyin/PinyinRes.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | namespace Pinyin 4 | { 5 | // Convert PinyinResVector to std::vector 6 | std::vector PinyinResVector::toStdVector() const { 7 | std::vector result; 8 | result.reserve(this->size()); 9 | for (const auto &res : *this) { 10 | result.emplace_back(res.error ? 
res.hanzi : res.pinyin); 11 | } 12 | return result; 13 | } 14 | 15 | // Convert PinyinResVector to std::string with delimiter 16 | std::string PinyinResVector::toStdStr(const std::string &delimiter) const { 17 | std::string result; 18 | bool first = true; 19 | 20 | for (const auto &res : *this) { 21 | if (!first) { 22 | result += delimiter; 23 | } 24 | result += res.error ? res.hanzi : res.pinyin; 25 | first = false; 26 | } 27 | 28 | return result; 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /src/cpp-pinyin/ToneConverter.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | namespace Pinyin 4 | { 5 | std::u16string ToneConverter::convert(std::u16string str, int style, bool v_to_u, 6 | bool neutral_tone_with_five) const { 7 | const auto it = m_converts.find(style); 8 | 9 | if (it == m_converts.end()) { 10 | return str; 11 | } 12 | return it->second(str, v_to_u, neutral_tone_with_five); 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /src/cpp-pinyin/U16Str.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include 5 | #include 6 | 7 | namespace Pinyin 8 | { 9 | std::string u16strToUtf8str(const char16_t &ch16) { 10 | std::string utf8str; 11 | utf8str.reserve(3); // UTF-16 characters could expand into 3 bytes in UTF-8 12 | if (ch16 <= 0x7F) { 13 | // 1-byte UTF-8 14 | utf8str.push_back(static_cast(ch16)); 15 | } else if (ch16 <= 0x7FF) { 16 | // 2-byte UTF-8 17 | utf8str.push_back(static_cast(0xC0 | ((ch16 >> 6) & 0x1F))); 18 | utf8str.push_back(static_cast(0x80 | (ch16 & 0x3F))); 19 | } else { 20 | // 3-byte UTF-8 21 | utf8str.push_back(static_cast(0xE0 | ((ch16 >> 12) & 0x0F))); 22 | utf8str.push_back(static_cast(0x80 | ((ch16 >> 6) & 0x3F))); 23 | utf8str.push_back(static_cast(0x80 | (ch16 & 0x3F))); 24 | } 25 | return utf8str; 26 | } 27 
| 28 | std::string u16strToUtf8str(const std::u16string &u16str) { 29 | std::string utf8str; 30 | utf8str.reserve(u16str.size() * 3); // UTF-16 characters could expand into 3 bytes in UTF-8 31 | 32 | for (size_t i = 0; i < u16str.size(); ++i) { 33 | const uint16_t ch = u16str[i]; 34 | 35 | if (ch < 0x80) { 36 | // 1-byte sequence 37 | utf8str.push_back(static_cast(ch)); 38 | } else if (ch < 0x800) { 39 | // 2-byte sequence 40 | utf8str.push_back(static_cast(0xC0 | (ch >> 6))); 41 | utf8str.push_back(static_cast(0x80 | (ch & 0x3F))); 42 | } else if (ch >= 0xD800 && ch <= 0xDBFF) { 43 | // High surrogate (part of a 4-byte UTF-16 character) 44 | if (i + 1 >= u16str.size()) 45 | throw std::invalid_argument("Invalid UTF-16 surrogate pair"); 46 | 47 | const uint16_t low = u16str[i + 1]; 48 | if (low < 0xDC00 || low > 0xDFFF) 49 | throw std::invalid_argument("Invalid UTF-16 surrogate pair"); 50 | 51 | const uint32_t codepoint = ((ch - 0xD800) << 10) + (low - 0xDC00) + 0x10000; 52 | utf8str.push_back(static_cast(0xF0 | (codepoint >> 18))); 53 | utf8str.push_back(static_cast(0x80 | ((codepoint >> 12) & 0x3F))); 54 | utf8str.push_back(static_cast(0x80 | ((codepoint >> 6) & 0x3F))); 55 | utf8str.push_back(static_cast(0x80 | (codepoint & 0x3F))); 56 | ++i; // Skip next low surrogate 57 | } else { 58 | // 3-byte sequence 59 | utf8str.push_back(static_cast(0xE0 | (ch >> 12))); 60 | utf8str.push_back(static_cast(0x80 | ((ch >> 6) & 0x3F))); 61 | utf8str.push_back(static_cast(0x80 | (ch & 0x3F))); 62 | } 63 | } 64 | 65 | return utf8str; 66 | } 67 | 68 | std::u16string utf8strToU16str(const std::string &utf8str) { 69 | std::u16string u16str; 70 | u16str.reserve(utf8str.size()); 71 | 72 | size_t i = 0; 73 | while (i < utf8str.size()) { 74 | const unsigned char c = utf8str[i]; 75 | 76 | if (c < 0x80) { 77 | // 1-byte sequence 78 | u16str.push_back(c); 79 | ++i; 80 | } else if (c < 0xE0) { 81 | // 2-byte sequence 82 | if (i + 1 >= utf8str.size()) 83 | throw 
std::invalid_argument("Invalid UTF-8 sequence"); 84 | u16str.push_back(((c & 0x1F) << 6) | (utf8str[i + 1] & 0x3F)); 85 | i += 2; 86 | } else if (c < 0xF0) { 87 | // 3-byte sequence 88 | if (i + 2 >= utf8str.size()) 89 | throw std::invalid_argument("Invalid UTF-8 sequence"); 90 | u16str.push_back(((c & 0x0F) << 12) | ((utf8str[i + 1] & 0x3F) << 6) | (utf8str[i + 2] & 0x3F)); 91 | i += 3; 92 | } else { 93 | // 4-byte sequence (assuming UTF-32 character, but storing in UTF-16) 94 | if (i + 3 >= utf8str.size()) 95 | throw std::invalid_argument("Invalid UTF-8 sequence"); 96 | uint32_t codepoint = ((c & 0x07) << 18) | ((utf8str[i + 1] & 0x3F) << 12) | 97 | ((utf8str[i + 2] & 0x3F) << 6) | (utf8str[i + 3] & 0x3F); 98 | codepoint -= 0x10000; 99 | u16str.push_back(0xD800 | (codepoint >> 10)); // High surrogate 100 | u16str.push_back(0xDC00 | (codepoint & 0x3FF)); // Low surrogate 101 | i += 4; 102 | } 103 | } 104 | 105 | return u16str; 106 | } 107 | } 108 | -------------------------------------------------------------------------------- /src/tokenizer.hpp: -------------------------------------------------------------------------------- 1 | // 2 | // tokenizer.hpp 3 | // 4 | // Created by MNN on 2023/09/25. 
5 | // ZhaodeWang 6 | // 7 | 8 | #ifndef TOKENIZER_hpp 9 | #define TOKENIZER_hpp 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | // #include 17 | #include 18 | class string_view_ { 19 | public: 20 | string_view_() : data_(nullptr), size_(0) {} 21 | string_view_(const char* data) : data_(data), size_(std::strlen(data)) {} 22 | string_view_(const char* data, std::size_t size) : data_(data), size_(size) {} 23 | string_view_(const std::string& str) : data_(str.data()), size_(str.size()) {} 24 | constexpr string_view_(const string_view_&) noexcept = default; 25 | string_view_& operator=(const string_view_&) noexcept = default; 26 | const char& operator[](size_t pos) const { return data_[pos]; } 27 | constexpr const char* data() const noexcept { return data_; } 28 | constexpr std::size_t size() const noexcept { return size_; } 29 | constexpr bool empty() const { return size_ == 0; } 30 | std::string to_string() const { return std::string(data_, size_); } 31 | bool operator==(const string_view_& other) const noexcept { 32 | return size_ == other.size_ && strncmp(data_, other.data_, size_) == 0; 33 | } 34 | void remove_prefix(size_t n) { 35 | if (n < size_) { 36 | data_ += n; 37 | size_ -= n; 38 | } else { 39 | data_ = ""; 40 | size_ = 0; 41 | } 42 | } 43 | private: 44 | const char* data_; 45 | std::size_t size_ = 0; 46 | }; 47 | // std::string_view impl in c++11 end 48 | 49 | namespace std { 50 | template<> 51 | class hash { 52 | public: 53 | size_t operator()(const string_view_& sv) const { 54 | size_t result = 0; 55 | for (size_t i = 0; i < sv.size(); ++i) { 56 | result = (result * 31) + static_cast(sv[i]); 57 | } 58 | return result; 59 | } 60 | }; 61 | } 62 | namespace MNN { 63 | namespace Transformer { 64 | // std::string_view impl in c++11 start 65 | 66 | class Tokenizer { 67 | public: 68 | static constexpr int MAGIC_NUMBER = 430; 69 | enum TokenizerType { 70 | SENTENCEPIECE = 0, 71 | TIKTOIKEN = 1, 72 | BERT = 2, 73 | HUGGINGFACE = 3 
74 | }; 75 | Tokenizer() = default; 76 | virtual ~Tokenizer() = default; 77 | static Tokenizer* createTokenizer(const std::string& filename); 78 | bool is_stop(int token); 79 | bool is_special(int token); 80 | std::vector encode(const std::string& str); 81 | virtual std::string decode(int id) = 0; 82 | protected: 83 | virtual void load_special(std::ifstream& file); 84 | virtual bool load_vocab(std::ifstream& file) = 0; 85 | virtual void encode(const std::string& str, std::vector& ids) = 0; 86 | std::vector special_tokens_; 87 | std::vector stop_tokens_; 88 | std::vector prefix_tokens_; 89 | private: 90 | std::string mTemplate; 91 | }; 92 | 93 | class Sentencepiece : public Tokenizer { 94 | public: 95 | Sentencepiece() = default; 96 | virtual std::string decode(int id) override; 97 | protected: 98 | virtual bool load_vocab(std::ifstream& file) override; 99 | virtual void encode(const std::string& str, std::vector& ids) override; 100 | private: 101 | enum ModelType { 102 | UNIGRAM = 1, 103 | BPE = 2, 104 | WORD = 3, 105 | CHAR = 4 106 | }; 107 | enum PieceType { 108 | NORMAL = 1, 109 | UNKNOWN = 2, 110 | CONTROL = 3, 111 | USER_DEFINED = 4, 112 | UNUSED = 5, 113 | BYTE = 6 114 | }; 115 | struct SentencePiece { 116 | std::string piece; 117 | float score; 118 | PieceType type = PieceType::NORMAL; 119 | SentencePiece() {} 120 | SentencePiece(const std::string& p, float s, PieceType t) : piece(p), score(s), type(t) {} 121 | }; 122 | using EncodeResult = std::vector>; 123 | private: 124 | // model train type 125 | ModelType type_ = BPE; 126 | // byte fall back enable 127 | bool byte_fall_back_ = true; 128 | // unknown id. 
129 | int unk_id_ = 0; 130 | // pieces from model 131 | std::vector sentence_pieces_; 132 | // piece -> id map for normal pieces 133 | std::unordered_map pieces_; 134 | // piece -> id map for control, unknown, and byte pieces 135 | std::unordered_map reserved_id_map_; 136 | private: 137 | float get_score(int id) const; 138 | bool is_unused(int id) const; 139 | bool is_control(int id) const; 140 | int piece_to_id(const std::string& w) const; 141 | std::string byte_to_piece(unsigned char c) const; 142 | EncodeResult bpe_encode(string_view_ str, float alpha = 0.f); 143 | }; 144 | 145 | class Tiktoken : public Tokenizer { 146 | public: 147 | Tiktoken() = default; 148 | virtual std::string decode(int id) override; 149 | protected: 150 | virtual bool load_vocab(std::ifstream& file) override; 151 | virtual void encode(const std::string& str, std::vector& ids) override; 152 | std::unordered_map encoder_; 153 | std::vector decoder_; 154 | }; 155 | 156 | class BertTokenizer : public Tiktoken { 157 | public: 158 | BertTokenizer() = default; 159 | protected: 160 | virtual void encode(const std::string& str, std::vector& ids) override; 161 | private: 162 | std::vector word_piece(const std::string& token); 163 | }; 164 | 165 | class HuggingfaceTokenizer : public Tokenizer { 166 | struct hash_pair_wstring { 167 | size_t operator()(const std::pair& p) const { 168 | auto hash1 = std::hash{}(p.first); 169 | auto hash2 = std::hash{}(p.second); 170 | // If hash1 == hash2, their XOR is zero. 171 | return (hash1 != hash2) ? 
hash1 ^ hash2 : hash1; 172 | } 173 | }; 174 | using BPERanks = std::unordered_map, int, hash_pair_wstring>; 175 | public: 176 | HuggingfaceTokenizer() = default; 177 | virtual std::string decode(int id) override; 178 | protected: 179 | virtual bool load_vocab(std::ifstream& file) override; 180 | virtual void encode(const std::string& str, std::vector& ids) override; 181 | private: 182 | void bpe(const std::wstring& token, const BPERanks& bpe_ranks, std::vector* result); 183 | BPERanks bpe_ranks_; 184 | std::unordered_map b2u_; 185 | std::unordered_map u2b_; 186 | std::unordered_map encoder_; 187 | std::vector decoder_; 188 | }; 189 | }; 190 | }; 191 | 192 | #endif // TOKENIZER_hpp 193 | -------------------------------------------------------------------------------- /src/tts.cpp: -------------------------------------------------------------------------------- 1 | // 2 | // tts.cpp 3 | // 4 | // Created by MNN on 2025/2/20. 5 | // ZhaodeWang 6 | // 7 | 8 | #include "tts.hpp" 9 | #include "ttsconfig.hpp" 10 | #include "tokenizer.hpp" 11 | #include "zhg2p.hpp" 12 | #include