├── .gitmodules
├── CMakeLists.txt
├── README.md
├── export.py
├── include
    └── tts.hpp
├── resource
    ├── jieba
    │   ├── hmm_model.utf8
    │   ├── idf.utf8
    │   ├── jieba.dict.utf8
    │   ├── pos_dict
    │   │   ├── char_state_tab.utf8
    │   │   ├── prob_emit.utf8
    │   │   ├── prob_start.utf8
    │   │   └── prob_trans.utf8
    │   ├── stop_words.utf8
    │   └── user.dict.utf8
    └── pinyin
    │   └── mandarin
    │       ├── phrases_dict.txt
    │       ├── phrases_map.txt
    │       ├── trans_word.txt
    │       ├── user_dict.txt
    │       └── word.txt
└── src
    ├── 3rd_include
        ├── cpp-pinyin
        │   ├── CanTone.h
        │   ├── ChineseG2p.h
        │   ├── ChineseG2p_p.h
        │   ├── DictUtil.h
        │   ├── G2pglobal.h
        │   ├── Jyutping.h
        │   ├── ManTone.h
        │   ├── ManToneUtil.h
        │   ├── Pinyin.h
        │   ├── PinyinGlobal.h
        │   ├── PinyinRes.h
        │   ├── ToFinal.hpp
        │   ├── ToneConverter.h
        │   └── U16Str.h
        ├── cppjieba
        │   ├── DictTrie.hpp
        │   ├── FullSegment.hpp
        │   ├── HMMModel.hpp
        │   ├── HMMSegment.hpp
        │   ├── Jieba.hpp
        │   ├── KeywordExtractor.hpp
        │   ├── MPSegment.hpp
        │   ├── MixSegment.hpp
        │   ├── PosTagger.hpp
        │   ├── PreFilter.hpp
        │   ├── QuerySegment.hpp
        │   ├── SegmentBase.hpp
        │   ├── SegmentTagged.hpp
        │   ├── TextRankExtractor.hpp
        │   ├── Trie.hpp
        │   └── Unicode.hpp
        ├── limonp
        │   ├── ArgvContext.hpp
        │   ├── Closure.hpp
        │   ├── Colors.hpp
        │   ├── Condition.hpp
        │   ├── Config.hpp
        │   ├── ForcePublic.hpp
        │   ├── LocalVector.hpp
        │   ├── Logging.hpp
        │   ├── NonCopyable.hpp
        │   ├── StdExtension.hpp
        │   └── StringUtil.hpp
        └── zh_normalization
        │   ├── TextNormalizer.h
        │   ├── chinese_converter.h
        │   ├── chronology.h
        │   ├── constants.h
        │   ├── num.h
        │   ├── phonecode.h
        │   └── quantifier.h
    ├── cpp-pinyin
        ├── CanTone.cpp
        ├── ChineseG2p.cpp
        ├── DictUtil.cpp
        ├── G2pglobal.cpp
        ├── Jyutping.cpp
        ├── ManTone.cpp
        ├── ManToneUtil.cpp
        ├── Pinyin.cpp
        ├── PinyinRes.cpp
        ├── ToneConverter.cpp
        └── U16Str.cpp
    ├── tokenizer.cpp
    ├── tokenizer.hpp
    ├── tts.cpp
    ├── tts_demo.cpp
    ├── ttsconfig.hpp
    ├── zh_normalization
        ├── TextNormalizer.cpp
        ├── chinese_converter.cpp
        ├── chronology.cpp
        ├── constants.cpp
        ├── num.cpp
        ├── phonecode.cpp
        └── quantifier.cpp
    ├── zhg2p.cpp
    └── zhg2p.hpp


/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "MNN"]
2 | 	path = MNN
3 | 	url = https://github.com/alibaba/MNN.git
4 | 


--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 3.5)
 2 | project(mnn-tts)
 3 | 
 4 | if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
 5 |     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -stdlib=libc++")
 6 | endif()
 7 | 
 8 | if (MSVC)
 9 |     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /std:c++17")
10 |     add_compile_options("$<$<CXX_COMPILER_ID:MSVC>:/source-charset:utf-8>")
11 | else()
12 |     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17")
13 | endif()
14 | 
15 | set(MNN_LOW_MEMORY ON CACHE BOOL "Open MNN_LOW_MEMORY" FORCE)
16 | set(MNN_SUPPORT_TRANSFORMER_FUSE ON CACHE BOOL "Open MNN_SUPPORT_TRANSFORMER_FUSE" FORCE)
17 | set(MNN_BUILD_AUDIO ON CACHE BOOL "Open MNN_BUILD_AUDIO" FORCE)
18 | add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/MNN)
19 | 
20 | # include dir
21 | include_directories(${CMAKE_CURRENT_LIST_DIR}/include/
22 |         ${CMAKE_CURRENT_LIST_DIR}/src/3rd_include/
23 |         ${CMAKE_CURRENT_LIST_DIR}/MNN/include/
24 |         ${CMAKE_CURRENT_LIST_DIR}/MNN/3rd_party/
25 |         ${CMAKE_CURRENT_LIST_DIR}/MNN/tools/audio/include/
26 | )
27 | 
28 | # source files
29 | FILE(GLOB SRCS ${CMAKE_CURRENT_LIST_DIR}/src/*.cpp
30 |         ${CMAKE_CURRENT_LIST_DIR}/src/cpp-pinyin/*.cpp
31 |         ${CMAKE_CURRENT_LIST_DIR}/src/zh_normalization/*.cpp)
32 | add_executable(tts_demo ${SRCS})
33 | 
34 | target_link_libraries(tts_demo MNN MNN_Express MNNAudio)


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # mnn-tts
 2 | 
 3 | 目前仅支持[Kokoro-82M](https://huggingface.co/hexgrad/Kokoro-82M)。
 4 | 
 5 | 
 6 | ## 模型导出
 7 | 
 8 | ```
 9 | cd mnn-tts
10 | huggingface-cli download --resume-download onnx-community/Kokoro-82M-v1.0-ONNX --local-dir Kokoro-82M-v1.0-ONNX
 11 | python export.py --path ./Kokoro-82M-v1.0-ONNX
12 | ```
13 | 
14 | ## 模型测试
15 | 
16 | ```
17 | mkdir build
18 | cd build
19 | cmake .. && make -j32
20 | ./tts_demo ../model/config.json 你好
21 | open output.wav
22 | ```
23 | 
24 | ## 参考项目
25 | - [kokoro](https://github.com/hexgrad/kokoro)
26 | - [misaki](https://pypi.org/project/misaki/)
27 | - [cppjieba](https://github.com/yanyiwu/cppjieba)
28 | - [cpp-pinyin](https://github.com/wolfgitpr/cpp-pinyin)


--------------------------------------------------------------------------------
/export.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import sys
  3 | import json
  4 | import glob
  5 | import base64
  6 | import argparse
  7 | import numpy as np
  8 | import MNN.expr as expr
  9 | from MNN.tools import mnnconvert
 10 | 
 11 | RESET = "\033[0m"
 12 | GREEN = "\033[32;1m"
 13 | YELLOW = "\033[33;4m"
 14 | 
 15 | class Kokoro:
 16 | 
 17 |     def __init__(self, args):
 18 |         self.model_path = args.path
 19 |         self.dst_path = args.dst_path
 20 |         if not os.path.exists(self.dst_path):
 21 |             os.makedirs(self.dst_path)
 22 |         if os.path.exists(args.mnnconvert):
 23 |             self.mnnconvert = args.mnnconvert
 24 |         else:
 25 |             self.mnnconvert = None
 26 | 
 27 |     def convert(self, onnx_path, mnn_path):
 28 |         convert_args = [
 29 |             '',
 30 |             '-f',
 31 |             'ONNX',
 32 |             '--modelFile',
 33 |             str(onnx_path),
 34 |             '--MNNModel',
 35 |             str(mnn_path),
 36 |             '--weightQuantBits',
 37 |             '8',
 38 |             #'--weightQuantBlock',
 39 |             #'128'
 40 |         ]
 41 |         sfd = os.dup(1)
 42 |         log_fp = open('./.export.log', "a")
 43 |         log_fd = log_fp.fileno()
 44 |         # mnnconvert ... > .export.log
 45 |         os.dup2(log_fd, 1)
 46 |         try:
 47 |             sys.argv = convert_args
 48 |             sys.argc = len(convert_args)
 49 |             if self.mnnconvert is None:
 50 |                 mnnconvert.main()
 51 |             else:
 52 |                 convert_args[0] = self.mnnconvert
 53 |                 cmd = ' '.join(convert_args)
 54 |                 message = os.popen(cmd).read()
 55 |                 print(message)
 56 |             sys.argv = []
 57 |         finally:
 58 |             os.dup2(sfd, 1)
 59 |             os.close(log_fd)
 60 | 
 61 |     def export_model(self):
 62 |         onnx_file = os.path.join(self.model_path, "onnx", "model.onnx")
 63 |         self.convert(onnx_file, f'{self.dst_path}/tts.mnn')
 64 | 
 65 |     def export_voice(self):
 66 |         voices = []
 67 |         self.styles = []
 68 |         for voice_file in glob.glob(os.path.join(self.model_path, "voices", "*.bin")):
 69 |             style = os.path.basename(voice_file).split('.')[0]
 70 |             if style != 'zf_xiaoxiao': continue
 71 |             voice = np.fromfile(voice_file, dtype=np.float32).reshape(-1, 1, 256)
 72 |             voices.append(expr.const(voice, voice.shape, expr.NCHW, expr.float))
 73 |         expr.save(voices, f'{self.dst_path}/voices.mnn')
 74 | 
 75 |     def export_config(self):
 76 |         tts_config = {}
 77 |         tts_config['styles'] = self.styles
 78 |         with open(f'{self.dst_path}/tts_config.json', 'w', encoding='utf-8') as f:
 79 |             json.dump(tts_config, f, ensure_ascii=False, indent=4)
 80 |         with open(f'{self.dst_path}/config.json', 'w', encoding='utf-8') as f:
 81 |             config = {
 82 |                 "tts_model": "tts.mnn",
 83 |                 "voices": "voices.mnn",
 84 |                 "tokenizer_file": "tokenizer.txt",
 85 |                 "backend_type": "cpu",
 86 |                 "thread_num": 4,
 87 |                 "precision": "low",
 88 |                 "memory": "low",
 89 |             }
 90 |             json.dump(config, f, ensure_ascii=False, indent=4)
 91 | 
 92 |     def export_tokenizer(self):
 93 |         # TOKENIZER MAGIC NUMBER
 94 |         MAGIC_NUMBER = 430
 95 |         # TOKENIZER TYPE
 96 |         SENTENCEPIECE = 0; TIKTOIKEN = 1; BERT = 2; HUGGINGFACE = 3
 97 |         def write_line(fp, *args):
 98 |             for arg in args:
 99 |                 for token in arg:
100 |                     fp.write(str(token) + ' ')
101 |             fp.write('\n')
102 |         def write_header(fp, type, stop_ids, speicals, prefix = []):
103 |             fp.write(f'{MAGIC_NUMBER} {type}\n')
104 |             fp.write(f'{len(speicals)} {len(stop_ids)} {len(prefix)}\n')
105 |             write_line(fp, speicals, stop_ids, prefix)
106 | 
107 |         tokenizer_file = os.path.join(self.model_path, "tokenizer.json")
108 |         with open(tokenizer_file, "r", encoding="utf-8") as f:
109 |             vocab_dict = json.load(f)['model']['vocab']
110 |             vocab_size = 0
111 |             for k, v in vocab_dict.items():
112 |                 vocab_size = max(vocab_size, v)
113 |             vocab_list = ['' for i in range(vocab_size + 1)]
114 |             for k, v in vocab_dict.items():
115 |                 vocab_list[v] = k
116 |         file_path = os.path.join(self.dst_path, "tokenizer.txt")
117 |         with open(file_path, "w", encoding="utf8") as fp:
118 |             write_header(fp, TIKTOIKEN, [], [], [])
119 |             fp.write(f'{len(vocab_list)}\n')
120 |             for v in vocab_list:
121 |                 line = base64.b64encode(v.encode('utf-8')).decode("utf8") + "\n"
122 |                 fp.write(line)
123 | 
124 |     def export(self):
125 |         self.export_model()
126 |         self.export_voice()
127 |         self.export_tokenizer()
128 |         self.export_config()
129 |         print(f'{GREEN}[SUCCESS]{RESET} export model to {YELLOW}{self.dst_path}{RESET}')
130 | 
131 | 
if __name__ == '__main__':
    # Command-line entry point: parse the options and run the full export.
    parser = argparse.ArgumentParser(description='tts_exporter', formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument('--path', type=str, required=True, help='path of model.')
    parser.add_argument('--dst_path', type=str, default='./model',
                        help='export onnx/mnn model to path, default is `./model`.')
    # The default is resolved against the current working directory; when no
    # file exists there, Kokoro falls back to the pymnn converter.
    parser.add_argument('--mnnconvert', type=str, default='../../../build/MNNConvert',
                        help='local mnnconvert path, if invalid, using pymnn.')
    args = parser.parse_args()
    kokoro = Kokoro(args)
    kokoro.export()


--------------------------------------------------------------------------------
/include/tts.hpp:
--------------------------------------------------------------------------------
 1 | //
 2 | //  tts.hpp
 3 | //
 4 | //  Created by MNN on 2025/2/20.
 5 | //  ZhaodeWang
 6 | //
 7 | 
 8 | #ifndef TTS_hpp
 9 | #define TTS_hpp
10 | 
11 | #include <vector>
12 | #include <memory>
13 | #include <string>
14 | #include <fstream>
15 | #include <sstream>
16 | #include <iostream>
17 | #include <streambuf>
18 | #include <functional>
19 | #include <unordered_map>
20 | 
21 | #include <MNN/expr/Expr.hpp>
22 | #include <MNN/expr/Module.hpp>
23 | #include <MNN/expr/MathOp.hpp>
24 | #include <MNN/expr/NeuralNetWorkOp.hpp>
25 | 
namespace MNN {
namespace Transformer {

// Forward declarations: this header only holds shared_ptr handles to these
// types, so their definitions are not needed here.
class TtsConfig;
class Tokenizer;
class Zhg2p;

/**
 * Text-to-speech engine handle.
 *
 * Holds the configuration, tokenizer, runtime manager, module, voice tensors
 * and grapheme-to-phoneme helper as members. The member functions are only
 * declared here; their definitions are not part of this header.
 */
class MNN_PUBLIC Tts {
public:
    // Factory taking the path of a configuration file. Returns a raw pointer;
    // NOTE(review): presumably the caller owns the result -- confirm in tts.cpp.
    static Tts* createTTS(const std::string& config_path);
    // Takes an output file name and a waveform expression.
    // NOTE(review): presumably writes the waveform as audio to `file` -- confirm.
    static void save(const std::string& file, Express::VARP wavform);
    // Stores the shared configuration; no other member is initialised here.
    Tts(std::shared_ptr<TtsConfig> config) : config_(config) {}
    virtual ~Tts();
    // NOTE(review): presumably has to be called before generate() -- confirm.
    void load();
    // Produces an expression from `text`; `speed` defaults to 1.0f.
    // NOTE(review): presumably the returned VARP is the synthesized waveform
    // accepted by save() -- confirm.
    Express::VARP generate(const std::string& text, float speed = 1.0f);
private:
    std::shared_ptr<TtsConfig> config_;
    std::shared_ptr<Tokenizer> tokenizer_;
    std::shared_ptr<Express::Executor::RuntimeManager> runtime_manager_;
    std::shared_ptr<Express::Module> module_;
    std::vector<Express::VARP> voices_;
    std::shared_ptr<Zhg2p> g2p_;
};

}
}
52 | 
53 | #endif // TTS_hpp
54 | 


--------------------------------------------------------------------------------
/resource/jieba/pos_dict/prob_start.utf8:
--------------------------------------------------------------------------------
  1 | #初始状态的概率
  2 | #格式
  3 | #状态:概率
  4 | B,a:-4.7623052146
  5 | B,ad:-6.68006603678
  6 | B,ag:-3.14e+100
  7 | B,an:-8.69708322302
  8 | B,b:-5.01837436211
  9 | B,bg:-3.14e+100
 10 | B,c:-3.42388018495
 11 | B,d:-3.97504752976
 12 | B,df:-8.88897423083
 13 | B,dg:-3.14e+100
 14 | B,e:-8.56355183039
 15 | B,en:-3.14e+100
 16 | B,f:-5.49163041848
 17 | B,g:-3.14e+100
 18 | B,h:-13.53336513
 19 | B,i:-6.11578472756
 20 | B,in:-3.14e+100
 21 | B,j:-5.05761912847
 22 | B,jn:-3.14e+100
 23 | B,k:-3.14e+100
 24 | B,l:-4.90588358466
 25 | B,ln:-3.14e+100
 26 | B,m:-3.6524299819
 27 | B,mg:-3.14e+100
 28 | B,mq:-6.7869530014
 29 | B,n:-1.69662577975
 30 | B,ng:-3.14e+100
 31 | B,nr:-2.23104959138
 32 | B,nrfg:-5.87372217541
 33 | B,nrt:-4.98564273352
 34 | B,ns:-2.8228438315
 35 | B,nt:-4.84609166818
 36 | B,nz:-3.94698846058
 37 | B,o:-8.43349870215
 38 | B,p:-4.20098413209
 39 | B,q:-6.99812385896
 40 | B,qe:-3.14e+100
 41 | B,qg:-3.14e+100
 42 | B,r:-3.40981877908
 43 | B,rg:-3.14e+100
 44 | B,rr:-12.4347528413
 45 | B,rz:-7.94611647157
 46 | B,s:-5.52267359084
 47 | B,t:-3.36474790945
 48 | B,tg:-3.14e+100
 49 | B,u:-9.1639172775
 50 | B,ud:-3.14e+100
 51 | B,ug:-3.14e+100
 52 | B,uj:-3.14e+100
 53 | B,ul:-3.14e+100
 54 | B,uv:-3.14e+100
 55 | B,uz:-3.14e+100
 56 | B,v:-2.67405848743
 57 | B,vd:-9.04472876024
 58 | B,vg:-3.14e+100
 59 | B,vi:-12.4347528413
 60 | B,vn:-4.33156108902
 61 | B,vq:-12.1470707689
 62 | B,w:-3.14e+100
 63 | B,x:-3.14e+100
 64 | B,y:-9.84448567586
 65 | B,yg:-3.14e+100
 66 | B,z:-7.04568111149
 67 | B,zg:-3.14e+100
 68 | E,a:-3.14e+100
 69 | E,ad:-3.14e+100
 70 | E,ag:-3.14e+100
 71 | E,an:-3.14e+100
 72 | E,b:-3.14e+100
 73 | E,bg:-3.14e+100
 74 | E,c:-3.14e+100
 75 | E,d:-3.14e+100
 76 | E,df:-3.14e+100
 77 | E,dg:-3.14e+100
 78 | E,e:-3.14e+100
 79 | E,en:-3.14e+100
 80 | E,f:-3.14e+100
 81 | E,g:-3.14e+100
 82 | E,h:-3.14e+100
 83 | E,i:-3.14e+100
 84 | E,in:-3.14e+100
 85 | E,j:-3.14e+100
 86 | E,jn:-3.14e+100
 87 | E,k:-3.14e+100
 88 | E,l:-3.14e+100
 89 | E,ln:-3.14e+100
 90 | E,m:-3.14e+100
 91 | E,mg:-3.14e+100
 92 | E,mq:-3.14e+100
 93 | E,n:-3.14e+100
 94 | E,ng:-3.14e+100
 95 | E,nr:-3.14e+100
 96 | E,nrfg:-3.14e+100
 97 | E,nrt:-3.14e+100
 98 | E,ns:-3.14e+100
 99 | E,nt:-3.14e+100
100 | E,nz:-3.14e+100
101 | E,o:-3.14e+100
102 | E,p:-3.14e+100
103 | E,q:-3.14e+100
104 | E,qe:-3.14e+100
105 | E,qg:-3.14e+100
106 | E,r:-3.14e+100
107 | E,rg:-3.14e+100
108 | E,rr:-3.14e+100
109 | E,rz:-3.14e+100
110 | E,s:-3.14e+100
111 | E,t:-3.14e+100
112 | E,tg:-3.14e+100
113 | E,u:-3.14e+100
114 | E,ud:-3.14e+100
115 | E,ug:-3.14e+100
116 | E,uj:-3.14e+100
117 | E,ul:-3.14e+100
118 | E,uv:-3.14e+100
119 | E,uz:-3.14e+100
120 | E,v:-3.14e+100
121 | E,vd:-3.14e+100
122 | E,vg:-3.14e+100
123 | E,vi:-3.14e+100
124 | E,vn:-3.14e+100
125 | E,vq:-3.14e+100
126 | E,w:-3.14e+100
127 | E,x:-3.14e+100
128 | E,y:-3.14e+100
129 | E,yg:-3.14e+100
130 | E,z:-3.14e+100
131 | E,zg:-3.14e+100
132 | M,a:-3.14e+100
133 | M,ad:-3.14e+100
134 | M,ag:-3.14e+100
135 | M,an:-3.14e+100
136 | M,b:-3.14e+100
137 | M,bg:-3.14e+100
138 | M,c:-3.14e+100
139 | M,d:-3.14e+100
140 | M,df:-3.14e+100
141 | M,dg:-3.14e+100
142 | M,e:-3.14e+100
143 | M,en:-3.14e+100
144 | M,f:-3.14e+100
145 | M,g:-3.14e+100
146 | M,h:-3.14e+100
147 | M,i:-3.14e+100
148 | M,in:-3.14e+100
149 | M,j:-3.14e+100
150 | M,jn:-3.14e+100
151 | M,k:-3.14e+100
152 | M,l:-3.14e+100
153 | M,ln:-3.14e+100
154 | M,m:-3.14e+100
155 | M,mg:-3.14e+100
156 | M,mq:-3.14e+100
157 | M,n:-3.14e+100
158 | M,ng:-3.14e+100
159 | M,nr:-3.14e+100
160 | M,nrfg:-3.14e+100
161 | M,nrt:-3.14e+100
162 | M,ns:-3.14e+100
163 | M,nt:-3.14e+100
164 | M,nz:-3.14e+100
165 | M,o:-3.14e+100
166 | M,p:-3.14e+100
167 | M,q:-3.14e+100
168 | M,qe:-3.14e+100
169 | M,qg:-3.14e+100
170 | M,r:-3.14e+100
171 | M,rg:-3.14e+100
172 | M,rr:-3.14e+100
173 | M,rz:-3.14e+100
174 | M,s:-3.14e+100
175 | M,t:-3.14e+100
176 | M,tg:-3.14e+100
177 | M,u:-3.14e+100
178 | M,ud:-3.14e+100
179 | M,ug:-3.14e+100
180 | M,uj:-3.14e+100
181 | M,ul:-3.14e+100
182 | M,uv:-3.14e+100
183 | M,uz:-3.14e+100
184 | M,v:-3.14e+100
185 | M,vd:-3.14e+100
186 | M,vg:-3.14e+100
187 | M,vi:-3.14e+100
188 | M,vn:-3.14e+100
189 | M,vq:-3.14e+100
190 | M,w:-3.14e+100
191 | M,x:-3.14e+100
192 | M,y:-3.14e+100
193 | M,yg:-3.14e+100
194 | M,z:-3.14e+100
195 | M,zg:-3.14e+100
196 | S,a:-3.90253968313
197 | S,ad:-11.0484584802
198 | S,ag:-6.95411391796
199 | S,an:-12.8402179494
200 | S,b:-6.47288876397
201 | S,bg:-3.14e+100
202 | S,c:-4.78696679586
203 | S,d:-3.90391976418
204 | S,df:-3.14e+100
205 | S,dg:-8.9483976513
206 | S,e:-5.94251300628
207 | S,en:-3.14e+100
208 | S,f:-5.19482024998
209 | S,g:-6.50782681533
210 | S,h:-8.65056320738
211 | S,i:-3.14e+100
212 | S,in:-3.14e+100
213 | S,j:-4.91199211964
214 | S,jn:-3.14e+100
215 | S,k:-6.94032059583
216 | S,l:-3.14e+100
217 | S,ln:-3.14e+100
218 | S,m:-3.26920065212
219 | S,mg:-10.8253149289
220 | S,mq:-3.14e+100
221 | S,n:-3.85514838976
222 | S,ng:-4.9134348611
223 | S,nr:-4.48366310396
224 | S,nrfg:-3.14e+100
225 | S,nrt:-3.14e+100
226 | S,ns:-3.14e+100
227 | S,nt:-12.1470707689
228 | S,nz:-3.14e+100
229 | S,o:-8.46446092775
230 | S,p:-2.98684018136
231 | S,q:-4.88865861826
232 | S,qe:-3.14e+100
233 | S,qg:-3.14e+100
234 | S,r:-2.76353367841
235 | S,rg:-10.2752685919
236 | S,rr:-3.14e+100
237 | S,rz:-3.14e+100
238 | S,s:-3.14e+100
239 | S,t:-3.14e+100
240 | S,tg:-6.27284253188
241 | S,u:-6.94032059583
242 | S,ud:-7.72823016105
243 | S,ug:-7.53940370266
244 | S,uj:-6.85251045118
245 | S,ul:-8.41537131755
246 | S,uv:-8.15808672229
247 | S,uz:-9.29925862537
248 | S,v:-3.05329230341
249 | S,vd:-3.14e+100
250 | S,vg:-5.94301818437
251 | S,vi:-3.14e+100
252 | S,vn:-11.4539235883
253 | S,vq:-3.14e+100
254 | S,w:-3.14e+100
255 | S,x:-8.42741965607
256 | S,y:-6.19707946995
257 | S,yg:-13.53336513
258 | S,z:-3.14e+100
259 | S,zg:-3.14e+100
260 | 


--------------------------------------------------------------------------------
/resource/jieba/user.dict.utf8:
--------------------------------------------------------------------------------
1 | 云计算
2 | 韩玉鉴赏
3 | 蓝翔 nz
4 | 区块链 10 nz
5 | 


--------------------------------------------------------------------------------
/resource/pinyin/mandarin/user_dict.txt:
--------------------------------------------------------------------------------
  1 | 陟罚臧否:zhi4 fa2 zang2 pi3
  2 | 汤汤:shang1 shang1
  3 | 到了:dao4 le1
  4 | 脖颈:bo2 geng3
  5 | 破的:po4 de5
  6 | 重场:zhong4 chang3
  7 | 很重:hen3 zhong4
  8 | 跪地:gui4 di4
  9 | 都会:dou1 hui4
 10 | 乐都:le4 dou1
 11 | 花都:hua1 dou1
 12 | 中都:zhong1 dou1
 13 | 上都:shang4 dou1
 14 | 大都:da4 dou1
 15 | 曾都:ceng2 dou1
 16 | 陪都:pei2 dou1
 17 | 京都:jing1 dou1
 18 | 国都:guo2 dou1
 19 | 成都:cheng2 du1
 20 | 莞然:wan3 ran2
 21 | 着边:zhe5 bian1
 22 | 彷佛:fang3 fu2
 23 | 得要:de2 yao4
 24 | 吱吱:zhi1 zhi1
 25 | 非得:fei1 de2
 26 | 美的:mei3 de5
 27 | 中的:zhong1 de5
 28 | 席地:xi2 di4
 29 | 一地:yi2 di4
 30 | 之地:zhi1 di4
 31 | 今朝:jin2 zhao1
 32 | 本色:ben3 se4
 33 | 执著:zhi2 zhuo2
 34 | 朝霞:zhao1 xia2
 35 | 盛满:cheng2 man3
 36 | 着眼:zhe5 yan3
 37 | 着地:zhuo2 de5
 38 | 泥地:ni2 di4
 39 | 雪地:xue3 di4
 40 | 地煞:di4 sha4
 41 | 地久:di4 jiu3
 42 | 地裂:di4 lie4
 43 | 一宿:yi4 xiu3
 44 | 一觉:yi2 jiao4
 45 | 乐经:le4 jing1
 46 | 将进酒:qiang1 jin4 jiu3
 47 | 天和地:tian1 he2 di4
 48 | 天塌地陷:tian1 ta1 di4 xian4
 49 | 流血:liu2 xie3
 50 | 纶巾:guan1 jin1
 51 | 穿着:chuan1 zhe5
 52 | 都没:dou1 mei2
 53 | 都是:dou1 shi4
 54 | 一行:yi4 hang2
 55 | 一朝:yi4 zhao1
 56 | 两行:liang3 hang2
 57 | 面的:mian4 de5
 58 | 没入:mo4 ru4
 59 | 还重:hai2 zhong4
 60 | 情重:qing2 zhong4
 61 | 重色:zhong4 se4
 62 | 澄清:cheng2 qing1
 63 | 两行:liang3 hang2
 64 | 几行:ji3 hang2
 65 | 重头:chong2 tou2
 66 | 好重:hao3 zhong4
 67 | 狗血:gou3 xue4
 68 | 屏住:bing3 zhu4
 69 | 满地:man3 di4
 70 | 彷佛:fang3 fu2
 71 | 端的:duan1 de5
 72 | 了了:liao3 le5
 73 | 诗行:shi1 hang2
 74 | 传来:chuan2 lai2
 75 | 朝暮:zhao1 mu4
 76 | 吞没了:tun1 mo4 le5
 77 | 淹没了:yan1 mo4 le5
 78 | 言重复:yan2 chong2 fu4
 79 | 牵着手:qian2 zhe5 shou3
 80 | 一行行:yi4 hang2 hang2
 81 | 还愿意:hai2 yuan4 yi4
 82 | 重感冒:zhong4 gan3 mao4
 83 | 地之角:di4 zhi1 jiao3
 84 | 相似的:xiang1 si4 de5
 85 | 睡不着:shui4 bu4 zhao2
 86 | 类似的:lei4 si4 de5
 87 | 得知了:de2 zhi1 le5
 88 | 没日没夜:mei2 ri4 mei2 ye4
 89 | 天昏地暗:tian1 hun1 di4 an4
 90 | 装模作样:zhuang1 mu2 zuo4 yang4
 91 | 了无音讯:liao3 wu2 yin2 xun4
 92 | 心事重重:xin1 shi4 chong2 chong2
 93 | 重重关卡:chong2 chong2 guan1 qia3
 94 | 情深意重:qing2 shen1 yi4 zhong4
 95 | 了无牵挂:liao3 wu2 qian1 gua4
 96 | 喜怒哀乐:xi3 nu4 ai1 le4
 97 | 天南地北:tian1 nan2 di4 bei3
 98 | 天翻地覆:tian1 fan1 di4 fu4
 99 | 草长莺飞:cao3 zhang3 ying1 fei1
100 | 随随便便:sui2 sui2 bian4 bian4
101 | 想着想着:xiang3 zhe5 xiang3 zhe5


--------------------------------------------------------------------------------
/src/3rd_include/cpp-pinyin/CanTone.h:
--------------------------------------------------------------------------------
 1 | #ifndef CANTONECONVERTER_H
 2 | #define CANTONECONVERTER_H
 3 | 
 4 | #include <cpp-pinyin/PinyinGlobal.h>
 5 | #include <cpp-pinyin/ToneConverter.h>
 6 | 
 7 | namespace Pinyin
 8 | {
 9 |     class CPP_PINYIN_EXPORT CanTone final : public ToneConverter {
10 |     public:
11 |         enum Style {
12 |             // 普通风格，不带声调。如： 中国 -> ``zung gwok``
13 |             NORMAL = 0,
14 |             // 声调风格3，即拼音声调在各个拼音之后，用数字 [1-4] 进行表示。如： 中国 -> ``zung1 gwok3``
15 |             TONE3 = 8
16 |         };
17 | 
18 |         CanTone() {
19 |             m_converts.insert({static_cast<int>(Style::NORMAL), tone3ToNormal});
20 |         };
21 |         ~CanTone() override = default;
22 | 
23 |         static std::u16string tone3ToNormal(const std::u16string &pinyin, bool v_to_u = false,
24 |                                             bool neutral_tone_with_five = false);
25 |     };
26 | } // Pinyin
27 | 
28 | #endif //CANTONECONVERTER_H
29 | 


--------------------------------------------------------------------------------
/src/3rd_include/cpp-pinyin/ChineseG2p.h:
--------------------------------------------------------------------------------
 1 | #ifndef ChineseG2p_H
 2 | #define ChineseG2p_H
 3 | 
 4 | #include <filesystem>
 5 | #include <memory>
 6 | 
 7 | #include <cpp-pinyin/PinyinGlobal.h>
 8 | #include <cpp-pinyin/PinyinRes.h>
 9 | #include <cpp-pinyin/ToneConverter.h>
10 | 
namespace Pinyin
{
    // Value passed as the `error` argument of the hanziToPinyin overloads.
    // NOTE(review): presumably selects what happens to input characters that
    // cannot be converted -- confirm in ChineseG2p.cpp.
    enum CPP_PINYIN_EXPORT Error {
        // Keep original characters
        Default = 0,
        // Ignore this character (do not export)
        Ignore = 1
    };

    class ChineseG2pPrivate;

    /**
     * Base class of the language-specific grapheme-to-phoneme converters.
     *
     * All state lives in the forward-declared ChineseG2pPrivate, owned through
     * d_ptr. The conversion functions are protected (or private); derived
     * classes expose their own typed wrappers.
     */
    class CPP_PINYIN_EXPORT ChineseG2p {
    public:
        // `language` names the dictionary set, e.g. "cantonese".
        explicit ChineseG2p(const std::string &language);

        ~ChineseG2p();

        bool initialized() const;

        // Declared const although the name suggests a state change.
        // NOTE(review): presumably modifies the object behind d_ptr -- confirm.
        bool loadUserDict(const std::filesystem::path &filePath) const;

        // Declared const; see the note on loadUserDict.
        void setToneConverter(const ToneConverter &toneConverter) const;

        // Takes a single character encoded as a string.
        // NOTE(review): presumably maps traditional to simplified -- confirm.
        std::string tradToSim(const std::string &oneHanzi) const;

        bool isPolyphonic(const std::string &oneHanzi) const;

    protected:
        // Conversion of a whole string. `style` is an int so that each derived
        // class can pass its own Style enum value.
        PinyinResVector hanziToPinyin(const std::string &hans, int style = 0,
                                      Error error = Default, bool candidates = true, bool v_to_u = false,
                                      bool neutral_tone_with_five = false) const;

        // Same conversion for input that is already split into strings.
        PinyinResVector hanziToPinyin(const std::vector<std::string> &hans,
                                      int style = 0, Error error = Default,
                                      bool candidates = true, bool v_to_u = false,
                                      bool neutral_tone_with_five = false) const;

        // Returns the readings of a single character as strings.
        std::vector<std::string> getDefaultPinyin(const std::string &oneHanzi, int style = 0, bool v_to_u = false,
                                                  bool neutral_tone_with_five = false) const;

        std::unique_ptr<ChineseG2pPrivate> d_ptr;

    private:
        // UTF-16 overloads, not reachable from derived classes.
        PinyinResVector hanziToPinyin(const std::vector<std::u16string> &hans,
                                      int style = 0, Error error = Default,
                                      bool candidates = true, bool v_to_u = false,
                                      bool neutral_tone_with_five = false) const;
        PinyinResVector hanziToPinyin(const std::vector<char16_t> &hansList, int style = 0,
                                      Error error = Default, bool candidates = true, bool v_to_u = false,
                                      bool neutral_tone_with_five = false) const;
    };
}
63 | 
64 | #endif // ChineseG2p_H
65 | 


--------------------------------------------------------------------------------
/src/3rd_include/cpp-pinyin/ChineseG2p_p.h:
--------------------------------------------------------------------------------
 1 | #ifndef ChineseG2pPRIVATE_H
 2 | #define ChineseG2pPRIVATE_H
 3 | 
 4 | #include <unordered_map>
 5 | #include <unordered_set>
 6 | 
 7 | #include <cpp-pinyin/ToneConverter.h>
 8 | 
 9 | #include "cpp-pinyin/U16Str.h"
10 | 
11 | namespace Pinyin
12 | {
13 |     class ChineseG2pPrivate final {
14 |     public:
15 |         explicit ChineseG2pPrivate(std::string language);
16 |         ~ChineseG2pPrivate();
17 | 
18 |         void init();
19 | 
20 |         bool initialized = false;
21 | 
22 |         std::unordered_map<char16_t, std::u16string> phrases_map;
23 |         std::unordered_map<std::u16string, std::vector<std::u16string>> phrases_dict;
24 |         std::unordered_map<char16_t, std::vector<std::u16string>> word_dict;
25 |         std::unordered_map<char16_t, char16_t> trans_dict;
26 | 
27 |         std::string m_language;
28 |         ToneConverter m_toneConverter;
29 | 
30 |         inline bool isPolyphonic(const char16_t &oneHanzi) const {
31 |             return phrases_map.find(oneHanzi) != phrases_map.end();
32 |         }
33 | 
34 |         inline char16_t tradToSim(const char16_t &oneHanzi) const {
35 |             const auto &it = trans_dict.find(oneHanzi);
36 |             return it != trans_dict.end() ? it->second : oneHanzi;
37 |         }
38 | 
39 |         inline std::u16string toneConvert(const std::u16string &pinyin, int style, bool v_to_u = false,
40 |                                           bool neutral_tone_with_five = false) const {
41 |             return m_toneConverter.convert({pinyin.begin(), pinyin.end()}, style, v_to_u, neutral_tone_with_five);
42 |         }
43 | 
44 |         inline std::vector<std::u16string> toneConvert(const std::vector<std::u16string> &pinyin, int style,
45 |                                                        bool v_to_u = false,
46 |                                                        bool neutral_tone_with_five = false) const {
47 |             std::vector<std::u16string> tonePinyin;
48 |             tonePinyin.reserve(pinyin.size());
49 |             for (const std::u16string &p : pinyin) {
50 |                 tonePinyin.push_back(toneConvert(p, style, v_to_u, neutral_tone_with_five));
51 |             }
52 |             return tonePinyin;
53 |         }
54 | 
55 |         std::unordered_set<std::string> toneSeen;
56 |         std::vector<std::string> toneCandidates;
57 | 
58 |         inline std::vector<std::string> getDefaultPinyin(const char16_t &oneHanzi, int style = 0,
59 |                                                          bool v_to_u = false,
60 |                                                          bool neutral_tone_with_five = false) {
61 |             const auto &it = word_dict.find(oneHanzi);
62 |             if (it == word_dict.end())
63 |                 return {u16strToUtf8str(oneHanzi)};
64 | 
65 |             const std::vector<std::u16string> &candidates = it->second;
66 | 
67 |             toneCandidates.clear();
68 |             toneSeen.clear();
69 | 
70 |             for (const std::u16string &pinyin : candidates) {
71 |                 const auto &tarPinyin = u16strToUtf8str(toneConvert(pinyin, style, v_to_u, neutral_tone_with_five));
72 |                 if (toneSeen.insert(tarPinyin).second) {
73 |                     toneCandidates.push_back(tarPinyin);
74 |                 }
75 |             }
76 | 
77 |             if (toneCandidates.empty())
78 |                 return {u16strToUtf8str(oneHanzi)};
79 |             return toneCandidates;
80 |         }
81 | 
82 |         void zhPosition(const std::vector<std::u16string> &input, std::vector<char16_t> &res,
83 |                         std::vector<bool> &positions);
84 |     };
85 | }
86 | 
87 | #endif // ChineseG2pPRIVATE_H
88 | 


--------------------------------------------------------------------------------
/src/3rd_include/cpp-pinyin/DictUtil.h:
--------------------------------------------------------------------------------
 1 | #ifndef DICTUTIL_H
 2 | #define DICTUTIL_H
 3 | 
 4 | #include <filesystem>
 5 | #include <functional>
 6 | #include <unordered_map>
 7 | 
 8 | #include "ManToneUtil.h"
 9 | 
namespace Pinyin
{
    // Dictionary loaders. The overloads differ only in the type of the map
    // they fill; each one returns a bool.
    // NOTE(review): the parameter is called dict_dir, but callers presumably
    // pass the path of a dictionary file; the bool presumably reports
    // success -- confirm both in DictUtil.cpp.

    // character -> character; `sep1` (default ':') separates key and value.
    bool loadDict(const std::filesystem::path &dict_dir,
                  std::unordered_map<char16_t, char16_t> &resultMap, const char &sep1 = ':');

    // character -> string.
    bool loadDict(const std::filesystem::path &dict_dir,
                  std::unordered_map<char16_t, std::u16string> &resultMap, const char &sep1 = ':');

    // character -> list of strings; `sep2` (default ",") separates list items.
    bool loadDict(const std::filesystem::path &dict_dir,
                  std::unordered_map<char16_t, std::vector<std::u16string>> &resultMap, const char &sep1 = ':',
                  const std::string &sep2 = ",");

    // string -> list of strings.
    bool loadDict(const std::filesystem::path &dict_dir,
                  std::unordered_map<std::u16string, std::vector<std::u16string>> &resultMap, const char &sep1 = ':',
                  const std::string &sep2 = ",");

    // Variant for extra dictionaries: items are separated by " " by default,
    // and a converter (default: tone3ToTone) is supplied for the readings.
    // NOTE(review): presumably each reading is passed through the converter
    // before it is stored -- confirm.
    bool loadAdditionalDict(const std::filesystem::path &dict_dir,
                            std::unordered_map<std::u16string, std::vector<std::u16string>> &resultMap,
                            const char &sep1 = ':',
                            const std::string &sep2 = " ",
                            const std::function<std::u16string(const std::u16string &pinyin)> &converterForDefaultPinyin
                                = tone3ToTone);
} // Pinyin
33 | 
34 | #endif //DICTUTIL_H
35 | 


--------------------------------------------------------------------------------
/src/3rd_include/cpp-pinyin/G2pglobal.h:
--------------------------------------------------------------------------------
 1 | #ifndef G2PGLOBAL_H
 2 | #define G2PGLOBAL_H
 3 | 
 4 | #include <filesystem>
 5 | 
 6 | #include <cpp-pinyin/PinyinGlobal.h>
 7 | 
 8 | namespace Pinyin
 9 | {
10 |     std::filesystem::path CPP_PINYIN_EXPORT dictionaryPath();
11 | 
12 |     void CPP_PINYIN_EXPORT setDictionaryPath(const std::filesystem::path &dir);
13 | 
14 |     bool CPP_PINYIN_EXPORT isLetter(const char16_t &c);
15 | 
16 |     bool CPP_PINYIN_EXPORT isHanzi(const char16_t &c);
17 | 
18 |     bool CPP_PINYIN_EXPORT isKana(const char16_t &c);
19 | 
20 |     bool CPP_PINYIN_EXPORT isDigit(const char16_t &c);
21 | 
22 |     bool CPP_PINYIN_EXPORT isSpace(const char16_t &c);
23 | 
24 |     bool CPP_PINYIN_EXPORT isSpecialKana(const char16_t &c);
25 | }
26 | 
27 | #endif // G2PGLOBAL_H
28 | 


--------------------------------------------------------------------------------
/src/3rd_include/cpp-pinyin/Jyutping.h:
--------------------------------------------------------------------------------
 1 | #ifndef DATASET_TOOLS_CANTONESE_H
 2 | #define DATASET_TOOLS_CANTONESE_H
 3 | 
 4 | #include <cpp-pinyin/PinyinGlobal.h>
 5 | #include <cpp-pinyin/ChineseG2p.h>
 6 | #include <cpp-pinyin/CanTone.h>
 7 | 
 8 | namespace Pinyin
 9 | {
10 |     class CPP_PINYIN_EXPORT Jyutping final : public ChineseG2p {
11 |     public:
12 |         explicit Jyutping() :
13 |             ChineseG2p("cantonese") {
14 |             this->setToneConverter(m_toneConverter);
15 |         }
16 | 
17 |         ~Jyutping() = default;
18 | 
19 |         PinyinResVector hanziToPinyin(const std::string &hans,
20 |                                       CanTone::Style style = CanTone::Style::TONE3,
21 |                                       Error error = Default, bool candidates = true) const;
22 | 
23 |         PinyinResVector hanziToPinyin(const std::vector<std::string> &hans,
24 |                                       CanTone::Style style = CanTone::Style::TONE3,
25 |                                       Error error = Default, bool candidates = true) const;
26 |         std::vector<std::string> getDefaultPinyin(const std::string &hanzi,
27 |                                                   CanTone::Style style = CanTone::Style::TONE3) const;
28 | 
29 |         CanTone m_toneConverter;
30 |     };
31 | }
32 | #endif // DATASET_TOOLS_CANTONESE_H
33 | 


--------------------------------------------------------------------------------
/src/3rd_include/cpp-pinyin/ManTone.h:
--------------------------------------------------------------------------------
 1 | #ifndef TONEUTIL_H
 2 | #define TONEUTIL_H
 3 | 
 4 | #include <cpp-pinyin/PinyinGlobal.h>
 5 | #include <cpp-pinyin/ToneConverter.h>
 6 | 
 7 | namespace Pinyin
 8 | {
 9 |     class CPP_PINYIN_EXPORT ManTone final : public ToneConverter {
10 |     public:
11 |         // https://github.com/mozillazg/python-pinyin/blob/master/pypinyin/constants.py
12 |         enum Style {
13 |             // 普通风格，不带声调。如： 中国 -> ``zhong guo``
14 |             NORMAL = 0,
15 |             // 标准声调风格，拼音声调在韵母第一个字母上（默认风格）。如： 中国 -> ``zhōng guó``
16 |             TONE = 1,
17 |             // 声调风格2，即拼音声调在各个韵母之后，用数字 [1-4] 进行表示。如： 中国 -> ``zho1ng guo2``
18 |             TONE2 = 2,
19 |             // 声调风格3，即拼音声调在各个拼音之后，用数字 [1-4] 进行表示。如： 中国 -> ``zhong1 guo2``
20 |             TONE3 = 8
21 |         };
22 | 
23 |         ManTone() {
24 |             m_converts.insert({static_cast<int>(Style::NORMAL), toneToNormal});
25 |             m_converts.insert({static_cast<int>(Style::TONE), toneToTone});
26 |             m_converts.insert({static_cast<int>(Style::TONE2), toneToTone2});
27 |             m_converts.insert({static_cast<int>(Style::TONE3), toneToTone3});
28 |         };
29 |         ~ManTone() override = default;
30 | 
31 |         static std::u16string toneToNormal(const std::u16string &pinyin, bool v_to_u = false,
32 |                                            bool neutral_tone_with_five = false);
33 | 
34 |         static std::u16string toneToTone(const std::u16string &pinyin, bool v_to_u = false,
35 |                                          bool neutral_tone_with_five = false);
36 | 
37 |         static std::u16string toneToTone2(const std::u16string &pinyin, bool v_to_u = false,
38 |                                           bool neutral_tone_with_five = false);
39 | 
40 |         static std::u16string toneToTone3(const std::u16string &pinyin, bool v_to_u = false,
41 |                                           bool neutral_tone_with_five = false);
42 |     };
43 | 
44 | 
45 | } // Pinyin
46 | 
47 | #endif //TONEUTIL_H
48 | 


--------------------------------------------------------------------------------
/src/3rd_include/cpp-pinyin/ManToneUtil.h:
--------------------------------------------------------------------------------
 1 | #ifndef MANTONEUTIL_H
 2 | #define MANTONEUTIL_H
 3 | 
 4 | #include <string>
 5 | 
 6 | namespace Pinyin
 7 | {
 8 |     std::u16string tone3ToTone(const std::u16string &pinyin);
 9 | } // Pinyin
10 | 
11 | #endif //MANTONEUTIL_H
12 | 


--------------------------------------------------------------------------------
/src/3rd_include/cpp-pinyin/Pinyin.h:
--------------------------------------------------------------------------------
 1 | #ifndef DATASET_TOOLS_MANDARIN_H
 2 | #define DATASET_TOOLS_MANDARIN_H
 3 | 
 4 | #include <cpp-pinyin/PinyinGlobal.h>
 5 | #include <cpp-pinyin/ChineseG2p.h>
 6 | #include <cpp-pinyin/ManTone.h>
 7 | 
 8 | namespace Pinyin
 9 | {
10 |     class CPP_PINYIN_EXPORT Pinyin final : public ChineseG2p {
11 |     public:
12 |         explicit Pinyin() :
13 |             ChineseG2p("mandarin") {
14 |             this->setToneConverter(m_toneConverter);
15 |         }
16 | 
17 |         ~Pinyin() = default;
18 | 
19 |         PinyinResVector hanziToPinyin(const std::string &hans,
20 |                                       ManTone::Style style = ManTone::Style::TONE,
21 |                                       Error error = Default, bool candidates = true, bool v_to_u = false,
22 |                                       bool neutral_tone_with_five = false) const;
23 | 
24 |         PinyinResVector hanziToPinyin(const std::vector<std::string> &hans,
25 |                                       ManTone::Style style = ManTone::Style::TONE,
26 |                                       Error error = Default, bool candidates = true, bool v_to_u = false,
27 |                                       bool neutral_tone_with_five = false) const;
28 | 
29 |         std::vector<std::string> getDefaultPinyin(const std::string &hanzi,
30 |                                                   ManTone::Style style = ManTone::Style::TONE,
31 |                                                   bool v_to_u = false, bool neutral_tone_with_five = false) const;
32 | 
33 |         ManTone m_toneConverter;
34 |     };
35 | }
36 | 
37 | #endif // DATASET_TOOLS_MANDARIN_H
38 | 


--------------------------------------------------------------------------------
/src/3rd_include/cpp-pinyin/PinyinGlobal.h:
--------------------------------------------------------------------------------
 1 | #ifndef PINYINGLOBAL_H
 2 | #define PINYINGLOBAL_H
 3 | 
 4 | #ifdef _MSC_VER
 5 | #  define CPP_PINYIN_DECL_EXPORT __declspec(dllexport)
 6 | #  define CPP_PINYIN_DECL_IMPORT __declspec(dllimport)
 7 | #else
 8 | #  define CPP_PINYIN_DECL_EXPORT __attribute__((visibility("default")))
 9 | #  define CPP_PINYIN_DECL_IMPORT __attribute__((visibility("default")))
10 | #endif
11 | 
12 | #ifndef CPP_PINYIN_EXPORT
13 | #  ifdef CPP_PINYIN_STATIC
14 | #    define CPP_PINYIN_EXPORT
15 | #  else
16 | #    ifdef CPP_PINYIN_LIBRARY
17 | #      define CPP_PINYIN_EXPORT CPP_PINYIN_DECL_EXPORT
18 | #    else
19 | #      define CPP_PINYIN_EXPORT CPP_PINYIN_DECL_IMPORT
20 | #    endif
21 | #  endif
22 | #endif
23 | 
24 | #endif //PINYINGLOBAL_H
25 | 


--------------------------------------------------------------------------------
/src/3rd_include/cpp-pinyin/PinyinRes.h:
--------------------------------------------------------------------------------
 1 | #ifndef G2PRES_H
 2 | #define G2PRES_H
 3 | 
 4 | #include <string>
 5 | #include <vector>
 6 | 
 7 | #include <cpp-pinyin/PinyinGlobal.h>
 8 | 
 9 | namespace Pinyin
10 | {
11 |     struct CPP_PINYIN_EXPORT PinyinRes {
12 |         std::string hanzi;
13 |         std::string pinyin;
14 |         std::vector<std::string> candidates; //  Candidate pinyin of Polyphonic Characters.
15 |         bool error = true; //  Whether the conversion failed.
16 |     };
17 | 
18 |     class CPP_PINYIN_EXPORT PinyinResVector : public std::vector<PinyinRes> {
19 |     public:
20 |         // Convert PinyinResVector to std::vector<std::string>
21 |         std::vector<std::string> toStdVector() const;
22 | 
23 |         // Convert PinyinResVector to std::string with delimiter
24 |         std::string toStdStr(const std::string &delimiter = " ") const;
25 |     };
26 | }
27 | #endif //G2PRES_H
28 | 


--------------------------------------------------------------------------------
/src/3rd_include/cpp-pinyin/ToFinal.hpp:
--------------------------------------------------------------------------------
  1 | #include <codecvt>
  2 | #include <locale>
  3 | #include <map>
  4 | #include <regex>
  5 | #include <set>
  6 | #include <string>
  7 | #include <unordered_set>
  8 | 
  9 | using namespace std;
 10 | 
 11 | // 编码转换工具
 12 | wstring_convert<codecvt_utf8_utf16<wchar_t>> converter;
 13 | 
 14 | // 韵母表
 15 | const unordered_set<wstring> _FINALS = {
 16 |     L"i",    L"u",    L"ü",    L"a",     L"ia",   L"ua",   L"o",     L"uo",
 17 |     L"e",    L"ie",   L"üe",   L"ai",    L"uai",  L"ei",   L"uei",   L"ao",
 18 |     L"iao",  L"ou",   L"iou",  L"an",    L"ian",  L"uan",  L"üan",   L"en",
 19 |     L"in",   L"uen",  L"ün",   L"ang",   L"iang", L"uang", L"eng",   L"ing",
 20 |     L"ueng", L"ong",  L"iong", L"er",    L"ê",
 21 | };
 22 | 
 23 | // u -> ü 映射
 24 | const map<wstring, wstring> UV_MAP = {
 25 |     {L"u", L"ü"}, {L"ū", L"ǖ"}, {L"ú", L"ǘ"}, {L"ǔ", L"ǚ"}, {L"ù", L"ǜ"}};
 26 | const set<wstring> U_TONES = {L"u", L"ū", L"ú", L"ǔ", L"ù"};
 27 | const set<wstring> I_TONES = {L"i", L"ī", L"í", L"ǐ", L"ì"};
 28 | 
 29 | // iu -> iou 映射
 30 | const map<wstring, wstring> IU_MAP = {
 31 |     {L"iu", L"iou"}, {L"iū", L"ioū"}, {L"iú", L"ioú"}, {L"iǔ", L"ioǔ"}, {L"iù", L"ioù"}};
 32 | 
 33 | // ui -> uei 映射
 34 | const map<wstring, wstring> UI_MAP = {
 35 |     {L"ui", L"uei"}, {L"uī", L"ueī"}, {L"uí", L"ueí"}, {L"uǐ", L"ueǐ"}, {L"uì", L"ueì"}};
 36 | 
 37 | // un -> uen 映射
 38 | const map<wstring, wstring> UN_MAP = {
 39 |     {L"un", L"uen"}, {L"ūn", L"ūen"}, {L"ún", L"úen"}, {L"ǔn", L"ǔen"}, {L"ùn", L"ùen"}};
 40 | 
 41 | inline wstring convert_zero_consonant(const wstring& pinyin) {
 42 |     wstring raw = pinyin;
 43 |     if (!pinyin.empty() && pinyin[0] == L'y') {
 44 |         wstring no_y = pinyin.substr(1);
 45 |         if (no_y.empty()) return raw;
 46 | 
 47 |         wstring first(1, no_y[0]);
 48 |         if (U_TONES.count(first)) {
 49 |             wstring replaced = UV_MAP.at(first) + no_y.substr(1);
 50 |             if (_FINALS.count(replaced)) return replaced;
 51 |         } else if (I_TONES.count(first)) {
 52 |             if (_FINALS.count(no_y)) return no_y;
 53 |         } else {
 54 |             wstring new_py = L"i" + no_y;
 55 |             if (_FINALS.count(new_py)) return new_py;
 56 |         }
 57 |         return raw;
 58 |     }
 59 | 
 60 |     if (!pinyin.empty() && pinyin[0] == L'w') {
 61 |         wstring no_w = pinyin.substr(1);
 62 |         if (no_w.empty()) return raw;
 63 | 
 64 |         wstring first(1, no_w[0]);
 65 |         if (U_TONES.count(first)) {
 66 |             if (_FINALS.count(no_w)) return no_w;
 67 |         } else {
 68 |             wstring new_py = L"u" + no_w;
 69 |             if (_FINALS.count(new_py)) return new_py;
 70 |         }
 71 |         return raw;
 72 |     }
 73 |     return pinyin;
 74 | }
 75 | 
 76 | inline wstring convert_uv(const wstring& pinyin) {
 77 |     if (pinyin.length() < 2) return pinyin;
 78 | 
 79 |     wchar_t first = pinyin[0];
 80 |     if (first != L'j' && first != L'q' && first != L'x') return pinyin;
 81 | 
 82 |     wstring second(1, pinyin[1]);
 83 |     if (UV_MAP.find(second) != UV_MAP.end()) {
 84 |         return wstring(1, first) + UV_MAP.at(second) + pinyin.substr(2);
 85 |     }
 86 |     return pinyin;
 87 | }
 88 | 
 89 | inline wstring convert_iou(const wstring& pinyin) {
 90 |     wregex re(LR"((\w+?)(i[ūúǔù]|iu)$)");
 91 |     wsmatch match;
 92 |     if (regex_match(pinyin, match, re) && match.size() == 3) {
 93 |         wstring key = match[2].str();
 94 |         if (IU_MAP.find(key) != IU_MAP.end()) {
 95 |             return match[1].str() + IU_MAP.at(key);
 96 |         }
 97 |     }
 98 |     return pinyin;
 99 | }
100 | 
101 | inline wstring convert_uei(const wstring& pinyin) {
102 |     wregex re(LR"((\w+?)(u[īíǐì]|ui)$)");
103 |     wsmatch match;
104 |     if (regex_match(pinyin, match, re) && match.size() == 3) {
105 |         wstring key = match[2].str();
106 |         if (UI_MAP.find(key) != UI_MAP.end()) {
107 |             return match[1].str() + UI_MAP.at(key);
108 |         }
109 |     }
110 |     return pinyin;
111 | }
112 | 
113 | inline wstring convert_uen(const wstring& pinyin) {
114 |     wregex re(LR"(([a-z]+)(ǔn|un|ùn|ūn|ún)$)");
115 |     wsmatch match;
116 |     if (regex_match(pinyin, match, re) && match.size() == 3) {
117 |         wstring key = match[2].str();
118 |         if (UN_MAP.find(key) != UN_MAP.end()) {
119 |             auto tmp = match[1].str() + UN_MAP.at(key);
120 |             return match[1].str() + UN_MAP.at(key);
121 |         }
122 |     }
123 |     return pinyin;
124 | }
125 | 
126 | inline wstring convert_finals(const wstring& pinyin) {
127 |     wstring result = convert_zero_consonant(pinyin);
128 |     result = convert_uv(result);
129 |     result = convert_iou(result);
130 |     result = convert_uei(result);
131 |     result = convert_uen(result);
132 |     return result;
133 | }
134 | 
135 | // 外部接口
136 | inline string pinyin_to_finals(const string& pinyin) {
137 |     wstring wide_pinyin = converter.from_bytes(pinyin);
138 |     wstring result = convert_finals(wide_pinyin);
139 |     return converter.to_bytes(result);
140 | }


--------------------------------------------------------------------------------
/src/3rd_include/cpp-pinyin/ToneConverter.h:
--------------------------------------------------------------------------------
 1 | #ifndef TUNEUTIL_H
 2 | #define TUNEUTIL_H
 3 | 
 4 | #include <functional>
 5 | #include <unordered_map>
 6 | 
 7 | #include <cpp-pinyin/PinyinGlobal.h>
 8 | #include <string>
 9 | 
10 | namespace Pinyin
11 | {
12 |     class CPP_PINYIN_EXPORT ToneConverter {
13 |     public:
14 |         enum Style {
15 |             // 普通风格，不带声调。如： 中国 -> ``zhong guo``
16 |             NORMAL = 0,
17 |             // 标准声调风格，拼音声调在韵母第一个字母上（默认风格）。如： 中国 -> ``zhōng guó``
18 |             TONE = 1
19 |         };
20 | 
21 |         ToneConverter() {}
22 |         virtual ~ToneConverter() {}
23 | 
24 |         std::u16string convert(std::u16string str, int style, bool v_to_u = false, bool neutral_tone_with_five = false) const;
25 | 
26 |     protected:
27 |         std::unordered_map<int, std::function<std::u16string(const std::u16string &pinyin, bool v_to_u, bool neutral_tone_with_five)>>
28 |         m_converts;
29 |     };
30 | }
31 | 
32 | #endif //TUNEUTIL_H
33 | 


--------------------------------------------------------------------------------
/src/3rd_include/cpp-pinyin/U16Str.h:
--------------------------------------------------------------------------------
 1 | #ifndef U16STR_H
 2 | #define U16STR_H
 3 | 
 4 | #include <string>
 5 | 
 6 | #include <cpp-pinyin/PinyinGlobal.h>
 7 | 
 8 | namespace Pinyin
 9 | {
10 |     std::string CPP_PINYIN_EXPORT u16strToUtf8str(const char16_t &ch16);
11 |     std::string CPP_PINYIN_EXPORT u16strToUtf8str(const std::u16string &u16str);
12 |     std::u16string CPP_PINYIN_EXPORT utf8strToU16str(const std::string &utf8str);
13 | }
14 | #endif //U16STR_H
15 | 


--------------------------------------------------------------------------------
/src/3rd_include/cppjieba/DictTrie.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef CPPJIEBA_DICT_TRIE_HPP
  2 | #define CPPJIEBA_DICT_TRIE_HPP
  3 | 
  4 | #include <iostream>
  5 | #include <fstream>
  6 | #include <map>
  7 | #include <string>
  8 | #include <cstring>
  9 | #include <cstdlib>
 10 | #include <stdint.h>
 11 | #include <cmath>
 12 | #include <limits>
 13 | #include "limonp/StringUtil.hpp"
 14 | #include "limonp/Logging.hpp"
 15 | #include "Unicode.hpp"
 16 | #include "Trie.hpp"
 17 | 
 18 | namespace cppjieba {
 19 | 
 20 | using namespace limonp;
 21 | 
 22 | const double MIN_DOUBLE = -3.14e+100;
 23 | const double MAX_DOUBLE = 3.14e+100;
 24 | const size_t DICT_COLUMN_NUM = 3;
 25 | const char* const UNKNOWN_TAG = "";
 26 | 
 27 | class DictTrie {
 28 |  public:
 29 |   enum UserWordWeightOption {
 30 |     WordWeightMin,
 31 |     WordWeightMedian,
 32 |     WordWeightMax,
 33 |   }; // enum UserWordWeightOption
 34 | 
 35 |   DictTrie(const string& dict_path, const string& user_dict_paths = "", UserWordWeightOption user_word_weight_opt = WordWeightMedian) {
 36 |     Init(dict_path, user_dict_paths, user_word_weight_opt);
 37 |   }
 38 | 
 39 |   ~DictTrie() {
 40 |     delete trie_;
 41 |   }
 42 | 
 43 |   bool InsertUserWord(const string& word, const string& tag = UNKNOWN_TAG) {
 44 |     DictUnit node_info;
 45 |     if (!MakeNodeInfo(node_info, word, user_word_default_weight_, tag)) {
 46 |       return false;
 47 |     }
 48 |     active_node_infos_.push_back(node_info);
 49 |     trie_->InsertNode(node_info.word, &active_node_infos_.back());
 50 |     return true;
 51 |   }
 52 | 
 53 |   bool InsertUserWord(const string& word,int freq, const string& tag = UNKNOWN_TAG) {
 54 |     DictUnit node_info;
 55 |     double weight = freq ? log(1.0 * freq / freq_sum_) : user_word_default_weight_ ;
 56 |     if (!MakeNodeInfo(node_info, word, weight , tag)) {
 57 |       return false;
 58 |     }
 59 |     active_node_infos_.push_back(node_info);
 60 |     trie_->InsertNode(node_info.word, &active_node_infos_.back());
 61 |     return true;
 62 |   }
 63 | 
 64 |   bool DeleteUserWord(const string& word, const string& tag = UNKNOWN_TAG) {
 65 |     DictUnit node_info;
 66 |     if (!MakeNodeInfo(node_info, word, user_word_default_weight_, tag)) {
 67 |       return false;
 68 |     }
 69 |     trie_->DeleteNode(node_info.word, &node_info);
 70 |     return true;
 71 |   }
 72 |   
 73 |   const DictUnit* Find(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) const {
 74 |     return trie_->Find(begin, end);
 75 |   }
 76 | 
 77 |   void Find(RuneStrArray::const_iterator begin, 
 78 |         RuneStrArray::const_iterator end, 
 79 |         vector<struct Dag>&res,
 80 |         size_t max_word_len = MAX_WORD_LENGTH) const {
 81 |     trie_->Find(begin, end, res, max_word_len);
 82 |   }
 83 | 
 84 |   bool Find(const string& word)
 85 |   {
 86 |     const DictUnit *tmp = NULL;
 87 |     RuneStrArray runes;
 88 |     if (!DecodeUTF8RunesInString(word, runes))
 89 |     {
 90 |       XLOG(ERROR) << "Decode failed.";
 91 |     }
 92 |     tmp = Find(runes.begin(), runes.end());
 93 |     if (tmp == NULL)
 94 |     {
 95 |       return false;
 96 |     }
 97 |     else
 98 |     {
 99 |       return true;
100 |     }
101 |   }
102 | 
103 |   bool IsUserDictSingleChineseWord(const Rune& word) const {
104 |     return IsIn(user_dict_single_chinese_word_, word);
105 |   }
106 | 
107 |   double GetMinWeight() const {
108 |     return min_weight_;
109 |   }
110 | 
111 |   void InserUserDictNode(const string& line) {
112 |     vector<string> buf;
113 |     DictUnit node_info;
114 |     Split(line, buf, " ");
115 |     if(buf.size() == 1){
116 |           MakeNodeInfo(node_info, 
117 |                 buf[0], 
118 |                 user_word_default_weight_,
119 |                 UNKNOWN_TAG);
120 |         } else if (buf.size() == 2) {
121 |           MakeNodeInfo(node_info, 
122 |                 buf[0], 
123 |                 user_word_default_weight_,
124 |                 buf[1]);
125 |         } else if (buf.size() == 3) {
126 |           int freq = atoi(buf[1].c_str());
127 |           assert(freq_sum_ > 0.0);
128 |           double weight = log(1.0 * freq / freq_sum_);
129 |           MakeNodeInfo(node_info, buf[0], weight, buf[2]);
130 |         }
131 |         static_node_infos_.push_back(node_info);
132 |         if (node_info.word.size() == 1) {
133 |           user_dict_single_chinese_word_.insert(node_info.word[0]);
134 |         }
135 |   }
136 |   
137 |   void LoadUserDict(const vector<string>& buf) {
138 |     for (size_t i = 0; i < buf.size(); i++) {
139 |       InserUserDictNode(buf[i]);
140 |     }
141 |   }
142 | 
143 |    void LoadUserDict(const set<string>& buf) {
144 |     std::set<string>::const_iterator iter;
145 |     for (iter = buf.begin(); iter != buf.end(); iter++){
146 |       InserUserDictNode(*iter);
147 |     }
148 |   }
149 | 
150 |   void LoadUserDict(const string& filePaths) {
151 |     vector<string> files = limonp::Split(filePaths, "|;");
152 |     for (size_t i = 0; i < files.size(); i++) {
153 |       ifstream ifs(files[i].c_str());
154 |       XCHECK(ifs.is_open()) << "open " << files[i] << " failed"; 
155 |       string line;
156 |       
157 |       while(getline(ifs, line)) {
158 |         if (line.size() == 0) {
159 |           continue;
160 |         }
161 |         InserUserDictNode(line);
162 |       }
163 |     }
164 |   }
165 | 
166 | 
167 |  private:
168 |   void Init(const string& dict_path, const string& user_dict_paths, UserWordWeightOption user_word_weight_opt) {
169 |     LoadDict(dict_path);
170 |     freq_sum_ = CalcFreqSum(static_node_infos_);
171 |     CalculateWeight(static_node_infos_, freq_sum_);
172 |     SetStaticWordWeights(user_word_weight_opt);
173 | 
174 |     if (user_dict_paths.size()) {
175 |       LoadUserDict(user_dict_paths);
176 |     }
177 |     Shrink(static_node_infos_);
178 |     CreateTrie(static_node_infos_);
179 |   }
180 |   
181 |   void CreateTrie(const vector<DictUnit>& dictUnits) {
182 |     assert(dictUnits.size());
183 |     vector<Unicode> words;
184 |     vector<const DictUnit*> valuePointers;
185 |     for (size_t i = 0 ; i < dictUnits.size(); i ++) {
186 |       words.push_back(dictUnits[i].word);
187 |       valuePointers.push_back(&dictUnits[i]);
188 |     }
189 | 
190 |     trie_ = new Trie(words, valuePointers);
191 |   }
192 | 
193 |   
194 | 
195 | 
196 |   bool MakeNodeInfo(DictUnit& node_info,
197 |         const string& word, 
198 |         double weight, 
199 |         const string& tag) {
200 |     if (!DecodeUTF8RunesInString(word, node_info.word)) {
201 |       XLOG(ERROR) << "UTF-8 decode failed for dict word: " << word;
202 |       return false;
203 |     }
204 |     node_info.weight = weight;
205 |     node_info.tag = tag;
206 |     return true;
207 |   }
208 | 
209 |   void LoadDict(const string& filePath) {
210 |     ifstream ifs(filePath.c_str());
211 |     XCHECK(ifs.is_open()) << "open " << filePath << " failed.";
212 |     string line;
213 |     vector<string> buf;
214 | 
215 |     DictUnit node_info;
216 |     while (getline(ifs, line)) {
217 |       Split(line, buf, " ");
218 |       XCHECK(buf.size() == DICT_COLUMN_NUM) << "split result illegal, line:" << line;
219 |       MakeNodeInfo(node_info, 
220 |             buf[0], 
221 |             atof(buf[1].c_str()), 
222 |             buf[2]);
223 |       static_node_infos_.push_back(node_info);
224 |     }
225 |   }
226 | 
227 |   static bool WeightCompare(const DictUnit& lhs, const DictUnit& rhs) {
228 |     return lhs.weight < rhs.weight;
229 |   }
230 | 
231 |   void SetStaticWordWeights(UserWordWeightOption option) {
232 |     XCHECK(!static_node_infos_.empty());
233 |     vector<DictUnit> x = static_node_infos_;
234 |     sort(x.begin(), x.end(), WeightCompare);
235 |     min_weight_ = x[0].weight;
236 |     max_weight_ = x[x.size() - 1].weight;
237 |     median_weight_ = x[x.size() / 2].weight;
238 |     switch (option) {
239 |      case WordWeightMin:
240 |        user_word_default_weight_ = min_weight_;
241 |        break;
242 |      case WordWeightMedian:
243 |        user_word_default_weight_ = median_weight_;
244 |        break;
245 |      default:
246 |        user_word_default_weight_ = max_weight_;
247 |        break;
248 |     }
249 |   }
250 | 
251 |   double CalcFreqSum(const vector<DictUnit>& node_infos) const {
252 |     double sum = 0.0;
253 |     for (size_t i = 0; i < node_infos.size(); i++) {
254 |       sum += node_infos[i].weight;
255 |     }
256 |     return sum;
257 |   }
258 | 
259 |   void CalculateWeight(vector<DictUnit>& node_infos, double sum) const {
260 |     assert(sum > 0.0);
261 |     for (size_t i = 0; i < node_infos.size(); i++) {
262 |       DictUnit& node_info = node_infos[i];
263 |       assert(node_info.weight > 0.0);
264 |       node_info.weight = log(double(node_info.weight)/sum);
265 |     }
266 |   }
267 | 
268 |   void Shrink(vector<DictUnit>& units) const {
269 |     vector<DictUnit>(units.begin(), units.end()).swap(units);
270 |   }
271 | 
272 |   vector<DictUnit> static_node_infos_;
273 |   deque<DictUnit> active_node_infos_; // must not be vector
274 |   Trie * trie_;
275 | 
276 |   double freq_sum_;
277 |   double min_weight_;
278 |   double max_weight_;
279 |   double median_weight_;
280 |   double user_word_default_weight_;
281 |   unordered_set<Rune> user_dict_single_chinese_word_;
282 | };
283 | }
284 | 
285 | #endif
286 | 


--------------------------------------------------------------------------------
/src/3rd_include/cppjieba/FullSegment.hpp:
--------------------------------------------------------------------------------
 1 | #ifndef CPPJIEBA_FULLSEGMENT_H
 2 | #define CPPJIEBA_FULLSEGMENT_H
 3 | 
 4 | #include <algorithm>
 5 | #include <set>
 6 | #include <cassert>
 7 | #include "limonp/Logging.hpp"
 8 | #include "DictTrie.hpp"
 9 | #include "SegmentBase.hpp"
10 | #include "Unicode.hpp"
11 | 
12 | namespace cppjieba {
13 | class FullSegment: public SegmentBase {
14 |  public:
15 |   FullSegment(const string& dictPath) {
16 |     dictTrie_ = new DictTrie(dictPath);
17 |     isNeedDestroy_ = true;
18 |   }
19 |   FullSegment(const DictTrie* dictTrie)
20 |     : dictTrie_(dictTrie), isNeedDestroy_(false) {
21 |     assert(dictTrie_);
22 |   }
23 |   ~FullSegment() {
24 |     if (isNeedDestroy_) {
25 |       delete dictTrie_;
26 |     }
27 |   }
28 |   void Cut(const string& sentence, 
29 |         vector<string>& words) const {
30 |     vector<Word> tmp;
31 |     Cut(sentence, tmp);
32 |     GetStringsFromWords(tmp, words);
33 |   }
34 |   void Cut(const string& sentence, 
35 |         vector<Word>& words) const {
36 |     PreFilter pre_filter(symbols_, sentence);
37 |     PreFilter::Range range;
38 |     vector<WordRange> wrs;
39 |     wrs.reserve(sentence.size()/2);
40 |     while (pre_filter.HasNext()) {
41 |       range = pre_filter.Next();
42 |       Cut(range.begin, range.end, wrs);
43 |     }
44 |     words.clear();
45 |     words.reserve(wrs.size());
46 |     GetWordsFromWordRanges(sentence, wrs, words);
47 |   }
48 |   void Cut(RuneStrArray::const_iterator begin, 
49 |         RuneStrArray::const_iterator end, 
50 |         vector<WordRange>& res) const {
51 |     // result of searching in trie tree
52 |     LocalVector<pair<size_t, const DictUnit*> > tRes;
53 | 
54 |     // max index of res's words
55 |     size_t maxIdx = 0;
56 | 
57 |     // always equals to (uItr - begin)
58 |     size_t uIdx = 0;
59 | 
60 |     // tmp variables
61 |     size_t wordLen = 0;
62 |     assert(dictTrie_);
63 |     vector<struct Dag> dags;
64 |     dictTrie_->Find(begin, end, dags);
65 |     for (size_t i = 0; i < dags.size(); i++) {
66 |       for (size_t j = 0; j < dags[i].nexts.size(); j++) {
67 |         size_t nextoffset = dags[i].nexts[j].first;
68 |         assert(nextoffset < dags.size());
69 |         const DictUnit* du = dags[i].nexts[j].second;
70 |         if (du == NULL) {
71 |           if (dags[i].nexts.size() == 1 && maxIdx <= uIdx) {
72 |             WordRange wr(begin + i, begin + nextoffset);
73 |             res.push_back(wr);
74 |           }
75 |         } else {
76 |           wordLen = du->word.size();
77 |           if (wordLen >= 2 || (dags[i].nexts.size() == 1 && maxIdx <= uIdx)) {
78 |             WordRange wr(begin + i, begin + nextoffset);
79 |             res.push_back(wr);
80 |           }
81 |         }
82 |         maxIdx = uIdx + wordLen > maxIdx ? uIdx + wordLen : maxIdx;
83 |       }
84 |       uIdx++;
85 |     }
86 |   }
87 |  private:
88 |   const DictTrie* dictTrie_;
89 |   bool isNeedDestroy_;
90 | };
91 | }
92 | 
93 | #endif
94 | 


--------------------------------------------------------------------------------
/src/3rd_include/cppjieba/HMMModel.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef CPPJIEBA_HMMMODEL_H
  2 | #define CPPJIEBA_HMMMODEL_H
  3 | 
  4 | #include "limonp/StringUtil.hpp"
  5 | #include "Trie.hpp"
  6 | 
  7 | namespace cppjieba {
  8 | 
  9 | using namespace limonp;
 10 | typedef unordered_map<Rune, double> EmitProbMap;
 11 | 
 12 | struct HMMModel {
 13 |   /*
 14 |    * STATUS:
 15 |    * 0: HMMModel::B, 1: HMMModel::E, 2: HMMModel::M, 3:HMMModel::S
 16 |    * */
 17 |   enum {B = 0, E = 1, M = 2, S = 3, STATUS_SUM = 4};
 18 | 
 19 |   HMMModel(const string& modelPath) {
 20 |     memset(startProb, 0, sizeof(startProb));
 21 |     memset(transProb, 0, sizeof(transProb));
 22 |     statMap[0] = 'B';
 23 |     statMap[1] = 'E';
 24 |     statMap[2] = 'M';
 25 |     statMap[3] = 'S';
 26 |     emitProbVec.push_back(&emitProbB);
 27 |     emitProbVec.push_back(&emitProbE);
 28 |     emitProbVec.push_back(&emitProbM);
 29 |     emitProbVec.push_back(&emitProbS);
 30 |     LoadModel(modelPath);
 31 |   }
 32 |   ~HMMModel() {
 33 |   }
 34 |   void LoadModel(const string& filePath) {
 35 |     ifstream ifile(filePath.c_str());
 36 |     XCHECK(ifile.is_open()) << "open " << filePath << " failed";
 37 |     string line;
 38 |     vector<string> tmp;
 39 |     vector<string> tmp2;
 40 |     //Load startProb
 41 |     XCHECK(GetLine(ifile, line));
 42 |     Split(line, tmp, " ");
 43 |     XCHECK(tmp.size() == STATUS_SUM);
 44 |     for (size_t j = 0; j< tmp.size(); j++) {
 45 |       startProb[j] = atof(tmp[j].c_str());
 46 |     }
 47 | 
 48 |     //Load transProb
 49 |     for (size_t i = 0; i < STATUS_SUM; i++) {
 50 |       XCHECK(GetLine(ifile, line));
 51 |       Split(line, tmp, " ");
 52 |       XCHECK(tmp.size() == STATUS_SUM);
 53 |       for (size_t j =0; j < STATUS_SUM; j++) {
 54 |         transProb[i][j] = atof(tmp[j].c_str());
 55 |       }
 56 |     }
 57 | 
 58 |     //Load emitProbB
 59 |     XCHECK(GetLine(ifile, line));
 60 |     XCHECK(LoadEmitProb(line, emitProbB));
 61 | 
 62 |     //Load emitProbE
 63 |     XCHECK(GetLine(ifile, line));
 64 |     XCHECK(LoadEmitProb(line, emitProbE));
 65 | 
 66 |     //Load emitProbM
 67 |     XCHECK(GetLine(ifile, line));
 68 |     XCHECK(LoadEmitProb(line, emitProbM));
 69 | 
 70 |     //Load emitProbS
 71 |     XCHECK(GetLine(ifile, line));
 72 |     XCHECK(LoadEmitProb(line, emitProbS));
 73 |   }
 74 |   double GetEmitProb(const EmitProbMap* ptMp, Rune key, 
 75 |         double defVal)const {
 76 |     EmitProbMap::const_iterator cit = ptMp->find(key);
 77 |     if (cit == ptMp->end()) {
 78 |       return defVal;
 79 |     }
 80 |     return cit->second;
 81 |   }
 82 |   // Advances `ifile` to the next meaningful line and leaves it, after Trim(), in `line`.
 83 |   // Lines that are empty after trimming and lines starting with "#" are skipped.
 84 |   // Returns false once the stream is exhausted without finding such a line.
 85 |   bool GetLine(ifstream& ifile, string& line) {
 86 |     for (;;) {
 87 |       if (!getline(ifile, line)) {
 88 |         return false;
 89 |       }
 90 |       Trim(line);
 91 |       const bool skippable = line.empty() || StartsWith(line, "#");
 92 |       if (!skippable) return true;
 93 |     }
 94 |   }
 95 |   // Parses "<rune>:<value>,<rune>:<value>,..." into `mp`; false for an empty line or (after logging) a bad entry.
 96 |   bool LoadEmitProb(const string& line, EmitProbMap& mp) {
 97 |     if (line.empty()) return false;
 98 |     vector<string> entries, keyVal;
 99 |     Unicode runes;
100 |     Split(line, entries, ",");
101 |     for (vector<string>::const_iterator it = entries.begin(); it != entries.end(); ++it) {
102 |       Split(*it, keyVal, ":");
103 |       if (keyVal.size() != 2) {
104 |         XLOG(ERROR) << "emitProb illegal.";
105 |         return false;
106 |       }
107 |       const bool oneRune = DecodeUTF8RunesInString(keyVal[0], runes) && runes.size() == 1;
108 |       if (!oneRune) {
109 |         XLOG(ERROR) << "TransCode failed.";
110 |         return false;
111 |       }
112 |       mp[runes[0]] = atof(keyVal[1].c_str());
113 |     }
114 |     return true;
115 |   }
116 | 
117 |   char statMap[STATUS_SUM];
118 |   double startProb[STATUS_SUM];
119 |   double transProb[STATUS_SUM][STATUS_SUM];
120 |   EmitProbMap emitProbB;
121 |   EmitProbMap emitProbE;
122 |   EmitProbMap emitProbM;
123 |   EmitProbMap emitProbS;
124 |   vector<EmitProbMap* > emitProbVec;
125 | }; // struct HMMModel
126 | 
127 | } // namespace cppjieba
128 | 
129 | #endif
130 | 


--------------------------------------------------------------------------------
/src/3rd_include/cppjieba/HMMSegment.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef CPPJIBEA_HMMSEGMENT_H
  2 | #define CPPJIBEA_HMMSEGMENT_H
  3 | 
  4 | #include <iostream>
  5 | #include <fstream>
  6 | #include <memory.h>
  7 | #include <cassert>
  8 | #include "HMMModel.hpp"
  9 | #include "SegmentBase.hpp"
 10 | 
 11 | namespace cppjieba {
 12 | class HMMSegment: public SegmentBase {
 13 |  public:
 14 |   // Owning constructor: loads a private HMMModel from `filePath`; the destructor deletes it.
 15 |   HMMSegment(const string& filePath) : model_(new HMMModel(filePath)), isNeedDestroy_(true) {}
 16 |   // Borrowing constructor: `model` must be non-null; it is never deleted by this object.
 17 |   HMMSegment(const HMMModel* model) 
 18 |   : model_(model), isNeedDestroy_(false) {
 19 |     assert(model_);  // same precondition MPSegment asserts on its borrowed DictTrie
 20 |   }
 21 |   ~HMMSegment() {
 22 |     if (isNeedDestroy_) {
 23 |       delete model_;
 24 |     }
 25 |   }
 25 | 
 26 |   // Cuts `sentence` into Words, then hands them to GetStringsFromWords to fill `words`.
 27 |   void Cut(const string& sentence, vector<string>& words) const {
 28 |     vector<Word> located;
 29 |     Cut(sentence, located);
 30 |     GetStringsFromWords(located, words);
 31 |   }
 32 |   // Walks the PreFilter ranges of `sentence`, cuts each one, then rebuilds `words` from the collected ranges.
 33 |   void Cut(const string& sentence, 
 34 |         vector<Word>& words) const {
 35 |     vector<WordRange> ranges;
 36 |     ranges.reserve(sentence.size() / 2);
 37 |     PreFilter filter(symbols_, sentence);
 38 |     while (filter.HasNext()) {
 39 |       const PreFilter::Range piece = filter.Next();
 40 |       Cut(piece.begin, piece.end, ranges);
 41 |     }
 42 |     words.clear();
 43 |     words.reserve(ranges.size());
 44 |     GetWordsFromWordRanges(sentence, ranges, words);
 45 |   }
 46 |   void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<WordRange>& res) const { // Cuts [begin, end): ASCII runes are grouped by the letter/number rules, every other run goes through InternalCut.
 47 |     RuneStrArray::const_iterator left = begin;
 48 |     RuneStrArray::const_iterator right = begin;
 49 |     while (right != end) {
 50 |       if (right->rune < 0x80) { // ASCII rune
 51 |         if (left != right) {
 52 |           InternalCut(left, right, res); // flush the pending non-ASCII run [left, right)
 53 |         }
 54 |         left = right;
 55 |         do { // executes once; `break` leaves as soon as a rule has consumed input
 56 |           right = SequentialLetterRule(left, end);
 57 |           if (right != left) {
 58 |             break;
 59 |           }
 60 |           right = NumbersRule(left, end);
 61 |           if (right != left) {
 62 |             break;
 63 |           }
 64 |           right ++; // neither rule matched: this ASCII rune becomes a word by itself
 65 |         } while (false);
 66 |         WordRange wr(left, right - 1); // second bound is the last rune of the word, not one past it
 67 |         res.push_back(wr);
 68 |         left = right;
 69 |       } else {
 70 |         right++; // non-ASCII: keep extending the pending run
 71 |       }
 72 |     }
 73 |     if (left != right) { // non-ASCII run left over at the end of the range
 74 |       InternalCut(left, right, res);
 75 |     }
 76 |   }
 77 |  private:
 78 |   // Sequential-letters rule: if *begin is an ASCII letter, consume it plus every following
 79 |   // ASCII letter or digit and return the position after the run; otherwise return `begin`.
 80 |   RuneStrArray::const_iterator SequentialLetterRule(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) const {
 81 |     const Rune first = begin->rune;
 82 |     const bool startsWithLetter = ('a' <= first && first <= 'z') || ('A' <= first && first <= 'Z');
 83 |     if (!startsWithLetter) {
 84 |       return begin;
 85 |     }
 86 |     RuneStrArray::const_iterator cur = begin + 1;
 87 |     for (; cur != end; ++cur) {
 88 |       const Rune r = cur->rune;
 89 |       const bool alnum = ('a' <= r && r <= 'z') || ('A' <= r && r <= 'Z') || ('0' <= r && r <= '9');
 90 |       if (!alnum) {
 91 |         break;
 92 |       }
 93 |     }
 94 |     return cur;
 95 |   }
 96 |   // Numbers rule: if *begin is an ASCII digit, consume it plus every following
 97 |   // ASCII digit or '.' and return the position after the run; otherwise return `begin`.
 98 |   RuneStrArray::const_iterator NumbersRule(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) const {
 99 |     const Rune first = begin->rune;
100 |     if (!('0' <= first && first <= '9')) {
101 |       return begin;
102 |     }
103 |     RuneStrArray::const_iterator cur = begin + 1;
104 |     while (cur != end) {
105 |       const Rune r = cur->rune;
106 |       const bool numeric = ('0' <= r && r <= '9') || r == '.';
107 |       if (!numeric) {
108 |         break;
109 |       }
110 |       ++cur;
111 |     }
112 |     return cur;
113 |   }
114 |   // Runs Viterbi over [begin, end) and appends one WordRange per word to `res`; a word is
115 |   // closed at every position whose state is odd (E or S, per the original inline note).
116 |   void InternalCut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<WordRange>& res) const {
117 |     vector<size_t> status;
118 |     Viterbi(begin, end, status);
119 |     RuneStrArray::const_iterator wordStart = begin;
120 |     for (size_t i = 0; i < status.size(); i++) {
121 |       if (status[i] % 2 == 0) { // even state: the current word continues
122 |         continue;
123 |       }
124 |       const RuneStrArray::const_iterator wordEnd = begin + i + 1; // one past the word's last rune
125 |       res.push_back(WordRange(wordStart, wordEnd - 1));
126 |       wordStart = wordEnd;
127 |     }
128 |   }
129 | 
130 |   // Viterbi decoding over [begin, end): fills `status` with one HMMModel state index per rune.
131 |   // Scores are summed (startProb + transProb + emitProb), runes missing from an emission
132 |   // table score MIN_DOUBLE, and the path is forced to finish in state E or S.
133 |   // An empty range yields an empty `status` instead of indexing empty tables.
134 |   void Viterbi(RuneStrArray::const_iterator begin, 
135 |         RuneStrArray::const_iterator end, 
136 |         vector<size_t>& status) const {
137 |     const size_t Y = HMMModel::STATUS_SUM;
138 |     const size_t X = end - begin;
139 |     if (X == 0) {
140 |       status.clear();
141 |       return;
142 |     }
143 | 
144 |     // Both tables are laid out state-major: cell (x, y) lives at index x + y * X.
145 |     vector<int> path(X * Y);
146 |     vector<double> weight(X * Y);
147 |     // First column: start score plus the emission score of the first rune.
148 |     for (size_t y = 0; y < Y; y++) {
149 |       weight[0 + y * X] = model_->startProb[y] + model_->GetEmitProb(model_->emitProbVec[y], begin->rune, MIN_DOUBLE);
150 |       path[0 + y * X] = -1;  // no predecessor for the first rune
151 |     }
152 | 
153 |     for (size_t x = 1; x < X; x++) {
154 |       for (size_t y = 0; y < Y; y++) {
155 |         const size_t now = x + y * X;
156 |         weight[now] = MIN_DOUBLE;
157 |         path[now] = HMMModel::E; // fallback predecessor when no candidate beats MIN_DOUBLE
158 |         const double emitProb = model_->GetEmitProb(model_->emitProbVec[y], (begin + x)->rune, MIN_DOUBLE);
159 |         for (size_t preY = 0; preY < Y; preY++) {
160 |           const size_t old = x - 1 + preY * X;
161 |           const double tmp = weight[old] + model_->transProb[preY][y] + emitProb;
162 |           if (tmp > weight[now]) {
163 |             weight[now] = tmp;
164 |             path[now] = static_cast<int>(preY);
165 |           }
166 |         }
167 |       }
168 |     }
169 | 
170 |     // Pick the better of the two permitted final states (ties go to E).
171 |     const double endE = weight[X - 1 + HMMModel::E * X];
172 |     const double endS = weight[X - 1 + HMMModel::S * X];
173 |     size_t stat = (endE >= endS) ? HMMModel::E : HMMModel::S;
174 | 
175 |     // Follow the predecessor links backwards; counting down in size_t avoids the
176 |     // size_t -> int narrowing of the loop index.
177 |     status.resize(X);
178 |     for (size_t x = X; x-- > 0; ) {
179 |       status[x] = stat;
180 |       stat = path[x + stat * X];
181 |     }
182 |   }
183 | 
184 |   const HMMModel* model_;
185 |   bool isNeedDestroy_;
186 | }; // class HMMSegment
187 | 
188 | } // namespace cppjieba
189 | 
190 | #endif
191 | 


--------------------------------------------------------------------------------
/src/3rd_include/cppjieba/Jieba.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef CPPJIEAB_JIEBA_H
  2 | #define CPPJIEAB_JIEBA_H
  3 | 
  4 | #include "QuerySegment.hpp"
  5 | #include "KeywordExtractor.hpp"
  6 | 
  7 | namespace cppjieba {
  8 | 
  9 | class Jieba {
 10 |  public:
 11 |   Jieba(const string& dict_path = "", // Each empty path argument is replaced by getPath()'s default location for that file.
 12 |         const string& model_path = "",
 13 |         const string& user_dict_path = "", 
 14 |         const string& idf_path = "", 
 15 |         const string& stop_word_path = "") 
 16 |     : dict_trie_(getPath(dict_path, "jieba.dict.utf8"), getPath(user_dict_path, "user.dict.utf8")),
 17 |       model_(getPath(model_path, "hmm_model.utf8")),
 18 |       mp_seg_(&dict_trie_), // from here on every component receives pointers to dict_trie_ / model_
 19 |       hmm_seg_(&model_),
 20 |       mix_seg_(&dict_trie_, &model_),
 21 |       full_seg_(&dict_trie_),
 22 |       query_seg_(&dict_trie_, &model_),
 23 |       extractor(&dict_trie_, &model_, 
 24 |                 getPath(idf_path, "idf.utf8"), 
 25 |                 getPath(stop_word_path, "stop_words.utf8")) {
 26 |   }
 27 |   ~Jieba() {
 28 |   }
 29 | 
 30 |   struct LocWord { // a word plus a begin/end pair; not referenced by any member of this class
 31 |     string word;
 32 |     size_t begin;
 33 |     size_t end;
 34 |   }; // struct LocWord
 35 | 
 36 |   void Cut(const string& sentence, vector<string>& words, bool hmm = true) const { // forwards to mix_seg_, passing `hmm` through
 37 |     mix_seg_.Cut(sentence, words, hmm);
 38 |   }
 39 |   void Cut(const string& sentence, vector<Word>& words, bool hmm = true) const { // same, producing Word objects
 40 |     mix_seg_.Cut(sentence, words, hmm);
 41 |   }
 42 |   void CutAll(const string& sentence, vector<string>& words) const { // forwards to full_seg_
 43 |     full_seg_.Cut(sentence, words);
 44 |   }
 45 |   void CutAll(const string& sentence, vector<Word>& words) const {
 46 |     full_seg_.Cut(sentence, words);
 47 |   }
 48 |   void CutForSearch(const string& sentence, vector<string>& words, bool hmm = true) const { // forwards to query_seg_
 49 |     query_seg_.Cut(sentence, words, hmm);
 50 |   }
 51 |   void CutForSearch(const string& sentence, vector<Word>& words, bool hmm = true) const {
 52 |     query_seg_.Cut(sentence, words, hmm);
 53 |   }
 54 |   void CutHMM(const string& sentence, vector<string>& words) const { // forwards to hmm_seg_
 55 |     hmm_seg_.Cut(sentence, words);
 56 |   }
 57 |   void CutHMM(const string& sentence, vector<Word>& words) const {
 58 |     hmm_seg_.Cut(sentence, words);
 59 |   }
 60 |   void CutSmall(const string& sentence, vector<string>& words, size_t max_word_len) const { // forwards to mp_seg_ with the caller's max_word_len
 61 |     mp_seg_.Cut(sentence, words, max_word_len);
 62 |   }
 63 |   void CutSmall(const string& sentence, vector<Word>& words, size_t max_word_len) const {
 64 |     mp_seg_.Cut(sentence, words, max_word_len);
 65 |   }
 66 |   
 67 |   void Tag(const string& sentence, vector<pair<string, string> >& words) const { // forwards to mix_seg_.Tag; its bool result is discarded
 68 |     mix_seg_.Tag(sentence, words);
 69 |   }
 70 |   string LookupTag(const string &str) const { // forwards to mix_seg_.LookupTag
 71 |     return mix_seg_.LookupTag(str);
 72 |   }
 73 |   bool InsertUserWord(const string& word, const string& tag = UNKNOWN_TAG) { // forwards to dict_trie_
 74 |     return dict_trie_.InsertUserWord(word, tag);
 75 |   }
 76 | 
 77 |   bool InsertUserWord(const string& word,int freq, const string& tag = UNKNOWN_TAG) { // overload that also forwards `freq`
 78 |     return dict_trie_.InsertUserWord(word,freq, tag);
 79 |   }
 80 | 
 81 |   bool DeleteUserWord(const string& word, const string& tag = UNKNOWN_TAG) { // forwards to dict_trie_
 82 |     return dict_trie_.DeleteUserWord(word, tag);
 83 |   }
 84 |   
 85 |   bool Find(const string& word) // forwards to dict_trie_.Find
 86 |   {
 87 |     return dict_trie_.Find(word);
 88 |   }
 89 | 
 90 |   void ResetSeparators(const string& s) { // applies `s` to the five segmenters below; `extractor` is not touched here
 91 |     //TODO
 92 |     mp_seg_.ResetSeparators(s);
 93 |     hmm_seg_.ResetSeparators(s);
 94 |     mix_seg_.ResetSeparators(s);
 95 |     full_seg_.ResetSeparators(s);
 96 |     query_seg_.ResetSeparators(s);
 97 |   }
 98 | 
 99 |   const DictTrie* GetDictTrie() const { // address of the member trie; valid only while this Jieba exists
100 |     return &dict_trie_;
101 |   } 
102 |   
103 |   const HMMModel* GetHMMModel() const { // address of the member model; valid only while this Jieba exists
104 |     return &model_;
105 |   }
106 | 
107 |   void LoadUserDict(const vector<string>& buf)  { // the three LoadUserDict overloads forward to dict_trie_.LoadUserDict
108 |     dict_trie_.LoadUserDict(buf);
109 |   }
110 | 
111 |   void LoadUserDict(const set<string>& buf)  {
112 |     dict_trie_.LoadUserDict(buf);
113 |   }
114 | 
115 |   void LoadUserDict(const string& path)  {
116 |     dict_trie_.LoadUserDict(path);
117 |   }
118 | 
119 |  private:
120 |   // Joins `dir` and `filename`. An empty `dir` yields `filename`; a `dir` already ending in
121 |   // '/' or '\\' is used as is; otherwise '\\' (on _WIN32) or '/' is inserted between the two.
122 |   static string pathJoin(const string& dir, const string& filename) {
123 |     if (dir.empty()) return filename;
124 | 
125 |     const char tail = dir[dir.length() - 1];
126 |     if (tail == '/' || tail == '\\') {
127 |       return dir + filename;
128 |     }
129 |     #ifdef _WIN32
130 |     const char separator = '\\';
131 |     #else
132 |     const char separator = '/';
133 |     #endif
134 |     return dir + separator + filename;
135 |   }
136 | 
137 |   static string getCurrentDirectory() {  // directory part of this header's __FILE__; "" when it contains no '/' or '\\'
138 |     const string self(__FILE__);
139 |     const size_t cut = self.find_last_of("/\\");
140 |     return (cut != string::npos) ? self.substr(0, cut) : string();
141 |   }
142 | 
143 |   static string getPath(const string& path, const string& default_file) { // Returns `path` unless it is empty; then builds a default location for `default_file`.
144 |     if (path.empty()) { // NOTE(review): the default is derived from __FILE__ as seen at compile time — confirm a `dict` directory exists at the resulting location in this source layout
145 |       string current_dir = getCurrentDirectory();
146 |       string parent_dir = current_dir.substr(0, current_dir.find_last_of("/\\")); // substr(0, npos) keeps the whole string when no separator is left
147 |       string grandparent_dir = parent_dir.substr(0, parent_dir.find_last_of("/\\"));
148 |       return pathJoin(pathJoin(grandparent_dir, "dict"), default_file); // <two levels above this header's directory>/dict/<default_file>
149 |     }
150 |     return path;
151 |   }
152 | 
153 |   DictTrie dict_trie_;
154 |   HMMModel model_;
155 |   
156 |   // They share the same dict trie and model
157 |   MPSegment mp_seg_;
158 |   HMMSegment hmm_seg_;
159 |   MixSegment mix_seg_;
160 |   FullSegment full_seg_;
161 |   QuerySegment query_seg_;
162 | 
163 |  public:
164 |   KeywordExtractor extractor;
165 | }; // class Jieba
166 | 
167 | } // namespace cppjieba
168 | 
169 | #endif // CPPJIEAB_JIEBA_H
170 | 


--------------------------------------------------------------------------------
/src/3rd_include/cppjieba/KeywordExtractor.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef CPPJIEBA_KEYWORD_EXTRACTOR_H
  2 | #define CPPJIEBA_KEYWORD_EXTRACTOR_H
  3 | 
  4 | #include <cmath>
  5 | #include <set>
  6 | #include "MixSegment.hpp"
  7 | 
  8 | namespace cppjieba {
  9 | 
 10 | using namespace limonp;
 11 | using namespace std;
 12 | 
 13 | /*utf8*/
 14 | class KeywordExtractor {
 15 |  public:
 16 |   struct Word { // one extracted keyword
 17 |     string word;
 18 |     vector<size_t> offsets; // byte offsets in the sentence where the word starts (filled by Extract)
 19 |     double weight; // occurrence count multiplied by the word's IDF (see Extract)
 20 |   }; // struct Word
 21 | 
 22 |   KeywordExtractor(const string& dictPath, // builds its own MixSegment from the dictionary/model paths, then loads both tables
 23 |         const string& hmmFilePath, 
 24 |         const string& idfPath, 
 25 |         const string& stopWordPath, 
 26 |         const string& userDict = "") 
 27 |     : segment_(dictPath, hmmFilePath, userDict) {
 28 |     LoadIdfDict(idfPath);
 29 |     LoadStopWordDict(stopWordPath);
 30 |   }
 31 |   KeywordExtractor(const DictTrie* dictTrie, // builds segment_ from an existing trie and model instead of paths
 32 |         const HMMModel* model,
 33 |         const string& idfPath, 
 34 |         const string& stopWordPath) 
 35 |     : segment_(dictTrie, model) {
 36 |     LoadIdfDict(idfPath);
 37 |     LoadStopWordDict(stopWordPath);
 38 |   }
 39 |   ~KeywordExtractor() {
 40 |   }
 41 | 
 42 |   void Extract(const string& sentence, vector<string>& keywords, size_t topN) const {  // appends the top keywords' strings; `keywords` is not cleared
 43 |     vector<Word> ranked;
 44 |     Extract(sentence, ranked, topN);
 45 |     for (vector<Word>::const_iterator it = ranked.begin(); it != ranked.end(); ++it) {
 46 |       keywords.push_back(it->word);
 47 |     }
 48 |   }
 49 | 
 50 |   void Extract(const string& sentence, vector<pair<string, double> >& keywords, size_t topN) const {  // appends (word, weight) pairs; `keywords` is not cleared
 51 |     vector<Word> ranked;
 52 |     Extract(sentence, ranked, topN);
 53 |     for (vector<Word>::const_iterator it = ranked.begin(); it != ranked.end(); ++it) {
 54 |       keywords.push_back(pair<string, double>(it->word, it->weight));
 55 |     }
 56 |   }
 57 | 
 58 |   // Core extractor: cuts `sentence`, counts every word that is neither IsSingleWord() nor a stop
 59 |   // word, scales each count by the word's IDF (idfAverage_ when unknown) and keeps the topN heaviest.
 60 |   // `keywords` is cleared up front, so a failed cut returns an empty result, not stale caller data.
 61 |   void Extract(const string& sentence, vector<Word>& keywords, size_t topN) const {
 62 |     keywords.clear();
 63 |     vector<string> words;
 64 |     segment_.Cut(sentence, words);
 65 | 
 66 |     map<string, Word> wordmap;
 67 |     size_t offset = 0;  // byte offset of words[i] within `sentence`
 68 |     for (size_t i = 0; i < words.size(); ++i) {
 69 |       const size_t start = offset;
 70 |       offset += words[i].size();
 71 |       if (IsSingleWord(words[i]) || stopWords_.find(words[i]) != stopWords_.end()) {
 72 |         continue;
 73 |       }
 74 |       Word& entry = wordmap[words[i]];  // value-initialized on first sight, so weight starts at 0
 75 |       entry.offsets.push_back(start);
 76 |       entry.weight += 1.0;
 77 |     }
 78 |     if (offset != sentence.size()) {  // the pieces must add up to the whole input
 79 |       XLOG(ERROR) << "words illegal";
 80 |       return;
 81 |     }
 82 | 
 83 |     keywords.reserve(wordmap.size());
 84 |     for (map<string, Word>::iterator itr = wordmap.begin(); itr != wordmap.end(); ++itr) {
 85 |       unordered_map<string, double>::const_iterator cit = idfMap_.find(itr->first);
 86 |       itr->second.weight *= (cit != idfMap_.end()) ? cit->second : idfAverage_;
 87 |       itr->second.word = itr->first;
 88 |       keywords.push_back(itr->second);
 89 |     }
 90 |     // Only the first topN slots need to be ordered.
 91 |     topN = min(topN, keywords.size());
 92 |     partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), Compare);
 93 |     keywords.resize(topN);
 94 |   }
 94 |  private:
 95 |   void LoadIdfDict(const string& idfPath) {  // loads "word idf" lines into idfMap_ and sets idfAverage_ to the mean of the loaded values
 96 |     ifstream ifs(idfPath.c_str());
 97 |     XCHECK(ifs.is_open()) << "open " << idfPath << " failed";
 98 |     string line;
 99 |     vector<string> buf;
100 |     double idfSum = 0.0;
101 |     size_t entryCount = 0;  // well-formed lines actually loaded; the denominator of the average
102 |     size_t lineno = 0;      // every physical line, skipped or not; used only in diagnostics
103 |     for (; getline(ifs, line); lineno++) {
104 |       buf.clear();
105 |       if (line.empty()) {
106 |         XLOG(ERROR) << "lineno: " << lineno << " empty. skipped.";
107 |         continue;
108 |       }
109 |       Split(line, buf, " ");
110 |       if (buf.size() != 2) {
111 |         XLOG(ERROR) << "line: " << line << ", lineno: " << lineno << " illegal. skipped.";
112 |         continue;
113 |       }
114 |       const double idf = atof(buf[1].c_str());
115 |       idfMap_[buf[0]] = idf;
116 |       idfSum += idf;
117 |       ++entryCount;
118 |     }
119 | 
120 |     assert(entryCount);  // skipped lines must not dilute the average, so divide by loaded entries only
121 |     idfAverage_ = idfSum / entryCount;
122 |     assert(idfAverage_ > 0.0);
123 |   }
124 |   // Inserts every line of `filePath`, exactly as read by getline, into stopWords_.
125 |   void LoadStopWordDict(const string& filePath) {
126 |     ifstream ifs(filePath.c_str());
127 |     XCHECK(ifs.is_open()) << "open " << filePath << " failed";
128 |     for (string line; getline(ifs, line); ) {
129 |       stopWords_.insert(line);
130 |     }
131 |     assert(stopWords_.size());
132 |   }
133 | 
134 |   static bool Compare(const Word& lhs, const Word& rhs) {  // "heavier first" ordering used by partial_sort in Extract
135 |     return rhs.weight < lhs.weight;
136 |   }
137 | 
138 |   MixSegment segment_;
139 |   unordered_map<string, double> idfMap_;
140 |   double idfAverage_;
141 | 
142 |   unordered_set<string> stopWords_;
143 | }; // class KeywordExtractor
144 | 
145 | inline ostream& operator << (ostream& os, const KeywordExtractor::Word& word) { // Writes the keyword as a JSON-like object; `word.word` is emitted without any escaping.
146 |   return os << "{\"word\": \"" << word.word << "\", \"offset\": " << word.offsets << ", \"weight\": " << word.weight << "}"; 
147 | }
148 | 
149 | } // namespace cppjieba
150 | 
151 | #endif
152 | 
153 | 
154 | 


--------------------------------------------------------------------------------
/src/3rd_include/cppjieba/MPSegment.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef CPPJIEBA_MPSEGMENT_H
  2 | #define CPPJIEBA_MPSEGMENT_H
  3 | 
  4 | #include <algorithm>
  5 | #include <set>
  6 | #include <cassert>
  7 | #include "limonp/Logging.hpp"
  8 | #include "DictTrie.hpp"
  9 | #include "SegmentTagged.hpp"
 10 | #include "PosTagger.hpp"
 11 | 
 12 | namespace cppjieba {
 13 | 
 14 | class MPSegment: public SegmentTagged {
 15 |  public:
 16 |   MPSegment(const string& dictPath, const string& userDictPath = "") // owning: allocates its own DictTrie, deleted in the destructor
 17 |     : dictTrie_(new DictTrie(dictPath, userDictPath)), isNeedDestroy_(true) {
 18 |   }
 19 |   MPSegment(const DictTrie* dictTrie) // borrowing: `dictTrie` must be non-null and is never deleted by this object
 20 |     : dictTrie_(dictTrie), isNeedDestroy_(false) {
 21 |     assert(dictTrie_);
 22 |   }
 23 |   ~MPSegment() {
 24 |     if (isNeedDestroy_) { // true only for the trie allocated by the path-based constructor
 25 |       delete dictTrie_;
 26 |     }
 27 |   }
 28 | 
 29 |   void Cut(const string& sentence, vector<string>& words) const { // same as the three-argument overload with MAX_WORD_LENGTH
 30 |     Cut(sentence, words, MAX_WORD_LENGTH);
 31 |   }
 32 | 
 33 |   void Cut(const string& sentence, // cuts into Words, then converts them with GetStringsFromWords
 34 |         vector<string>& words,
 35 |         size_t max_word_len) const {
 36 |     vector<Word> tmp;
 37 |     Cut(sentence, tmp, max_word_len);
 38 |     GetStringsFromWords(tmp, words);
 39 |   }
 40 |   // Walks the PreFilter ranges of `sentence`, cuts each one, then rebuilds `words` from the collected ranges.
 41 |   void Cut(const string& sentence, 
 42 |         vector<Word>& words, 
 43 |         size_t max_word_len = MAX_WORD_LENGTH) const {
 44 |     vector<WordRange> ranges;
 45 |     ranges.reserve(sentence.size() / 2);
 46 |     PreFilter filter(symbols_, sentence);
 47 |     while (filter.HasNext()) {
 48 |       const PreFilter::Range piece = filter.Next();
 49 |       Cut(piece.begin, piece.end, ranges, max_word_len);
 50 |     }
 51 |     words.clear();
 52 |     words.reserve(ranges.size());
 53 |     GetWordsFromWordRanges(sentence, ranges, words);
 54 |   }
 55 |   // Cuts [begin, end) in three steps: DictTrie::Find fills the DAG (it receives
 56 |   // `max_word_len`), CalcDP scores it, and CutByDag appends the chosen ranges
 57 |   // to `words`.
 58 |   void Cut(RuneStrArray::const_iterator begin,
 59 |            RuneStrArray::const_iterator end,
 60 |            vector<WordRange>& words,
 61 |            size_t max_word_len = MAX_WORD_LENGTH) const {
 62 |     vector<Dag> graph;
 63 |     dictTrie_->Find(begin, end, graph, max_word_len);
 64 |     CalcDP(graph);
 65 |     CutByDag(begin, end, graph, words);
 66 |   }
 67 | 
 68 |   const DictTrie* GetDictTrie() const { // the trie this segmenter cuts with (owned or borrowed)
 69 |     return dictTrie_;
 70 |   }
 71 | 
 72 |   bool Tag(const string& src, vector<pair<string, string> >& res) const { // forwards to tagger_, using this object as the segmenter
 73 |     return tagger_.Tag(src, res, *this);
 74 |   }
 75 | 
 76 |   bool IsUserDictSingleChineseWord(const Rune& value) const { // forwards to the trie's query of the same name
 77 |     return dictTrie_->IsUserDictSingleChineseWord(value);
 78 |   }
 79 |  private:
 80 |   void CalcDP(vector<Dag>& dags) const { // Sets weight and pInfo of every DAG node, scanning from the last position back to the first.
 81 |     size_t nextPos;
 82 |     const DictUnit* p;
 83 |     double val;
 84 | 
 85 |     for (vector<Dag>::reverse_iterator rit = dags.rbegin(); rit != dags.rend(); rit++) {
 86 |       rit->pInfo = NULL;
 87 |       rit->weight = MIN_DOUBLE;
 88 |       assert(!rit->nexts.empty());
 89 |       for (LocalVector<pair<size_t, const DictUnit*> >::const_iterator it = rit->nexts.begin(); it != rit->nexts.end(); it++) { // each candidate is (position index, dictionary entry or NULL)
 90 |         nextPos = it->first;
 91 |         p = it->second;
 92 |         val = 0.0;
 93 |         if (nextPos + 1 < dags.size()) {
 94 |           val += dags[nextPos + 1].weight; // score of the node after nextPos, already final because of the backward scan
 95 |         }
 96 | 
 97 |         if (p) {
 98 |           val += p->weight;
 99 |         } else {
100 |           val += dictTrie_->GetMinWeight(); // no dictionary entry: use the trie's minimum weight instead
101 |         }
102 |         if (val > rit->weight) { // keep the highest-scoring candidate for this node
103 |           rit->pInfo = p;
104 |           rit->weight = val;
105 |         }
106 |       }
107 |     }
108 |   }
109 |   // Walks the DAG from position 0. Where a node carries a dictionary entry, a range as long as
110 |   // that entry's word is appended and the walk jumps past it; otherwise the single rune at
111 |   // that position is appended. `end` is not used.
112 |   void CutByDag(RuneStrArray::const_iterator begin, 
113 |         RuneStrArray::const_iterator end, 
114 |         const vector<Dag>& dags, 
115 |         vector<WordRange>& words) const {
116 |     for (size_t i = 0; i < dags.size(); ) {
117 |       const DictUnit* unit = dags[i].pInfo;
118 |       size_t len = 1;  // no dictionary entry: single rune
119 |       if (unit) {
120 |         assert(unit->word.size() >= 1);
121 |         len = unit->word.size();
122 |       }
123 |       // WordRange takes the first and the last rune of the word.
124 |       words.push_back(WordRange(begin + i, begin + i + len - 1));
125 |       i += len;
126 |     }
127 |   }
128 | 
129 |   const DictTrie* dictTrie_;
130 |   bool isNeedDestroy_;
131 |   PosTagger tagger_;
132 | 
133 | }; // class MPSegment
134 | 
135 | } // namespace cppjieba
136 | 
137 | #endif
138 | 


--------------------------------------------------------------------------------
/src/3rd_include/cppjieba/MixSegment.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef CPPJIEBA_MIXSEGMENT_H
  2 | #define CPPJIEBA_MIXSEGMENT_H
  3 | 
  4 | #include <cassert>
  5 | #include "MPSegment.hpp"
  6 | #include "HMMSegment.hpp"
  7 | #include "limonp/StringUtil.hpp"
  8 | #include "PosTagger.hpp"
  9 | 
 10 | namespace cppjieba {
 11 | class MixSegment: public SegmentTagged {
 12 |  public:
 13 |   MixSegment(const string& mpSegDict, const string& hmmSegDict, // path-based: each sub-segmenter loads its own data
 14 |         const string& userDict = "") 
 15 |     : mpSeg_(mpSegDict, userDict), 
 16 |       hmmSeg_(hmmSegDict) {
 17 |   }
 18 |   MixSegment(const DictTrie* dictTrie, const HMMModel* model) // pointer-based: forwards an existing trie and model to the sub-segmenters
 19 |     : mpSeg_(dictTrie), hmmSeg_(model) {
 20 |   }
 21 |   ~MixSegment() {
 22 |   }
 23 | 
 24 |   void Cut(const string& sentence, vector<string>& words) const { // same as the three-argument overload with hmm = true
 25 |     Cut(sentence, words, true);
 26 |   }
 27 |   void Cut(const string& sentence, vector<string>& words, bool hmm) const { // cuts into Words, then converts them with GetStringsFromWords
 28 |     vector<Word> tmp;
 29 |     Cut(sentence, tmp, hmm);
 30 |     GetStringsFromWords(tmp, words);
 31 |   }
 32 |   // Walks the PreFilter ranges of `sentence`, cuts each one, then rebuilds `words` from the collected ranges.
 33 |   void Cut(const string& sentence, vector<Word>& words, bool hmm = true) const {
 34 |     vector<WordRange> ranges;
 35 |     ranges.reserve(sentence.size() / 2);
 36 |     PreFilter filter(symbols_, sentence);
 37 |     while (filter.HasNext()) {
 38 |       const PreFilter::Range piece = filter.Next();
 39 |       Cut(piece.begin, piece.end, ranges, hmm);
 40 |     }
 41 |     words.clear();
 42 |     words.reserve(ranges.size());
 43 |     GetWordsFromWordRanges(sentence, ranges, words);
 44 |   }
 45 | 
 46 |   void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<WordRange>& res, bool hmm) const { // MP cut first; with `hmm`, runs of leftover single runes are re-cut by the HMM segmenter.
 47 |     if (!hmm) {
 48 |       mpSeg_.Cut(begin, end, res);
 49 |       return;
 50 |     }
 51 |     vector<WordRange> words;
 52 |     assert(end >= begin);
 53 |     words.reserve(end - begin);
 54 |     mpSeg_.Cut(begin, end, words);
 55 | 
 56 |     vector<WordRange> hmmRes; // scratch buffer, emptied after every run
 57 |     hmmRes.reserve(end - begin);
 58 |     for (size_t i = 0; i < words.size(); i++) {
 59 |       //if mp Get a word, it's ok, put it into result
 60 |       if (words[i].left != words[i].right || (words[i].left == words[i].right && mpSeg_.IsUserDictSingleChineseWord(words[i].left->rune))) { // multi-rune word, or a single rune the user dictionary knows
 61 |         res.push_back(words[i]);
 62 |         continue;
 63 |       }
 64 | 
 65 |       // if mp Get a single one and it is not in userdict, collect it in sequence
 66 |       size_t j = i;
 67 |       while (j < words.size() && words[j].left == words[j].right && !mpSeg_.IsUserDictSingleChineseWord(words[j].left->rune)) {
 68 |         j++;
 69 |       }
 70 | 
 71 |       // Cut the sequence with hmm
 72 |       assert(j - 1 >= i); // words[i] itself satisfied the loop condition, so j advanced at least once
 73 |       // TODO
 74 |       hmmSeg_.Cut(words[i].left, words[j - 1].left + 1, hmmRes); // half-open range covering words[i] .. words[j - 1]
 75 |       //put hmm result to result
 76 |       for (size_t k = 0; k < hmmRes.size(); k++) {
 77 |         res.push_back(hmmRes[k]);
 78 |       }
 79 | 
 80 |       //clear tmp vars
 81 |       hmmRes.clear();
 82 | 
 83 |       //let i jump over this piece
 84 |       i = j - 1; // the loop's i++ then lands on words[j]
 85 |     }
 86 |   }
 87 | 
 88 |   const DictTrie* GetDictTrie() const { // the trie held by the MP sub-segmenter
 89 |     return mpSeg_.GetDictTrie();
 90 |   }
 91 | 
 92 |   bool Tag(const string& src, vector<pair<string, string> >& res) const { // forwards to tagger_, using this object as the segmenter
 93 |     return tagger_.Tag(src, res, *this);
 94 |   }
 95 | 
 96 |   string LookupTag(const string &str) const { // forwards to tagger_.LookupTag with this object as the segmenter
 97 |     return tagger_.LookupTag(str, *this);
 98 |   }
 99 | 
100 |  private:
101 |   MPSegment mpSeg_;
102 |   HMMSegment hmmSeg_;
103 |   PosTagger tagger_;
104 | 
105 | }; // class MixSegment
106 | 
107 | } // namespace cppjieba
108 | 
109 | #endif
110 | 


--------------------------------------------------------------------------------
/src/3rd_include/cppjieba/PosTagger.hpp:
--------------------------------------------------------------------------------
 1 | #ifndef CPPJIEBA_POS_TAGGING_H
 2 | #define CPPJIEBA_POS_TAGGING_H
 3 | 
 4 | #include "limonp/StringUtil.hpp"
 5 | #include "SegmentTagged.hpp"
 6 | #include "DictTrie.hpp"
 7 | 
 8 | namespace cppjieba {
 9 | using namespace limonp;
10 | 
11 | static const char* const POS_M = "m";
12 | static const char* const POS_ENG = "eng";
13 | static const char* const POS_X = "x";
14 | 
15 | class PosTagger {
16 |  public:
17 |   PosTagger() {
18 |   }
19 |   ~PosTagger() {
20 |   }
21 | 
22 |   // Cuts `src` with `segment` and appends a (word, tag) pair per piece to `res`; true if `res` ends up non-empty.
23 |   bool Tag(const string& src, vector<pair<string, string> >& res, const SegmentTagged& segment) const {
24 |     vector<string> pieces;
25 |     segment.Cut(src, pieces);
26 |     for (size_t i = 0; i < pieces.size(); ++i) {
27 |       res.push_back(make_pair(pieces[i], LookupTag(pieces[i], segment)));
28 |     }
29 |     return !res.empty();
30 |   }
31 | 
32 |   // Tag for one word: the dictionary entry's tag when the word is found with a non-empty tag,
33 |   // otherwise SpecialRule()'s result; POS_X (after logging) when `str` fails UTF-8 decoding.
34 |   string LookupTag(const string &str, const SegmentTagged& segment) const {
35 |     const DictTrie* dict = segment.GetDictTrie();
36 |     assert(dict != NULL);
37 |     RuneStrArray runes;
38 |     if (!DecodeUTF8RunesInString(str, runes)) {
39 |       XLOG(ERROR) << "UTF-8 decode failed for word: " << str;
40 |       return POS_X;
41 |     }
42 |     const DictUnit* unit = dict->Find(runes.begin(), runes.end());
43 |     if (unit != NULL && !unit->tag.empty()) {
44 |       return unit->tag;
45 |     }
46 |     return SpecialRule(runes);
47 |   }
48 | 
49 |  private:
50 |   const char* SpecialRule(const RuneStrArray& unicode) const { // Fallback tag derived from the word's ASCII content: POS_X, POS_M or POS_ENG.
51 |     size_t m = 0; // ASCII digits counted so far
52 |     size_t eng = 0; // ASCII runes counted so far (digits included)
53 |     for (size_t i = 0; i < unicode.size() && eng < unicode.size() / 2; i++) { // stops once eng reaches size()/2; for a one-rune word size()/2 is 0, so the body never runs
54 |       if (unicode[i].rune < 0x80) {
55 |         eng ++;
56 |         if ('0' <= unicode[i].rune && unicode[i].rune <= '9') {
57 |           m++;
58 |         }
59 |       }
60 |     }
61 |     // ascii char is not found
62 |     if (eng == 0) {
63 |       return POS_X;
64 |     }
65 |     // all the ascii is number char
66 |     if (m == eng) { // NOTE(review): only runes scanned before the early stop are counted, so letters after that point are not seen — confirm this is intended
67 |       return POS_M;
68 |     }
69 |     // the ascii chars contain english letter
70 |     return POS_ENG;
71 |   }
72 | 
73 | }; // class PosTagger
74 | 
75 | } // namespace cppjieba
76 | 
77 | #endif
78 | 


--------------------------------------------------------------------------------
/src/3rd_include/cppjieba/PreFilter.hpp:
--------------------------------------------------------------------------------
 1 | #ifndef CPPJIEBA_PRE_FILTER_H
 2 | #define CPPJIEBA_PRE_FILTER_H
 3 | 
 4 | #include "Trie.hpp"
 5 | #include "limonp/Logging.hpp"
 6 | 
 7 | namespace cppjieba {
 8 | 
 9 | class PreFilter { // Decodes a sentence once and hands it out as consecutive ranges split at the runes found in `symbols`.
10 |  public:
11 |   //TODO use WordRange instead of Range
12 |   struct Range { // half-open [begin, end) into the decoded sentence held by the PreFilter
13 |     RuneStrArray::const_iterator begin;
14 |     RuneStrArray::const_iterator end;
15 |   }; // struct Range
16 | 
17 |   PreFilter(const unordered_set<Rune>& symbols, // `symbols` is kept by reference, so it must outlive this object
18 |         const string& sentence)
19 |     : symbols_(symbols) {
20 |     if (!DecodeUTF8RunesInString(sentence, sentence_)) {
21 |       XLOG(ERROR) << "UTF-8 decode failed for input sentence"; 
22 |     }
23 |     cursor_ = sentence_.begin(); // iteration covers whatever sentence_ holds, also after a failed decode
24 |   }
25 |   ~PreFilter() {
26 |   }
27 |   bool HasNext() const { // true while runes remain after the cursor
28 |     return cursor_ != sentence_.end();
29 |   }
30 |   Range Next() { // Returns the range starting at the cursor and moves the cursor to that range's end.
31 |     Range range;
32 |     range.begin = cursor_;
33 |     while (cursor_ != sentence_.end()) {
34 |       if (IsIn(symbols_, cursor_->rune)) {
35 |         if (range.begin == cursor_) { // the symbol is the first rune: hand it out as a one-rune range
36 |           cursor_ ++;
37 |         }
38 |         range.end = cursor_; // otherwise stop in front of the symbol; the next call returns it
39 |         return range;
40 |       }
41 |       cursor_ ++;
42 |     }
43 |     range.end = sentence_.end(); // no further symbol: the range runs to the end of the sentence
44 |     return range;
45 |   }
46 |  private:
47 |   RuneStrArray::const_iterator cursor_; // next rune to hand out; points into sentence_
48 |   RuneStrArray sentence_; // decoded copy of the input
49 |   const unordered_set<Rune>& symbols_; // separator runes, owned by the caller
50 | }; // class PreFilter
51 | 
52 | } // namespace cppjieba
53 | 
54 | #endif // CPPJIEBA_PRE_FILTER_H
55 | 


--------------------------------------------------------------------------------
/src/3rd_include/cppjieba/QuerySegment.hpp:
--------------------------------------------------------------------------------
 1 | #ifndef CPPJIEBA_QUERYSEGMENT_H
 2 | #define CPPJIEBA_QUERYSEGMENT_H
 3 | 
 4 | #include <algorithm>
 5 | #include <set>
 6 | #include <cassert>
 7 | #include "limonp/Logging.hpp"
 8 | #include "DictTrie.hpp"
 9 | #include "SegmentBase.hpp"
10 | #include "FullSegment.hpp"
11 | #include "MixSegment.hpp"
12 | #include "Unicode.hpp"
13 | 
14 | namespace cppjieba {
15 | class QuerySegment: public SegmentBase { // search-oriented segmenter: MixSegment words plus their 2- and 3-rune dictionary sub-words
16 |  public:
17 |   QuerySegment(const string& dict, const string& model, const string& userDict = "")
18 |     : mixSeg_(dict, model, userDict),
19 |       trie_(mixSeg_.GetDictTrie()) { // mixSeg_ is declared before trie_, so it is already constructed here
20 |   }
21 |   QuerySegment(const DictTrie* dictTrie, const HMMModel* model)
22 |     : mixSeg_(dictTrie, model), trie_(dictTrie) {
23 |   }
24 |   ~QuerySegment() {
25 |   }
26 | 
27 |   void Cut(const string& sentence, vector<string>& words) const { // string output with hmm = true
28 |     Cut(sentence, words, true);
29 |   }
30 |   void Cut(const string& sentence, vector<string>& words, bool hmm) const {
31 |     vector<Word> tmp;
32 |     Cut(sentence, tmp, hmm);
33 |     GetStringsFromWords(tmp, words);
34 |   }
35 |   void Cut(const string& sentence, vector<Word>& words, bool hmm = true) const { // replaces the previous contents of words
36 |     PreFilter pre_filter(symbols_, sentence);
37 |     PreFilter::Range range;
38 |     vector<WordRange> wrs;
39 |     wrs.reserve(sentence.size()/2);
40 |     while (pre_filter.HasNext()) { // cut every span between separators and collect the ranges in wrs
41 |       range = pre_filter.Next();
42 |       Cut(range.begin, range.end, wrs, hmm);
43 |     }
44 |     words.clear();
45 |     words.reserve(wrs.size());
46 |     GetWordsFromWordRanges(sentence, wrs, words);
47 |   }
48 |   void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<WordRange>& res, bool hmm) const {
49 |     //use mix Cut first
50 |     vector<WordRange> mixRes;
51 |     mixSeg_.Cut(begin, end, mixRes, hmm);
52 | 
53 |     // Results are appended to res: for each mix word its dictionary sub-words first, then the word itself.
54 |     for (vector<WordRange>::const_iterator mixResItr = mixRes.begin(); mixResItr != mixRes.end(); mixResItr++) {
55 |       if (mixResItr->Length() > 2) { // words of 3+ runes: try every 2-rune window
56 |         for (size_t i = 0; i + 1 < mixResItr->Length(); i++) {
57 |           WordRange wr(mixResItr->left + i, mixResItr->left + i + 1);
58 |           if (trie_->Find(wr.left, wr.right + 1) != NULL) { // WordRange is inclusive, Find takes a half-open range
59 |             res.push_back(wr);
60 |           }
61 |         }
62 |       }
63 |       if (mixResItr->Length() > 3) { // words of 4+ runes: try every 3-rune window
64 |         for (size_t i = 0; i + 2 < mixResItr->Length(); i++) {
65 |           WordRange wr(mixResItr->left + i, mixResItr->left + i + 2);
66 |           if (trie_->Find(wr.left, wr.right + 1) != NULL) {
67 |             res.push_back(wr);
68 |           }
69 |         }
70 |       }
71 |       res.push_back(*mixResItr);
72 |     }
73 |   }
74 |  private:
75 |   bool IsAllAscii(const Unicode& s) const { // true when every rune is below 0x80 (also for an empty sequence)
76 |    for(size_t i = 0; i < s.size(); i++) {
77 |      if (s[i] >= 0x80) {
78 |        return false;
79 |      }
80 |    }
81 |    return true;
82 |   }
83 |   MixSegment mixSeg_;
84 |   const DictTrie* trie_; // dictionary used for the sub-word lookups; not owned by this class
85 | }; // QuerySegment
86 | 
87 | } // namespace cppjieba
88 | 
89 | #endif
90 | 


--------------------------------------------------------------------------------
/src/3rd_include/cppjieba/SegmentBase.hpp:
--------------------------------------------------------------------------------
 1 | #ifndef CPPJIEBA_SEGMENTBASE_H
 2 | #define CPPJIEBA_SEGMENTBASE_H
 3 | 
 4 | #include "limonp/Logging.hpp"
 5 | #include "PreFilter.hpp"
 6 | #include <cassert>
 7 | 
 8 | 
 9 | namespace cppjieba {
10 | 
11 | const char* const SPECIAL_SEPARATORS = " \t\n\xEF\xBC\x8C\xE3\x80\x82"; // space, tab, newline, U+FF0C (fullwidth comma), U+3002 (ideographic full stop)
12 | 
13 | using namespace limonp;
14 | 
15 | class SegmentBase { // common base of the segmenters: owns the separator set and declares the Cut interface
16 |  public:
17 |   SegmentBase() {
18 |     XCHECK(ResetSeparators(SPECIAL_SEPARATORS)); // XCHECK is defined in limonp/Logging.hpp; presumably fatal when the condition is false -- confirm
19 |   }
20 |   virtual ~SegmentBase() {
21 |   }
22 | 
23 |   virtual void Cut(const string& sentence, vector<string>& words) const = 0; // segments sentence into words; implemented by subclasses
24 | 
25 |   bool ResetSeparators(const string& s) { // replaces the separator set with the runes of s; false on bad UTF-8 or a repeated rune
26 |     symbols_.clear();
27 |     RuneStrArray runes;
28 |     if (!DecodeUTF8RunesInString(s, runes)) {
29 |       XLOG(ERROR) << "UTF-8 decode failed for separators: " << s;
30 |       return false; // symbols_ stays empty in this case
31 |     }
32 |     for (size_t i = 0; i < runes.size(); i++) {
33 |       if (!symbols_.insert(runes[i].rune).second) { // insert(...).second is false when the rune is already in the set
34 |         XLOG(ERROR) << s.substr(runes[i].offset, runes[i].len) << " already exists";
35 |         return false; // the runes inserted before the duplicate remain in symbols_
36 |       }
37 |     }
38 |     return true;
39 |   }
40 |  protected:
41 |   unordered_set<Rune> symbols_; // separator runes
42 | }; // class SegmentBase
43 | 
44 | } // cppjieba
45 | 
46 | #endif
47 | 


--------------------------------------------------------------------------------
/src/3rd_include/cppjieba/SegmentTagged.hpp:
--------------------------------------------------------------------------------
 1 | #ifndef CPPJIEBA_SEGMENTTAGGED_H
 2 | #define CPPJIEBA_SEGMENTTAGGED_H
 3 | 
 4 | #include "SegmentBase.hpp"
 5 | 
 6 | namespace cppjieba {
 7 | 
 8 | class SegmentTagged : public SegmentBase{ // interface for segmenters that can also tag their words
 9 |  public:
10 |   SegmentTagged() {
11 |   }
12 |   virtual ~SegmentTagged() {
13 |   }
14 | 
15 |   virtual bool Tag(const string& src, vector<pair<string, string> >& res) const = 0; // fills res with string pairs; presumably (word, tag) -- confirm in the implementations
16 | 
17 |   virtual const DictTrie* GetDictTrie() const = 0; // accessor for the dictionary trie used by the implementation
18 | 
19 | }; // class SegmentTagged
20 | 
21 | } // cppjieba
22 | 
23 | #endif
24 | 


--------------------------------------------------------------------------------
/src/3rd_include/cppjieba/TextRankExtractor.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef CPPJIEBA_TEXTRANK_EXTRACTOR_H
  2 | #define CPPJIEBA_TEXTRANK_EXTRACTOR_H
  3 | 
  4 | #include <cmath>
  5 | #include "Jieba.hpp"
  6 | 
  7 | namespace cppjieba {
  8 |   using namespace limonp;
  9 |   using namespace std;
 10 | 
 11 |   class TextRankExtractor {
 12 |   public:
 13 |     typedef struct _Word {string word;vector<size_t> offsets;double weight;}    Word; // struct Word
 14 |   private:
 15 |     typedef std::map<string,Word> WordMap;
 16 |   
 17 |     class WordGraph{ // undirected weighted word graph with an iterative TextRank scoring pass
 18 |     private:
 19 |       typedef double Score;
 20 |       typedef string Node;
 21 |       typedef std::set<Node> NodeSet;
 22 | 
 23 |       typedef std::map<Node,double> Edges;
 24 |       typedef std::map<Node,Edges> Graph;
 25 |       // Ordered containers are used throughout, so nodes and edges are always
 26 |       // visited in key order.
 27 | 
 28 |       double d; // damping factor used by rank()
 29 |       Graph graph; // adjacency map: node -> (neighbour -> accumulated edge weight)
 30 |       NodeSet nodeSet; // every node that appears in at least one edge
 31 |     public:
 32 |       WordGraph(): d(0.85) {};
 33 |       WordGraph(double in_d): d(in_d) {};
 34 | 
 35 |       void addEdge(Node start,Node end,double weight){
 36 |         // The graph is undirected: the weight is accumulated in both directions,
 37 |         // and repeated calls for the same pair add up.
 38 |         nodeSet.insert(start);
 39 |         nodeSet.insert(end);
 40 |         graph[start][end]+=weight;
 41 |         graph[end][start]+=weight;
 42 |       }
 43 | 
 44 |       void rank(WordMap &ws,size_t rankTime=10){ // writes word and weight for every graph node into ws, then rescales all weights in ws
 45 |         WordMap outSum; // only the weight field is used: total edge weight leaving each node
 46 |         Score wsdef, min_rank, max_rank;
 47 | 
 48 |         if( graph.size() == 0)
 49 |           return; // nothing to rank; ws is left untouched
 50 | 
 51 |         wsdef = 1.0 / graph.size();
 52 | 
 53 |         for(Graph::iterator edges=graph.begin();edges!=graph.end();++edges){
 54 |           // edges->first: start node; edge->first: end node; edge->second: weight
 55 |           ws[edges->first].word=edges->first;
 56 |           ws[edges->first].weight=wsdef;
 57 |           outSum[edges->first].weight=0;
 58 |           for(Edges::iterator edge=edges->second.begin();edge!=edges->second.end();++edge){
 59 |             outSum[edges->first].weight+=edge->second;
 60 |           }
 61 |         }
 62 |         // nodeSet is a std::set, so the nodes are already visited in sorted order.
 63 |         for( size_t i=0; i<rankTime; i++ ){
 64 |           for(NodeSet::iterator node = nodeSet.begin(); node != nodeSet.end(); node++ ){
 65 |             double s = 0;
 66 |             for( Edges::iterator edge= graph[*node].begin(); edge != graph[*node].end(); edge++ )
 67 |               // edge->first: end node; edge->second: weight
 68 |               s += edge->second / outSum[edge->first].weight * ws[edge->first].weight;
 69 |             ws[*node].weight = (1 - d) + d * s; // updated in place: later nodes of the same pass already see this value
 70 |           }
 71 |         }
 72 | 
 73 |         min_rank=max_rank=ws.begin()->second.weight;
 74 |         for(WordMap::iterator i = ws.begin(); i != ws.end(); i ++){
 75 |           if( i->second.weight < min_rank ){
 76 |             min_rank = i->second.weight;
 77 |           }
 78 |           if( i->second.weight > max_rank ){
 79 |             max_rank = i->second.weight;
 80 |           }
 81 |         }
 82 |         for(WordMap::iterator i = ws.begin(); i != ws.end(); i ++){ // rescale so that the largest weight becomes 1
 83 |           i->second.weight = (i->second.weight - min_rank / 10.0) / (max_rank - min_rank / 10.0);
 84 |         }
 85 |       }
 86 |     };
 87 | 
 88 |   public: 
 89 |   TextRankExtractor(const string& dictPath, 
 90 |         const string& hmmFilePath, 
 91 |         const string& stopWordPath, 
 92 |         const string& userDict = "") 
 93 |     : segment_(dictPath, hmmFilePath, userDict) {
 94 |     LoadStopWordDict(stopWordPath);
 95 |   }
 96 |   TextRankExtractor(const DictTrie* dictTrie, 
 97 |         const HMMModel* model,
 98 |         const string& stopWordPath) 
 99 |     : segment_(dictTrie, model) {
100 |     LoadStopWordDict(stopWordPath);
101 |   }
102 |     TextRankExtractor(const Jieba& jieba, const string& stopWordPath) : segment_(jieba.GetDictTrie(), jieba.GetHMMModel()) {
103 |         LoadStopWordDict(stopWordPath);
104 |     }
105 |     ~TextRankExtractor() {
106 |     }
107 | 
108 |     void Extract(const string& sentence, vector<string>& keywords, size_t topN) const {
109 |       vector<Word> topWords;
110 |       Extract(sentence, topWords, topN);
111 |       for (size_t i = 0; i < topWords.size(); i++) {
112 |         keywords.push_back(topWords[i].word);
113 |       }
114 |     }
115 | 
116 |     void Extract(const string& sentence, vector<pair<string, double> >& keywords, size_t topN) const {
117 |       vector<Word> topWords;
118 |       Extract(sentence, topWords, topN);
119 |       for (size_t i = 0; i < topWords.size(); i++) {
120 |         keywords.push_back(pair<string, double>(topWords[i].word, topWords[i].weight));
121 |       }
122 |     }
123 | 
124 |     void Extract(const string& sentence, vector<Word>& keywords, size_t topN, size_t span=5,size_t rankTime=10) const {
125 |       // Fills keywords with at most topN words of sentence, ordered by descending weight.
126 |       // span is the co-occurrence window in words; rankTime is the number of ranking passes.
127 |       keywords.clear(); // cleared up front so the error path below cannot leave stale results behind
128 |       vector<string> words;
129 |       segment_.Cut(sentence, words);
130 |       TextRankExtractor::WordGraph graph;
131 |       WordMap wordmap;
132 |       size_t offset = 0; // byte offset of words[i] within sentence
133 |       for(size_t i=0; i < words.size(); i++){
134 |         size_t t = offset;
135 |         offset += words[i].size();
136 |         if (IsSingleWord(words[i]) || stopWords_.find(words[i]) != stopWords_.end()) {
137 |           continue;
138 |         }
139 |         for(size_t j=i+1,skip=0;j<i+span+skip && j<words.size();j++){ // every skipped word widens the window by one
140 |           if (IsSingleWord(words[j]) || stopWords_.find(words[j]) != stopWords_.end()) {
141 |             skip++;
142 |             continue;
143 |           }
144 |           graph.addEdge(words[i],words[j],1);
145 |         }
146 |         Word& entry = wordmap[words[i]];
147 |         entry.word = words[i]; // rank() only names graph nodes; a candidate without neighbours would otherwise keep an empty word
148 |         entry.offsets.push_back(t);
149 |       }
150 |       if (offset != sentence.size()) { // the cut words must cover the sentence byte for byte
151 |         XLOG(ERROR) << "words illegal";
152 |         return;
153 |       }
154 |       graph.rank(wordmap,rankTime);
155 |       keywords.reserve(wordmap.size());
156 |       for (WordMap::iterator itr = wordmap.begin(); itr != wordmap.end(); ++itr) {
157 |         keywords.push_back(itr->second);
158 |       }
159 |       // Only the leading topN entries have to be ordered; the rest is dropped.
160 |       topN = min(topN, keywords.size());
161 |       partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), Compare);
162 |       keywords.resize(topN);
163 |     }
164 |   private:
165 |     void LoadStopWordDict(const string& filePath) { // reads the file line by line into stopWords_, one entry per line
166 |       ifstream ifs(filePath.c_str());
167 |       XCHECK(ifs.is_open()) << "open " << filePath << " failed"; // XCHECK comes from limonp/Logging.hpp; presumably fatal on failure -- confirm
168 |       string line ;
169 |       while (getline(ifs, line)) {
170 |         stopWords_.insert(line); // lines are stored verbatim, without trimming
171 |       }
172 |       assert(stopWords_.size()); // an empty stop word set is treated as a programming error (checked only when assertions are enabled)
173 |     }
174 | 
175 |     static bool Compare(const Word &x,const Word &y){
176 |       return x.weight > y.weight;
177 |     }
178 | 
179 |     MixSegment segment_;
180 |     unordered_set<string> stopWords_;
181 |   }; // class TextRankExtractor
182 |   
183 |   inline ostream& operator << (ostream& os, const TextRankExtractor::Word& word) {
184 |     return os << "{\"word\": \"" << word.word << "\", \"offset\": " << word.offsets << ", \"weight\": " << word.weight << "}"; 
185 |   }
186 | } // namespace cppjieba
187 | 
188 | #endif
189 | 
190 | 
191 | 


--------------------------------------------------------------------------------
/src/3rd_include/cppjieba/Trie.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef CPPJIEBA_TRIE_HPP
  2 | #define CPPJIEBA_TRIE_HPP
  3 | 
  4 | #include <vector>
  5 | #include <queue>
  6 | #include "limonp/StdExtension.hpp"
  7 | #include "Unicode.hpp"
  8 | 
  9 | namespace cppjieba {
 10 | 
 11 | using namespace std;
 12 | 
 13 | const size_t MAX_WORD_LENGTH = 512;
 14 | 
 15 | struct DictUnit {
 16 |   Unicode word;
 17 |   double weight;
 18 |   string tag;
 19 | }; // struct DictUnit
 20 | 
 21 | // for debugging
 22 | // inline ostream & operator << (ostream& os, const DictUnit& unit) {
 23 | //   string s;
 24 | //   s << unit.word;
 25 | //   return os << StringFormat("%s %s %.3lf", s.c_str(), unit.tag.c_str(), unit.weight);
 26 | // }
 27 | 
 28 | struct Dag {
 29 |   RuneStr runestr;
 30 |   // [offset, nexts.first]
 31 |   limonp::LocalVector<pair<size_t, const DictUnit*> > nexts;
 32 |   const DictUnit * pInfo;
 33 |   double weight;
 34 |   size_t nextPos; // TODO
 35 |   Dag():runestr(), pInfo(NULL), weight(0.0), nextPos(0) {
 36 |   }
 37 | }; // struct Dag
 38 | 
 39 | typedef Rune TrieKey;
 40 | 
 41 | class TrieNode {
 42 |  public :
 43 |   TrieNode(): next(NULL), ptValue(NULL) {
 44 |   }
 45 |  public:
 46 |   typedef unordered_map<TrieKey, TrieNode*> NextMap;
 47 |   NextMap *next;
 48 |   const DictUnit *ptValue;
 49 | };
 50 | 
 51 | class Trie {
 52 |  public:
 53 |   Trie(const vector<Unicode>& keys, const vector<const DictUnit*>& valuePointers)
 54 |    : root_(new TrieNode) {
 55 |     CreateTrie(keys, valuePointers);
 56 |   }
 57 |   ~Trie() {
 58 |     DeleteNode(root_);
 59 |   }
 60 | 
 61 |   const DictUnit* Find(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) const {
 62 |     if (begin == end) {
 63 |       return NULL;
 64 |     }
 65 | 
 66 |     const TrieNode* ptNode = root_;
 67 |     TrieNode::NextMap::const_iterator citer;
 68 |     for (RuneStrArray::const_iterator it = begin; it != end; it++) {
 69 |       if (NULL == ptNode->next) {
 70 |         return NULL;
 71 |       }
 72 |       citer = ptNode->next->find(it->rune);
 73 |       if (ptNode->next->end() == citer) {
 74 |         return NULL;
 75 |       }
 76 |       ptNode = citer->second;
 77 |     }
 78 |     return ptNode->ptValue;
 79 |   }
 80 | 
 81 |   void Find(RuneStrArray::const_iterator begin, 
 82 |         RuneStrArray::const_iterator end, 
 83 |         vector<struct Dag>&res, 
 84 |         size_t max_word_len = MAX_WORD_LENGTH) const {
 85 |     assert(root_ != NULL);
 86 |     res.resize(end - begin);
 87 | 
 88 |     const TrieNode *ptNode = NULL;
 89 |     TrieNode::NextMap::const_iterator citer;
 90 |     for (size_t i = 0; i < size_t(end - begin); i++) {
 91 |       res[i].runestr = *(begin + i);
 92 | 
 93 |       if (root_->next != NULL && root_->next->end() != (citer = root_->next->find(res[i].runestr.rune))) {
 94 |         ptNode = citer->second;
 95 |       } else {
 96 |         ptNode = NULL;
 97 |       }
 98 |       if (ptNode != NULL) {
 99 |         res[i].nexts.push_back(pair<size_t, const DictUnit*>(i, ptNode->ptValue));
100 |       } else {
101 |         res[i].nexts.push_back(pair<size_t, const DictUnit*>(i, static_cast<const DictUnit*>(NULL)));
102 |       }
103 | 
104 |       for (size_t j = i + 1; j < size_t(end - begin) && (j - i + 1) <= max_word_len; j++) {
105 |         if (ptNode == NULL || ptNode->next == NULL) {
106 |           break;
107 |         }
108 |         citer = ptNode->next->find((begin + j)->rune);
109 |         if (ptNode->next->end() == citer) {
110 |           break;
111 |         }
112 |         ptNode = citer->second;
113 |         if (NULL != ptNode->ptValue) {
114 |           res[i].nexts.push_back(pair<size_t, const DictUnit*>(j, ptNode->ptValue));
115 |         }
116 |       }
117 |     }
118 |   }
119 | 
120 |   void InsertNode(const Unicode& key, const DictUnit* ptValue) {
121 |     if (key.begin() == key.end()) {
122 |       return;
123 |     }
124 | 
125 |     TrieNode::NextMap::const_iterator kmIter;
126 |     TrieNode *ptNode = root_;
127 |     for (Unicode::const_iterator citer = key.begin(); citer != key.end(); ++citer) {
128 |       if (NULL == ptNode->next) {
129 |         ptNode->next = new TrieNode::NextMap;
130 |       }
131 |       kmIter = ptNode->next->find(*citer);
132 |       if (ptNode->next->end() == kmIter) {
133 |         TrieNode *nextNode = new TrieNode;
134 | 
135 |         ptNode->next->insert(make_pair(*citer, nextNode));
136 |         ptNode = nextNode;
137 |       } else {
138 |         ptNode = kmIter->second;
139 |       }
140 |     }
141 |     assert(ptNode != NULL);
142 |     ptNode->ptValue = ptValue;
143 |   }
144 |   void DeleteNode(const Unicode& key, const DictUnit* ptValue) {
145 |       // Logically removes `key`: clears the value stored on the node the key ends at, so
146 |       // Find(begin, end) returns NULL for it and the DAG Find() no longer reports a
147 |       // DictUnit for it. Does nothing when the key is empty or not present.
148 |       // Nodes are never unlinked here, because other keys may share a prefix with this key
149 |       // or extend it; ~Trie() releases the whole structure.
150 |       // ptValue is unused and kept only for interface compatibility.
151 |       // NOTE(review): callers may pass a pointer that differs from the stored one, so it is
152 |       // deliberately not compared -- confirm against the callers.
153 |       (void)ptValue;
154 |       if (key.begin() == key.end()) {
155 |         return;
156 |       }
157 |       TrieNode *ptNode = root_;
158 |       for (Unicode::const_iterator citer = key.begin(); citer != key.end(); ++citer) {
159 |         if (NULL == ptNode->next) {
160 |           return; // no children: the key is not in the trie
161 |         }
162 |         TrieNode::NextMap::const_iterator kmIter = ptNode->next->find(*citer);
163 |         if (ptNode->next->end() == kmIter) {
164 |           return; // no child for this rune: the key is not in the trie
165 |         }
166 |         ptNode = kmIter->second;
167 |       }
168 |       // The whole key was matched: forget its value but keep the node.
169 |       ptNode->ptValue = NULL;
170 |  }
171 |  private:
172 |   void CreateTrie(const vector<Unicode>& keys, const vector<const DictUnit*>& valuePointers) {
173 |     if (valuePointers.empty() || keys.empty()) {
174 |       return;
175 |     }
176 |     assert(keys.size() == valuePointers.size());
177 | 
178 |     for (size_t i = 0; i < keys.size(); i++) {
179 |       InsertNode(keys[i], valuePointers[i]);
180 |     }
181 |   }
182 | 
183 |   void DeleteNode(TrieNode* node) {
184 |     if (NULL == node) {
185 |       return;
186 |     }
187 |     if (NULL != node->next) {
188 |       for (TrieNode::NextMap::iterator it = node->next->begin(); it != node->next->end(); ++it) {
189 |         DeleteNode(it->second);
190 |       }
191 |       delete node->next;
192 |     }
193 |     delete node;
194 |   }
195 | 
196 |   TrieNode* root_;
197 | }; // class Trie
198 | } // namespace cppjieba
199 | 
200 | #endif // CPPJIEBA_TRIE_HPP
201 | 


--------------------------------------------------------------------------------
/src/3rd_include/cppjieba/Unicode.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef CPPJIEBA_UNICODE_H
  2 | #define CPPJIEBA_UNICODE_H
  3 | 
  4 | #include <stdint.h>
  5 | #include <stdlib.h>
  6 | #include <string>
  7 | #include <vector>
  8 | #include <ostream>
  9 | #include "limonp/LocalVector.hpp"
 10 | 
 11 | namespace cppjieba {
 12 | 
 13 | using std::string;
 14 | using std::vector;
 15 | 
 16 | typedef uint32_t Rune;
 17 | 
 18 | struct Word { // one segmented word together with its position in the source string
 19 |   string word;
 20 |   uint32_t offset; // byte offset of the word in the source string
 21 |   uint32_t unicode_offset; // offset of the word counted in runes
 22 |   uint32_t unicode_length; // length of the word counted in runes
 23 |   Word(const string& w, uint32_t o)
 24 |    : word(w), offset(o), unicode_offset(0), unicode_length(0) { // rune fields are zeroed instead of being left indeterminate
 25 |   }
 26 |   Word(const string& w, uint32_t o, uint32_t unicode_offset, uint32_t unicode_length)
 27 |           : word(w), offset(o), unicode_offset(unicode_offset), unicode_length(unicode_length) {
 28 |   }
 29 | }; // struct Word
 30 | 
 31 | inline std::ostream& operator << (std::ostream& os, const Word& w) {
 32 |   return os << "{\"word\": \"" << w.word << "\", \"offset\": " << w.offset << "}";
 33 | }
 34 | 
 35 | struct RuneStr {
 36 |   Rune rune;
 37 |   uint32_t offset;
 38 |   uint32_t len;
 39 |   uint32_t unicode_offset;
 40 |   uint32_t unicode_length;
 41 |   RuneStr(): rune(0), offset(0), len(0), unicode_offset(0), unicode_length(0) {
 42 |   }
 43 |   RuneStr(Rune r, uint32_t o, uint32_t l)
 44 |     : rune(r), offset(o), len(l), unicode_offset(0), unicode_length(0) {
 45 |   }
 46 |   RuneStr(Rune r, uint32_t o, uint32_t l, uint32_t unicode_offset, uint32_t unicode_length)
 47 |           : rune(r), offset(o), len(l), unicode_offset(unicode_offset), unicode_length(unicode_length) {
 48 |   }
 49 | }; // struct RuneStr
 50 | 
 51 | inline std::ostream& operator << (std::ostream& os, const RuneStr& r) {
 52 |   return os << "{\"rune\": \"" << r.rune << "\", \"offset\": " << r.offset << ", \"len\": " << r.len << "}";
 53 | }
 54 | 
 55 | typedef limonp::LocalVector<Rune> Unicode;
 56 | typedef limonp::LocalVector<struct RuneStr> RuneStrArray;
 57 | 
 58 | // [left, right]
 59 | struct WordRange { // inclusive range of runes: both left and right point at runes that belong to the word
 60 |   RuneStrArray::const_iterator left;
 61 |   RuneStrArray::const_iterator right;
 62 |   WordRange(RuneStrArray::const_iterator l, RuneStrArray::const_iterator r)
 63 |    : left(l), right(r) {
 64 |   }
 65 |   size_t Length() const { // number of runes in the range
 66 |     return right - left + 1; // +1 because right is inclusive
 67 |   }
 68 |   bool IsAllAscii() const { // true when every rune in the range is below 0x80
 69 |     for (RuneStrArray::const_iterator iter = left; iter <= right; ++iter) {
 70 |       if (iter->rune >= 0x80) {
 71 |         return false;
 72 |       }
 73 |     }
 74 |     return true;
 75 |   }
 76 | }; // struct WordRange
 77 | 
 78 | struct RuneStrLite { // result of decoding a single UTF-8 sequence
 79 |   uint32_t rune; // decoded code point
 80 |   uint32_t len; // number of bytes consumed; 0 means that decoding failed
 81 |   RuneStrLite(): rune(0), len(0) {
 82 |   }
 83 |   RuneStrLite(uint32_t r, uint32_t l): rune(r), len(l) {
 84 |   }
 85 | }; // struct RuneStrLite
 86 | 
 87 | // Decodes the UTF-8 sequence that starts at str[0], reading at most len bytes.
 88 | //
 89 | // On success returns the code point together with the number of bytes consumed (1 to 4).
 90 | // Returns {0, 0} when
 91 | //   - str is NULL or len is 0,
 92 | //   - the first byte cannot start a sequence (a stray continuation byte 0x80-0xBF,
 93 | //     or 0xF8-0xFF),
 94 | //   - the sequence announced by the first byte is longer than len (truncated input),
 95 | //   - a trailing byte is not of the form 10xxxxxx.
 96 | //
 97 | // NOTE: overlong encodings and surrogate code points are not rejected.
 98 | inline RuneStrLite DecodeUTF8ToRune(const char* str, size_t len) {
 99 |   RuneStrLite rp(0, 0);
100 |   if (str == NULL || len == 0) {
101 |     return rp;
102 |   }
103 | 
104 |   const uint8_t lead = (uint8_t)(str[0]);
105 |   uint32_t total = 0; // length of the whole sequence as announced by the lead byte
106 |   uint32_t rune = 0; // payload bits collected so far
107 |   if (lead < 0x80) { // 0xxxxxxx
108 |     total = 1;
109 |     rune = lead;
110 |   } else if ((lead & 0xe0) == 0xc0) { // 110xxxxx
111 |     total = 2;
112 |     rune = lead & 0x1f;
113 |   } else if ((lead & 0xf0) == 0xe0) { // 1110xxxx
114 |     total = 3;
115 |     rune = lead & 0x0f;
116 |   } else if ((lead & 0xf8) == 0xf0) { // 11110xxx
117 |     total = 4;
118 |     rune = lead & 0x07;
119 |   } else {
120 |     return rp; // 10xxxxxx or 11111xxx cannot start a sequence
121 |   }
122 | 
123 |   if (total > len) {
124 |     return rp; // truncated sequence
125 |   }
126 | 
127 |   for (uint32_t k = 1; k < total; k++) {
128 |     const uint8_t cont = (uint8_t)(str[k]);
129 |     if ((cont & 0xc0) != 0x80) {
130 |       return rp; // not a continuation byte
131 |     }
132 |     // Each continuation byte contributes six payload bits.
133 |     rune = (rune << 6) | (cont & 0x3f);
134 |   }
135 | 
136 |   // Publish the result only after the whole sequence has been validated.
137 |   rp.rune = rune;
138 |   rp.len = total;
139 |   return rp;
140 | }
141 | 
142 | inline bool DecodeUTF8RunesInString(const char* s, size_t len, RuneStrArray& runes) { // decodes s[0, len) into runes; false (and runes emptied) on the first undecodable sequence
143 |   runes.clear();
144 |   runes.reserve(len / 2);
145 |   for (uint32_t i = 0, j = 0; i < len;) { // i: byte offset into s, j: index of the rune being decoded
146 |     RuneStrLite rp = DecodeUTF8ToRune(s + i, len - i);
147 |     if (rp.len == 0) { // a zero length marks a decode failure
148 |       runes.clear();
149 |       return false;
150 |     }
151 |     RuneStr x(rp.rune, i, rp.len, j, 1); // byte offset i, byte length rp.len, rune offset j, rune length 1
152 |     runes.push_back(x);
153 |     i += rp.len;
154 |     ++j;
155 |   }
156 |   return true;
157 | }
158 | 
159 | inline bool DecodeUTF8RunesInString(const string& s, RuneStrArray& runes) {
160 |   return DecodeUTF8RunesInString(s.c_str(), s.size(), runes);
161 | }
162 | 
163 | inline bool DecodeUTF8RunesInString(const char* s, size_t len, Unicode& unicode) {
164 |   unicode.clear();
165 |   RuneStrArray runes;
166 |   if (!DecodeUTF8RunesInString(s, len, runes)) {
167 |     return false;
168 |   }
169 |   unicode.reserve(runes.size());
170 |   for (size_t i = 0; i < runes.size(); i++) {
171 |     unicode.push_back(runes[i].rune);
172 |   }
173 |   return true;
174 | }
175 | 
176 | inline bool IsSingleWord(const string& str) {
177 |   RuneStrLite rp = DecodeUTF8ToRune(str.c_str(), str.size());
178 |   return rp.len == str.size();
179 | }
180 | 
181 | inline bool DecodeUTF8RunesInString(const string& s, Unicode& unicode) {
182 |   return DecodeUTF8RunesInString(s.c_str(), s.size(), unicode);
183 | }
184 | 
185 | inline Unicode DecodeUTF8RunesInString(const string& s) {
186 |   Unicode result;
187 |   DecodeUTF8RunesInString(s, result);
188 |   return result;
189 | }
190 | 
191 | 
192 | // [left, right]
193 | inline Word GetWordFromRunes(const string& s, RuneStrArray::const_iterator left, RuneStrArray::const_iterator right) {
194 |   assert(right->offset >= left->offset);
195 |   uint32_t len = right->offset - left->offset + right->len;
196 |   uint32_t unicode_length = right->unicode_offset - left->unicode_offset + right->unicode_length;
197 |   return Word(s.substr(left->offset, len), left->offset, left->unicode_offset, unicode_length);
198 | }
199 | 
200 | inline string GetStringFromRunes(const string& s, RuneStrArray::const_iterator left, RuneStrArray::const_iterator right) {
201 |   assert(right->offset >= left->offset);
202 |   uint32_t len = right->offset - left->offset + right->len;
203 |   return s.substr(left->offset, len);
204 | }
205 | 
206 | inline void GetWordsFromWordRanges(const string& s, const vector<WordRange>& wrs, vector<Word>& words) {
207 |   for (size_t i = 0; i < wrs.size(); i++) {
208 |     words.push_back(GetWordFromRunes(s, wrs[i].left, wrs[i].right));
209 |   }
210 | }
211 | 
212 | inline vector<Word> GetWordsFromWordRanges(const string& s, const vector<WordRange>& wrs) {
213 |   vector<Word> result;
214 |   GetWordsFromWordRanges(s, wrs, result);
215 |   return result;
216 | }
217 | 
218 | inline void GetStringsFromWords(const vector<Word>& words, vector<string>& strs) {
219 |   strs.resize(words.size());
220 |   for (size_t i = 0; i < words.size(); ++i) {
221 |     strs[i] = words[i].word;
222 |   }
223 | }
224 | 
225 | } // namespace cppjieba
226 | 
227 | #endif // CPPJIEBA_UNICODE_H
228 | 


--------------------------------------------------------------------------------
/src/3rd_include/limonp/ArgvContext.hpp:
--------------------------------------------------------------------------------
 1 | /************************************
 2 |  * file enc : ascii
 3 |  * author   : wuyanyi09@gmail.com
 4 |  ************************************/
 5 | 
 6 | #ifndef LIMONP_ARGV_FUNCTS_H
 7 | #define LIMONP_ARGV_FUNCTS_H
 8 | 
 9 | #include <set>
10 | #include <sstream>
11 | #include "StringUtil.hpp"
12 | 
13 | namespace limonp {
14 | 
15 | using namespace std;
16 | 
17 | class ArgvContext { // sorts command-line tokens into positional arguments, key/value options and flags
18 |  public :
19 |   ArgvContext(int argc, const char* const * argv) {
20 |     for(int i = 0; i < argc; i++) { // starts at 0, so argv[0] is classified like every other token
21 |       if(StartsWith(argv[i], "-")) { // StartsWith comes from StringUtil.hpp; presumably a prefix test -- confirm
22 |         if(i + 1 < argc && !StartsWith(argv[i + 1], "-")) { // option followed by a non-option token: store them as key and value
23 |           mpss_[argv[i]] = argv[i+1];
24 |           i++; // the value token has been consumed as well
25 |         } else {
26 |           sset_.insert(argv[i]); // option without a value: remember it as a flag
27 |         }
28 |       } else {
29 |         args_.push_back(argv[i]);
30 |       }
31 |     }
32 |   }
33 |   ~ArgvContext() {
34 |   }
35 | 
36 |   friend ostream& operator << (ostream& os, const ArgvContext& args);
37 |   string operator [](size_t i) const { // i-th positional argument, or "" when i is out of range
38 |     if(i < args_.size()) {
39 |       return args_[i];
40 |     }
41 |     return "";
42 |   }
43 |   string operator [](const string& key) const { // value stored for key, or "" when key has no value (flags included)
44 |     map<string, string>::const_iterator it = mpss_.find(key);
45 |     if(it != mpss_.end()) {
46 |       return it->second;
47 |     }
48 |     return "";
49 |   }
50 | 
51 |   bool HasKey(const string& key) const { // true when key was given either with a value or as a flag
52 |     if(mpss_.find(key) != mpss_.end() || sset_.find(key) != sset_.end()) {
53 |       return true;
54 |     }
55 |     return false;
56 |   }
57 | 
58 |  private:
59 |   vector<string> args_; // positional arguments in command-line order
60 |   map<string, string> mpss_; // options that were followed by a value
61 |   set<string> sset_; // options that were given without a value
62 | }; // class ArgvContext
63 | 
64 | inline ostream& operator << (ostream& os, const ArgvContext& args) {
65 |   return os<<args.args_<<args.mpss_<<args.sset_;
66 | }
67 | 
68 | } // namespace limonp
69 | 
70 | #endif
71 | 


--------------------------------------------------------------------------------
/src/3rd_include/limonp/Closure.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef LIMONP_CLOSURE_HPP
  2 | #define LIMONP_CLOSURE_HPP
  3 | 
  4 | namespace limonp {
  5 | 
  6 | class ClosureInterface { // type-erased callable: Run() invokes the bound function with the bound arguments
  7 |  public:
  8 |   virtual ~ClosureInterface() {
  9 |   }
 10 |   virtual void Run() = 0;
 11 | };
 12 | 
 13 | template <class Funct>
 14 | class Closure0: public ClosureInterface { // free function without arguments; Funct must be dereferenceable to a callable (Run() calls (*fun_)())
 15 |  public:
 16 |   Closure0(Funct fun)
 17 |     : fun_(fun) { // members are initialized directly, so Funct does not have to be default-constructible
 18 |   }
 19 |   virtual ~Closure0() {
 20 |   }
 21 |   virtual void Run() {
 22 |     (*fun_)();
 23 |   }
 24 |  private:
 25 |   Funct fun_;
 26 | }; 
 27 | 
 28 | template <class Funct, class Arg1>
 29 | class Closure1: public ClosureInterface { // free function with one bound argument, stored by value
 30 |  public:
 31 |   Closure1(Funct fun, Arg1 arg1)
 32 |     : fun_(fun),
 33 |       arg1_(arg1) { // initializer order follows the member declaration order
 34 |   }
 35 |   virtual ~Closure1() {
 36 |   }
 37 |   virtual void Run() {
 38 |     (*fun_)(arg1_);
 39 |   }
 40 |  private:
 41 |   Funct fun_;
 42 |   Arg1 arg1_;
 43 | }; 
 44 | 
 45 | template <class Funct, class Arg1, class Arg2>
 46 | class Closure2: public ClosureInterface { // free function with two bound arguments
 47 |  public:
 48 |   Closure2(Funct fun, Arg1 arg1, Arg2 arg2)
 49 |     : fun_(fun),
 50 |       arg1_(arg1),
 51 |       arg2_(arg2) {
 52 |   }
 53 |   virtual ~Closure2() {
 54 |   }
 55 |   virtual void Run() {
 56 |     (*fun_)(arg1_, arg2_);
 57 |   }
 58 |  private:
 59 |   Funct fun_;
 60 |   Arg1 arg1_;
 61 |   Arg2 arg2_;
 62 | }; 
 63 | 
 64 | template <class Funct, class Arg1, class Arg2, class Arg3>
 65 | class Closure3: public ClosureInterface { // free function with three bound arguments
 66 |  public:
 67 |   Closure3(Funct fun, Arg1 arg1, Arg2 arg2, Arg3 arg3)
 68 |     : fun_(fun),
 69 |       arg1_(arg1),
 70 |       arg2_(arg2),
 71 |       arg3_(arg3) {
 72 |   }
 73 |   virtual ~Closure3() {
 74 |   }
 75 |   virtual void Run() {
 76 |     (*fun_)(arg1_, arg2_, arg3_);
 77 |   }
 78 |  private:
 79 |   Funct fun_;
 80 |   Arg1 arg1_;
 81 |   Arg2 arg2_;
 82 |   Arg3 arg3_;
 83 | }; 
 84 | 
 85 | template <class Obj, class Funct> 
 86 | class ObjClosure0: public ClosureInterface { // member function without arguments, called on *p_; the object is not owned
 87 |  public:
 88 |   ObjClosure0(Obj* p, Funct fun)
 89 |     : p_(p),
 90 |       fun_(fun) {
 91 |   }
 92 |   virtual ~ObjClosure0() {
 93 |   }
 94 |   virtual void Run() {
 95 |     (p_->*fun_)();
 96 |   }
 97 |  private:
 98 |   Obj* p_;
 99 |   Funct fun_;
100 | }; 
101 | 
102 | template <class Obj, class Funct, class Arg1> 
103 | class ObjClosure1: public ClosureInterface { // member function with one bound argument
104 |  public:
105 |   ObjClosure1(Obj* p, Funct fun, Arg1 arg1)
106 |     : p_(p),
107 |       fun_(fun),
108 |       arg1_(arg1) {
109 |   }
110 |   virtual ~ObjClosure1() {
111 |   }
112 |   virtual void Run() {
113 |     (p_->*fun_)(arg1_);
114 |   }
115 |  private:
116 |   Obj* p_;
117 |   Funct fun_;
118 |   Arg1 arg1_;
119 | }; 
120 | 
121 | template <class Obj, class Funct, class Arg1, class Arg2> 
122 | class ObjClosure2: public ClosureInterface { // member function with two bound arguments
123 |  public:
124 |   ObjClosure2(Obj* p, Funct fun, Arg1 arg1, Arg2 arg2)
125 |     : p_(p),
126 |       fun_(fun),
127 |       arg1_(arg1),
128 |       arg2_(arg2) {
129 |   }
130 |   virtual ~ObjClosure2() {
131 |   }
132 |   virtual void Run() {
133 |     (p_->*fun_)(arg1_, arg2_);
134 |   }
135 |  private:
136 |   Obj* p_;
137 |   Funct fun_;
138 |   Arg1 arg1_;
139 |   Arg2 arg2_;
140 | }; 
141 | template <class Obj, class Funct, class Arg1, class Arg2, class Arg3> 
142 | class ObjClosure3: public ClosureInterface { // member function with three bound arguments
143 |  public:
144 |   ObjClosure3(Obj* p, Funct fun, Arg1 arg1, Arg2 arg2, Arg3 arg3)
145 |     : p_(p),
146 |       fun_(fun),
147 |       arg1_(arg1),
148 |       arg2_(arg2),
149 |       arg3_(arg3) {
150 |   }
151 |   virtual ~ObjClosure3() {
152 |   }
153 |   virtual void Run() {
154 |     (p_->*fun_)(arg1_, arg2_, arg3_);
155 |   }
156 |  private:
157 |   Obj* p_;
158 |   Funct fun_;
159 |   Arg1 arg1_;
160 |   Arg2 arg2_;
161 |   Arg3 arg3_;
162 | }; 
163 | 
164 | template<class R>
165 | ClosureInterface* NewClosure(R (*fun)()) {
166 |   return new Closure0<R (*)()>(fun);
167 | }
168 | 
169 | template<class R, class Arg1>
170 | ClosureInterface* NewClosure(R (*fun)(Arg1), Arg1 arg1) {
171 |   return new Closure1<R (*)(Arg1), Arg1>(fun, arg1);
172 | }
173 | 
174 | template<class R, class Arg1, class Arg2>
175 | ClosureInterface* NewClosure(R (*fun)(Arg1, Arg2), Arg1 arg1, Arg2 arg2) {
176 |   return new Closure2<R (*)(Arg1, Arg2), Arg1, Arg2>(fun, arg1, arg2);
177 | }
178 | 
179 | template<class R, class Arg1, class Arg2, class Arg3>
180 | ClosureInterface* NewClosure(R (*fun)(Arg1, Arg2, Arg3), Arg1 arg1, Arg2 arg2, Arg3 arg3) {
181 |   return new Closure3<R (*)(Arg1, Arg2, Arg3), Arg1, Arg2, Arg3>(fun, arg1, arg2, arg3);
182 | }
183 | 
184 | template<class R, class Obj>
185 | ClosureInterface* NewClosure(Obj* obj, R (Obj::* fun)()) {
186 |   return new ObjClosure0<Obj, R (Obj::* )()>(obj, fun);
187 | }
188 | 
189 | template<class R, class Obj, class Arg1>
190 | ClosureInterface* NewClosure(Obj* obj, R (Obj::* fun)(Arg1), Arg1 arg1) {
191 |   return new ObjClosure1<Obj, R (Obj::* )(Arg1), Arg1>(obj, fun, arg1);
192 | }
193 | 
194 | template<class R, class Obj, class Arg1, class Arg2>
195 | ClosureInterface* NewClosure(Obj* obj, R (Obj::* fun)(Arg1, Arg2), Arg1 arg1, Arg2 arg2) {
196 |   return new ObjClosure2<Obj, R (Obj::*)(Arg1, Arg2), Arg1, Arg2>(obj, fun, arg1, arg2);
197 | }
198 | 
199 | template<class R, class Obj, class Arg1, class Arg2, class Arg3>
200 | ClosureInterface* NewClosure(Obj* obj, R (Obj::* fun)(Arg1, Arg2, Arg3), Arg1 arg1, Arg2 arg2, Arg3 arg3) {
201 |   return new ObjClosure3<Obj, R (Obj::*)(Arg1, Arg2, Arg3), Arg1, Arg2, Arg3>(obj, fun, arg1, arg2, arg3);
202 | }
203 | 
204 | } // namespace limonp
205 | 
206 | #endif // LIMONP_CLOSURE_HPP
207 | 


--------------------------------------------------------------------------------
/src/3rd_include/limonp/Colors.hpp:
--------------------------------------------------------------------------------
 1 | #ifndef LIMONP_COLOR_PRINT_HPP
 2 | #define LIMONP_COLOR_PRINT_HPP
 3 | 
#include <stdarg.h>
#include <stdio.h>
#include <string>
 6 | 
 7 | namespace limonp {
 8 | 
 9 | using std::string;
10 | 
// Color codes accepted by ColorPrintln, which inserts the numeric value into
// the "\033[0;<n>m" escape sequence. Values are consecutive starting at 30.
enum Color {
  BLACK  = 30,
  RED    = 31,
  GREEN  = 32,
  YELLOW = 33,
  BLUE   = 34,
  PURPLE = 35
}; // enum Color
19 | 
20 | static void ColorPrintln(enum Color color, const char * fmt, ...) {
21 |   va_list ap;
22 |   printf("\033[0;%dm", color);
23 |   va_start(ap, fmt);
24 |   vprintf(fmt, ap);
25 |   va_end(ap);
26 |   printf("\033[0m\n"); // if not \n , in some situation , the next lines will be set the same color unexpectedly
27 | }
28 | 
29 | } // namespace limonp
30 | 
31 | #endif // LIMONP_COLOR_PRINT_HPP
32 | 


--------------------------------------------------------------------------------
/src/3rd_include/limonp/Condition.hpp:
--------------------------------------------------------------------------------
 1 | #ifndef LIMONP_CONDITION_HPP
 2 | #define LIMONP_CONDITION_HPP
 3 | 
 4 | #include "MutexLock.hpp"
 5 | 
 6 | namespace limonp {
 7 | 
// Thin wrapper around a pthread condition variable that is permanently bound
// to one MutexLock. Every pthread call is wrapped in XCHECK, so a non-zero
// return code is reported through the FATAL logging path.
// The class is non-copyable (inherits from NonCopyable).
class Condition : NonCopyable {
 public:
  // Binds the condition to `mutex` (stored by reference, so the mutex must
  // outlive this object) and initializes the pthread condition with default
  // attributes (NULL attribute pointer).
  explicit Condition(MutexLock& mutex)
    : mutex_(mutex) {
    XCHECK(!pthread_cond_init(&pcond_, NULL));
  }

  // Destroys the underlying pthread condition.
  ~Condition() {
    XCHECK(!pthread_cond_destroy(&pcond_));
  }

  // Blocks on the condition using the bound mutex.
  // Per POSIX, the caller must hold the mutex when calling this, and should
  // re-check its predicate in a loop because wakeups may be spurious.
  void Wait() {
    XCHECK(!pthread_cond_wait(&pcond_, mutex_.GetPthreadMutex()));
  }

  // Wakes at least one waiter (pthread_cond_signal).
  void Notify() {
    XCHECK(!pthread_cond_signal(&pcond_));
  }

  // Wakes all current waiters (pthread_cond_broadcast).
  void NotifyAll() {
    XCHECK(!pthread_cond_broadcast(&pcond_));
  }

 private:
  MutexLock& mutex_;      // mutex passed to pthread_cond_wait; not owned
  pthread_cond_t pcond_;  // the wrapped condition variable
}; // class Condition
35 | 
36 | } // namespace limonp
37 | 
38 | #endif // LIMONP_CONDITION_HPP
39 | 


--------------------------------------------------------------------------------
/src/3rd_include/limonp/Config.hpp:
--------------------------------------------------------------------------------
  1 | /************************************
  2 |  * file enc : utf8
  3 |  * author   : wuyanyi09@gmail.com
  4 |  ************************************/
  5 | #ifndef LIMONP_CONFIG_H
  6 | #define LIMONP_CONFIG_H
  7 | 
  8 | #include <map>
  9 | #include <fstream>
 10 | #include <iostream>
 11 | #include <assert.h>
 12 | #include "StringUtil.hpp"
 13 | 
 14 | namespace limonp {
 15 | 
 16 | using namespace std;
 17 | 
 18 | class Config {
 19 |  public:
 20 |   explicit Config(const string& filePath) {
 21 |     LoadFile(filePath);
 22 |   }
 23 | 
 24 |   operator bool () {
 25 |     return !map_.empty();
 26 |   }
 27 | 
 28 |   string Get(const string& key, const string& defaultvalue) const {
 29 |     map<string, string>::const_iterator it = map_.find(key);
 30 |     if(map_.end() != it) {
 31 |       return it->second;
 32 |     }
 33 |     return defaultvalue;
 34 |   }
 35 |   int Get(const string& key, int defaultvalue) const {
 36 |     string str = Get(key, "");
 37 |     if("" == str) {
 38 |       return defaultvalue;
 39 |     }
 40 |     return atoi(str.c_str());
 41 |   }
 42 |   const char* operator [] (const char* key) const {
 43 |     if(NULL == key) {
 44 |       return NULL;
 45 |     }
 46 |     map<string, string>::const_iterator it = map_.find(key);
 47 |     if(map_.end() != it) {
 48 |       return it->second.c_str();
 49 |     }
 50 |     return NULL;
 51 |   }
 52 | 
 53 |   string GetConfigInfo() const {
 54 |     string res;
 55 |     res << *this;
 56 |     return res;
 57 |   }
 58 | 
 59 |  private:
 60 |   void LoadFile(const string& filePath) {
 61 |     ifstream ifs(filePath.c_str());
 62 |     assert(ifs);
 63 |     string line;
 64 |     vector<string> vecBuf;
 65 |     size_t lineno = 0;
 66 |     while(getline(ifs, line)) {
 67 |       lineno ++;
 68 |       Trim(line);
 69 |       if(line.empty() || StartsWith(line, "#")) {
 70 |         continue;
 71 |       }
 72 |       vecBuf.clear();
 73 |       Split(line, vecBuf, "=");
 74 |       if(2 != vecBuf.size()) {
 75 |         fprintf(stderr, "line[%s] illegal.\n", line.c_str());
 76 |         assert(false);
 77 |         continue;
 78 |       }
 79 |       string& key = vecBuf[0];
 80 |       string& value = vecBuf[1];
 81 |       Trim(key);
 82 |       Trim(value);
 83 |       if(!map_.insert(make_pair(key, value)).second) {
 84 |         fprintf(stderr, "key[%s] already exits.\n", key.c_str());
 85 |         assert(false);
 86 |         continue;
 87 |       }
 88 |     }
 89 |     ifs.close();
 90 |   }
 91 | 
 92 |   friend ostream& operator << (ostream& os, const Config& config);
 93 | 
 94 |   map<string, string> map_;
 95 | }; // class Config
 96 | 
 97 | inline ostream& operator << (ostream& os, const Config& config) {
 98 |   return os << config.map_;
 99 | }
100 | 
101 | } // namespace limonp
102 | 
103 | #endif // LIMONP_CONFIG_H
104 | 


--------------------------------------------------------------------------------
/src/3rd_include/limonp/ForcePublic.hpp:
--------------------------------------------------------------------------------
1 | #ifndef LIMONP_FORCE_PUBLIC_H
2 | #define LIMONP_FORCE_PUBLIC_H
3 | 
// Rewrites the `private` and `protected` keywords to `public` for everything
// the preprocessor sees after this header, which exposes all class members.
// WARNING: the C++ standard forbids defining macros whose names are keywords
// ([macro.names]), and any header included afterwards is affected too.
// NOTE(review): this looks like a test-only hack — confirm it is never
// included in production translation units.
#define private public
#define protected public
6 | 
7 | #endif // LIMONP_FORCE_PUBLIC_H
8 | 


--------------------------------------------------------------------------------
/src/3rd_include/limonp/LocalVector.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef LIMONP_LOCAL_VECTOR_HPP
  2 | #define LIMONP_LOCAL_VECTOR_HPP
  3 | 
  4 | #include <iostream>
  5 | #include <stdlib.h>
  6 | #include <assert.h>
  7 | #include <string.h>
  8 | 
  9 | namespace limonp {
 10 | using namespace std;
 11 | /*
 12 |  * LocalVector<T> : T must be primitive type (char , int, size_t), if T is struct or class, LocalVector<T> may be dangerous..
 13 |  * LocalVector<T> is simple and not well-tested.
 14 |  */
 15 | const size_t LOCAL_VECTOR_BUFFER_SIZE = 16;
 16 | template <class T>
 17 | class LocalVector {
 18 |  public:
 19 |   typedef const T* const_iterator ;
 20 |   typedef T value_type;
 21 |   typedef size_t size_type;
 22 |  private:
 23 |   T buffer_[LOCAL_VECTOR_BUFFER_SIZE];
 24 |   T * ptr_;
 25 |   size_t size_;
 26 |   size_t capacity_;
 27 |  public:
 28 |   LocalVector() {
 29 |     init_();
 30 |   };
 31 |   LocalVector(const LocalVector<T>& vec) {
 32 |     init_();
 33 |     *this = vec;
 34 |   }
 35 |   LocalVector(const_iterator  begin, const_iterator end) { // TODO: make it faster
 36 |     init_();
 37 |     while(begin != end) {
 38 |       push_back(*begin++);
 39 |     }
 40 |   }
 41 |   LocalVector(size_t size, const T& t) { // TODO: make it faster
 42 |     init_();
 43 |     while(size--) {
 44 |       push_back(t);
 45 |     }
 46 |   }
 47 |   ~LocalVector() {
 48 |     if(ptr_ != buffer_) {
 49 |       free(ptr_);
 50 |     }
 51 |   };
 52 |  public:
 53 |   LocalVector<T>& operator = (const LocalVector<T>& vec) {
 54 |     clear();
 55 |     size_ = vec.size();
 56 |     capacity_ = vec.capacity();
 57 |     if(vec.buffer_ == vec.ptr_) {
 58 |       memcpy(static_cast<void*>(buffer_), vec.buffer_, sizeof(T) * size_);
 59 |       ptr_ = buffer_;
 60 |     } else {
 61 |       ptr_ = (T*) malloc(vec.capacity() * sizeof(T));
 62 |       assert(ptr_);
 63 |       memcpy(static_cast<void*>(ptr_), vec.ptr_, vec.size() * sizeof(T));
 64 |     }
 65 |     return *this;
 66 |   }
 67 |  private:
 68 |   void init_() {
 69 |     ptr_ = buffer_;
 70 |     size_ = 0;
 71 |     capacity_ = LOCAL_VECTOR_BUFFER_SIZE;
 72 |   }
 73 |  public:
 74 |   T& operator [] (size_t i) {
 75 |     return ptr_[i];
 76 |   }
 77 |   const T& operator [] (size_t i) const {
 78 |     return ptr_[i];
 79 |   }
 80 |   void push_back(const T& t) {
 81 |     if(size_ == capacity_) {
 82 |       assert(capacity_);
 83 |       reserve(capacity_ * 2);
 84 |     }
 85 |     ptr_[size_ ++ ] = t;
 86 |   }
 87 |   void reserve(size_t size) {
 88 |     if(size <= capacity_) {
 89 |       return;
 90 |     }
 91 |     T * next =  (T*)malloc(sizeof(T) * size);
 92 |     assert(next);
 93 |     T * old = ptr_;
 94 |     ptr_ = next;
 95 |     memcpy(static_cast<void*>(ptr_), old, sizeof(T) * capacity_);
 96 |     capacity_ = size;
 97 |     if(old != buffer_) {
 98 |       free(old);
 99 |     }
100 |   }
101 |   bool empty() const {
102 |     return 0 == size();
103 |   }
104 |   size_t size() const {
105 |     return size_;
106 |   }
107 |   size_t capacity() const {
108 |     return capacity_;
109 |   }
110 |   const_iterator begin() const {
111 |     return ptr_;
112 |   }
113 |   const_iterator end() const {
114 |     return ptr_ + size_;
115 |   }
116 |   void clear() {
117 |     if(ptr_ != buffer_) {
118 |       free(ptr_);
119 |     }
120 |     init_();
121 |   }
122 | };
123 | 
124 | template <class T>
125 | ostream & operator << (ostream& os, const LocalVector<T>& vec) {
126 |   if(vec.empty()) {
127 |     return os << "[]";
128 |   }
129 |   os<<"[\""<<vec[0];
130 |   for(size_t i = 1; i < vec.size(); i++) {
131 |     os<<"\", \""<<vec[i];
132 |   }
133 |   os<<"\"]";
134 |   return os;
135 | }
136 | 
137 | }
138 | 
139 | #endif
140 | 


--------------------------------------------------------------------------------
/src/3rd_include/limonp/Logging.hpp:
--------------------------------------------------------------------------------
 1 | #ifndef LIMONP_LOGGING_HPP
 2 | #define LIMONP_LOGGING_HPP
 3 | 
 4 | #include <sstream>
 5 | #include <iostream>
 6 | #include <cassert>
 7 | #include <cstdlib>
 8 | #include <ctime>
 9 | 
10 | #ifdef XLOG
11 | #error "XLOG has been defined already"
12 | #endif // XLOG
13 | #ifdef XCHECK
14 | #error "XCHECK has been defined already"
15 | #endif // XCHECK
16 | 
// XLOG(level): yields the std::ostream of a temporary limonp::Logger tagged
// with the call site; the message is emitted when the temporary is destroyed.
#define XLOG(level) limonp::Logger(limonp::LL_##level, __FILE__, __LINE__).Stream() 
// XCHECK(exp): logs at FATAL level when `exp` is false; extra context can be
// streamed after it. Written as if/else (with an empty true branch) so that a
// following `else` in the caller's code cannot bind to the macro's `if`.
#define XCHECK(exp) if(exp) {} else XLOG(FATAL) << "exp: ["#exp << "] false. "
19 | 
20 | namespace limonp {
21 | 
// Log severities, in increasing order. The numeric value doubles as the index
// into LOG_LEVEL_ARRAY.
enum {
  LL_DEBUG = 0, 
  LL_INFO = 1, 
  LL_WARNING = 2, 
  LL_ERROR = 3, 
  LL_FATAL = 4,
}; // enum

// Printable name of each level, indexed by the LL_* value.
static const char * LOG_LEVEL_ARRAY[] = {"DEBUG","INFO","WARN","ERROR","FATAL"};
// strftime format of the timestamp that prefixes every message.
static const char * LOG_TIME_FORMAT = "%Y-%m-%d %H:%M:%S";

// One log statement. The constructor writes the prefix
// "<time> <file>:<line> <LEVEL> " into an internal buffer, the caller appends
// through Stream(), and the destructor writes the buffer plus a newline to
// std::cerr. A message at LL_FATAL calls abort() after being written.
// When LOGGING_LEVEL is defined, messages below that level are dropped.
class Logger {
 public:
  // level    : one of the LL_* values
  // filename : source file reported in the prefix
  // lineno   : source line reported in the prefix
  Logger(size_t level, const char* filename, int lineno)
   : level_(level) {
#ifdef LOGGING_LEVEL
     if (level_ < LOGGING_LEVEL) {
       return;
     }
#endif
    // level_ indexes LOG_LEVEL_ARRAY below, so it must be strictly less than
    // the element count.
    assert(level_ < sizeof(LOG_LEVEL_ARRAY)/sizeof(*LOG_LEVEL_ARRAY));
    
    char buf[32];
    
    time_t timeNow;
    time(&timeNow);

    struct tm tmNow;

    #if defined(_WIN32) || defined(_WIN64)
    errno_t e = localtime_s(&tmNow, &timeNow);
    assert(e == 0);
    (void)e;  // only read by the assert; avoids an unused warning under NDEBUG
    #else
    struct tm * tm_tmp = localtime_r(&timeNow, &tmNow);
    assert(tm_tmp != nullptr);
    (void)tm_tmp;  // only read by the assert; avoids an unused warning under NDEBUG
    #endif

    strftime(buf, sizeof(buf), LOG_TIME_FORMAT, &tmNow);

    stream_ << buf 
      << " " << filename 
      << ":" << lineno 
      << " " << LOG_LEVEL_ARRAY[level_] 
      << " ";
  }
  // Emits the buffered message to std::cerr; aborts after a FATAL message.
  ~Logger() {
#ifdef LOGGING_LEVEL
     if (level_ < LOGGING_LEVEL) {
       return;
     }
#endif
    std::cerr << stream_.str() << std::endl;
    if (level_ == LL_FATAL) {
      abort();
    }
  }

  // Stream the caller appends the message body to.
  std::ostream& Stream() {
    return stream_;
  }

 private:
  std::ostringstream stream_;  // prefix + message, flushed by the destructor
  size_t level_;               // LL_* severity of this message
}; // class Logger
87 | 
88 | } // namespace limonp
89 | 
90 | #endif // LIMONP_LOGGING_HPP
91 | 


--------------------------------------------------------------------------------
/src/3rd_include/limonp/NonCopyable.hpp:
--------------------------------------------------------------------------------
 1 | /************************************
 2 |  ************************************/
 3 | #ifndef LIMONP_NONCOPYABLE_H
 4 | #define LIMONP_NONCOPYABLE_H
 5 | 
 6 | namespace limonp {
 7 | 
 8 | class NonCopyable {
 9 |  protected:
10 |   NonCopyable() {
11 |   }
12 |   ~NonCopyable() {
13 |   }
14 |  private:
15 |   NonCopyable(const NonCopyable& );
16 |   const NonCopyable& operator=(const NonCopyable& );
17 | }; // class NonCopyable
18 | 
19 | } // namespace limonp
20 | 
21 | #endif // LIMONP_NONCOPYABLE_H
22 | 


--------------------------------------------------------------------------------
/src/3rd_include/limonp/StdExtension.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef LIMONP_STD_EXTEMSION_HPP
  2 | #define LIMONP_STD_EXTEMSION_HPP
  3 | 
  4 | #include <map>
  5 | 
  6 | #ifdef __APPLE__
  7 | #include <unordered_map>
  8 | #include <unordered_set>
  9 | #elif(__cplusplus >= 201103L)
 10 | #include <unordered_map>
 11 | #include <unordered_set>
 12 | #elif defined _MSC_VER
 13 | #include <unordered_map>
 14 | #include <unordered_set>
 15 | #else
 16 | #include <tr1/unordered_map>
 17 | #include <tr1/unordered_set>
 18 | namespace std {
 19 | using std::tr1::unordered_map;
 20 | using std::tr1::unordered_set;
 21 | }
 22 | 
 23 | #endif
 24 | 
 25 | #include <set>
 26 | #include <string>
 27 | #include <vector>
 28 | #include <deque>
 29 | #include <fstream>
 30 | #include <sstream>
 31 | 
 32 | namespace std {
 33 | 
 34 | template<typename T>
 35 | ostream& operator << (ostream& os, const vector<T>& v) {
 36 |   if(v.empty()) {
 37 |     return os << "[]";
 38 |   }
 39 |   os<<"["<<v[0];
 40 |   for(size_t i = 1; i < v.size(); i++) {
 41 |     os<<", "<<v[i];
 42 |   }
 43 |   os<<"]";
 44 |   return os;
 45 | }
 46 | 
 47 | template<>
 48 | inline ostream& operator << (ostream& os, const vector<string>& v) {
 49 |   if(v.empty()) {
 50 |     return os << "[]";
 51 |   }
 52 |   os<<"[\""<<v[0];
 53 |   for(size_t i = 1; i < v.size(); i++) {
 54 |     os<<"\", \""<<v[i];
 55 |   }
 56 |   os<<"\"]";
 57 |   return os;
 58 | }
 59 | 
 60 | template<typename T>
 61 | ostream& operator << (ostream& os, const deque<T>& dq) {
 62 |   if(dq.empty()) {
 63 |     return os << "[]";
 64 |   }
 65 |   os<<"[\""<<dq[0];
 66 |   for(size_t i = 1; i < dq.size(); i++) {
 67 |     os<<"\", \""<<dq[i];
 68 |   }
 69 |   os<<"\"]";
 70 |   return os;
 71 | }
 72 | 
 73 | 
 74 | template<class T1, class T2>
 75 | ostream& operator << (ostream& os, const pair<T1, T2>& pr) {
 76 |   os << pr.first << ":" << pr.second ;
 77 |   return os;
 78 | }
 79 | 
 80 | 
 81 | template<class T>
 82 | string& operator << (string& str, const T& obj) {
 83 |   stringstream ss;
 84 |   ss << obj; // call ostream& operator << (ostream& os,
 85 |   return str = ss.str();
 86 | }
 87 | 
 88 | template<class T1, class T2>
 89 | ostream& operator << (ostream& os, const map<T1, T2>& mp) {
 90 |   if(mp.empty()) {
 91 |     os<<"{}";
 92 |     return os;
 93 |   }
 94 |   os<<'{';
 95 |   typename map<T1, T2>::const_iterator it = mp.begin();
 96 |   os<<*it;
 97 |   it++;
 98 |   while(it != mp.end()) {
 99 |     os<<", "<<*it;
100 |     it++;
101 |   }
102 |   os<<'}';
103 |   return os;
104 | }
// Prints an unordered_map as {k0:v0, k1:v1, ...} in the container's iteration
// order; an empty map prints as {}. Each entry is written with the
// operator << for pair.
template<class T1, class T2>
ostream& operator << (ostream& os, const std::unordered_map<T1, T2>& mp) {
  typedef typename std::unordered_map<T1, T2>::const_iterator EntryIter;
  EntryIter cur = mp.begin();
  const EntryIter stop = mp.end();
  if(cur == stop) {
    os << "{}";
    return os;
  }
  os << '{' << *cur;
  for(++cur; cur != stop; ++cur) {
    os << ", " << *cur;
  }
  os << '}';
  return os;
}
119 | 
// Prints a set as {e0, e1, ...} in iteration order; an empty set prints as {}.
template<class T>
ostream& operator << (ostream& os, const set<T>& st) {
  typedef typename set<T>::const_iterator ElemIter;
  ElemIter cur = st.begin();
  const ElemIter stop = st.end();
  if(cur == stop) {
    return os << "{}";
  }
  os << '{' << *cur;
  for(++cur; cur != stop; ++cur) {
    os << ", " << *cur;
  }
  return os << '}';
}
137 | 
// Returns true when `contain.find(key)` locates `key`, i.e. when the result
// differs from `contain.end()`.
template<class KeyType, class ContainType>
bool IsIn(const ContainType& contain, const KeyType& key) {
  return contain.find(key) != contain.end();
}
142 | 
// Replaces `s` with everything that can still be read from `ifs` (from its
// current position to end of stream) and returns `s`.
template<class T>
basic_string<T> & operator << (basic_string<T> & s, ifstream & ifs) {
  istreambuf_iterator<T> first(ifs);
  istreambuf_iterator<T> last;
  return s.assign(first, last);
}
147 | 
// Writes the characters of `s` to `ofs` through an ostreambuf_iterator and
// returns `ofs`.
template<class T>
ofstream & operator << (ofstream & ofs, const basic_string<T>& s) {
  copy(s.begin(), s.end(), ostreambuf_iterator<T>(ofs));
  return ofs;
}
154 | 
155 | } // namespace std
156 | 
157 | #endif
158 | 


--------------------------------------------------------------------------------
/src/3rd_include/zh_normalization/TextNormalizer.h:
--------------------------------------------------------------------------------
 1 | //
 2 | // Created by tao on 25-4-9.
 3 | //
 4 | 
 5 | #ifndef TEXTNORMALIZATION_H
 6 | #define TEXTNORMALIZATION_H
 7 | 
 8 | 
 9 | #include <string>
10 | #include <vector>
11 | #include <regex>
12 | 
// Entry point of the text-normalization module.
// NOTE(review): only declarations are visible in this header; the member
// descriptions below are inferred from the signatures — confirm against the
// implementation file.
class TextNormalizer {
public:
    TextNormalizer();
    // Takes a whole text and returns a list of strings
    // (presumably one normalized sentence per entry — TODO confirm).
    std::vector<std::string> normalize(const std::string& text);
    // Normalizes one sentence and returns the result; static, so it does not
    // use SENTENCE_SPLITOR.
    static std::string normalize_sentence(const std::string& sentence);

private:
    // Regex member; presumably the sentence-boundary pattern used by _split,
    // set up by the constructor — TODO confirm.
    std::regex SENTENCE_SPLITOR;
    // Splits `text` into pieces; `lang` defaults to "zh".
    std::vector<std::string> _split(const std::string& text, const std::string& lang = "zh") const;
    // Post-processing replacement applied to a sentence
    // (exact substitutions are defined in the implementation).
    static  std::string _post_replace(const std::string& sentence);
};
24 | 
25 | #endif //TEXTNORMALIZATION_H
26 | 


--------------------------------------------------------------------------------
/src/3rd_include/zh_normalization/chinese_converter.h:
--------------------------------------------------------------------------------
 1 | #ifndef CHINESE_CONVERTER_H
 2 | #define CHINESE_CONVERTER_H
 3 | 
 4 | #include <string>
 5 | #include <unordered_map>
 6 | 
// Static-only converter between Traditional and Simplified Chinese text.
// NOTE(review): only declarations are visible here; behaviour described below
// is inferred from the names and types — confirm against the implementation.
class ChineseConverter {
public:
    // Returns `text` converted from Traditional to Simplified characters.
    static std::string traditionalToSimplified(const std::string& text);
    // Returns `text` converted from Simplified to Traditional characters.
    static std::string simplifiedToTraditional(const std::string& text);

private:
    // Code-point lookup tables (char32_t -> char32_t) for each direction.
    static std::unordered_map<char32_t, char32_t> t2s_dict;
    static std::unordered_map<char32_t, char32_t> s2t_dict;
    // Presumably fills the two tables on first use, guarded by
    // is_initialized — TODO confirm (and confirm thread-safety if needed).
    static void initializeDicts();
    static bool is_initialized;
};
18 | 
19 | #endif // CHINESE_CONVERTER_H 


--------------------------------------------------------------------------------
/src/3rd_include/zh_normalization/chronology.h:
--------------------------------------------------------------------------------
 1 | #ifndef CHRONOLOGY_H
 2 | #define CHRONOLOGY_H
 3 | 
 4 | #include <regex>
 5 | #include <string>
 6 | #include <map>
 7 | 
 8 | #include "num.h"
 9 | 
// Regular expressions and replacement callbacks for time and date
// expressions. All members are static; the regex objects are inline static
// data members, so they are constructed during static initialization.
class Chronology {
public:
    // Static handler functions
    // (each returns the replacement text for its input; the exact wording is
    // defined in the implementation file).
    static std::string time_num2str(const std::string& num_string);
    static std::string replace_time(const std::smatch& match);
    static std::string replace_date(const std::smatch& match);
    static std::string replace_date2(const std::smatch& match);

    // Static regular expressions
    // RE_TIME: H:MM or HH:MM with an optional :SS part (hours 0-23).
    static inline auto RE_TIME = std::regex(R"(([0-1]?[0-9]|2[0-3]):([0-5][0-9])(:([0-5][0-9]))?)");
    // RE_TIME_RANGE: two RE_TIME-style times joined by '~' or '-'.
    static inline auto RE_TIME_RANGE = std::regex(R"(([0-1]?[0-9]|2[0-3]):([0-5][0-9])(:([0-5][0-9]))?(~|-)([0-1]?[0-9]|2[0-3]):([0-5][0-9])(:([0-5][0-9]))?)");
    // Previous RE_DATE pattern, kept for reference:
    // static inline auto RE_DATE =  std::regex(R"(((\d{4}|\d{2})年)?((0?[1-9]|1[0-2])月)?(((0?[1-9])|((1|2)[0-9])|30|31)([日号]))?)");
    // RE_DATE: optional year, month and day parts, each followed by its
    // Chinese unit character. Every part is optional, so the pattern can also
    // match an empty string.
    static inline auto RE_DATE =     std::regex(R"(((\d{2,4})年)?(([1-9]|1[0-2])月)?(((0?[1-9])|([12][0-9])|30|31)(日|号))?)");
    // RE_DATE2: YYYY<sep>MM<sep>DD where <sep> is '-', '/' or '.' and the
    // second separator must equal the first (backreference \2).
    static inline auto RE_DATE2 = std::regex(R"((\d{4})([-/.])(0[1-9]|1[012])\2(0[1-9]|[12][0-9]|3[01]))");
};
25 | 
26 | #endif //CHRONOLOGY_H 


--------------------------------------------------------------------------------
/src/3rd_include/zh_normalization/constants.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <string>
 4 | #include <unordered_map>
 5 | #include <regex>
 6 | 
// Singleton holding the full-width/half-width character tables and the regex
// used to extract non-standard words (NSW).
// NOTE(review): member behaviour is inferred from the declarations and the
// original comments — confirm against the implementation file.
class Constants {
public:
    // Returns the singleton instance.
    static Constants& getInstance();

    // Converts full-width characters to half-width.
    std::string fullToHalf(const std::string& text);
    
    // Converts half-width characters to full-width.
    std::string halfToFull(const std::string& text);
    
    // Extracts the non-Chinese-character parts (NSW: Non-Standard-Word).
    std::vector<std::string> getNSWs(const std::string& text);

private:
    Constants();  // private constructor
    ~Constants() = default;
    
    // Copying and assignment are disabled.
    Constants(const Constants&) = delete;
    Constants& operator=(const Constants&) = delete;

    // Presumably fills the mapping tables below — TODO confirm.
    void initializeMaps();
    
    // Full-width <-> half-width mapping tables, keyed by code point.
    std::unordered_map<char32_t, char32_t> f2h_ascii_letters;
    std::unordered_map<char32_t, char32_t> h2f_ascii_letters;
    std::unordered_map<char32_t, char32_t> f2h_digits;
    std::unordered_map<char32_t, char32_t> h2f_digits;
    std::unordered_map<char32_t, char32_t> f2h_punctuations;
    std::unordered_map<char32_t, char32_t> h2f_punctuations;
    std::unordered_map<char32_t, char32_t> f2h_space;
    std::unordered_map<char32_t, char32_t> h2f_space;
    
    // Regular expression used for NSW extraction.
    std::regex re_nsw;
    bool is_initialized;
    
    // Constant definitions.
    static const std::string ASCII_LETTERS;
    static const std::string DIGITS;
    static const std::string PUNCTUATIONS;
    
    // NSW regular-expression pattern; which constant exists depends on
    // whether SUPPORT_UCS4 is defined.
    #ifdef SUPPORT_UCS4
    static const std::string NSW_PATTERN;
    #else
    static const std::string NSW_PATTERN_NO_UCS4;
    #endif
}; 


--------------------------------------------------------------------------------
/src/3rd_include/zh_normalization/num.h:
--------------------------------------------------------------------------------
 1 | //
 2 | // Created by tao on 25-4-9.
 3 | //
 4 | 
 5 | #ifndef NUM_H
 6 | #define NUM_H
 7 | 
 8 | /**
 9 |  * Rules to verbalize numbers into Chinese characters.
10 |  * https://zh.wikipedia.org/wiki/中文数字#現代中文
11 |  */
12 | 
13 | #include <string>
14 | #include <map>
15 | #include <vector>
16 | #include <regex>
17 | 
18 | class Num {
19 | public:
20 |     // 静态成员变量
21 |     static const std::map<int, std::string> UNITS;
22 |     static const std::map<std::string, std::string> DIGITS;
23 |     static const std::string COM_QUANTIFIERS;
24 | 
25 |     // // General number pattern
26 |     static inline auto RE_NUMBER = std::regex(R"((-?)((\d+)(\.\d+)?)|(\.(\d+)))");
27 |     // // Range pattern
28 |     static inline auto RE_RANGE = std::regex(R"(((-?)((\d+)(\.\d+)?)|(\.(\d+)))[-~]((-?)((\d+)(\.\d+)?)|(\.(\d+))))");
29 |     static inline auto RE_FRAC = std::regex(R"((-?)(\d+)/(\d+))");
30 |     // static inline auto RE_PERCENTAGE = std::regex(R"((-?)(\d+(\.\d+))?%)");
31 |     static inline auto RE_PERCENTAGE = std::regex(R"((-?)(\d+)(\.\d+)?%)");
32 |     static inline auto RE_INTEGER = std::regex(R"((-)(\d+))");
33 |     static inline auto RE_DEFAULT_NUM = std::regex(R"(\d{3}\d*)");
34 |     // Pure decimal numbers (both signed and unsigned)
35 |     static inline auto RE_DECIMAL_NUM = std::regex(R"((-?)((\d+)(\.\d+))|(\.(\d+)))");
36 | 
37 |     static inline auto TMP_QUANTS =  "(\\d+)([多余几\\+])?" + COM_QUANTIFIERS;
38 |     // Positive numbers with quantifiers
39 |     static inline auto RE_POSITIVE_QUANTIFIERS = std::regex(TMP_QUANTS) ;
40 |     static inline auto RE_DIGITAL = std::regex(R"(\d+)");
41 | 
42 | 
43 |     // 辅助方法
44 |     static std::vector<std::string> _get_value(const std::string& value_string, bool use_zero = true);
45 |     static std::string verbalize_cardinal(const std::string& value_string);
46 |     static std::string verbalize_digit(const std::string& value_string, bool alt_one = false);
47 |     static  std::string get_digit(const std::string& num_string) {
48 |         std::smatch match;
49 |         if (std::regex_search(num_string, match, RE_DIGITAL)) {
50 |             return match.str();
51 |         }
52 | 
53 |         return "";
54 |     }
55 | 
56 | 
57 |     // 公共方法
58 |     static std::string num2str(const std::string& value_string);
59 |     static std::string replace_frac(const std::smatch& match);
60 |     static std::string replace_percentage(const std::smatch& match);
61 |     static std::string replace_negative_num(const std::smatch& match);
62 |     static std::string replace_default_num(const std::smatch& match);
63 |     static std::string replace_positive_quantifier(const std::smatch& match);
64 |     static std::string replace_number(const std::smatch& match);
65 |     static std::string replace_range(const std::smatch& match);
66 | 
67 | };
68 | 
69 | 
// Example usage
/*
int main() {
    std::string test = "123.45";
    std::cout << Num::num2str(test) << std::endl;
    return 0;
}
*/
78 | #endif //NUM_H
79 | 


--------------------------------------------------------------------------------
/src/3rd_include/zh_normalization/phonecode.h:
--------------------------------------------------------------------------------
 1 | #ifndef PHONE_NORMALIZER_HPP
 2 | #define PHONE_NORMALIZER_HPP
 3 | 
 4 | #include <string>
 5 | #include <regex>
 6 | 
// Static-only helpers for recognising and verbalizing phone numbers.
// The class cannot be instantiated (default constructor is deleted).
class PhoneNormalizer {
public:
    // Converts a phone number to its string representation.
    // `mobile` defaults to true; how it changes the output is defined in the
    // implementation file.
    static std::string phone2str(const std::string& phone_string, bool mobile = true);

    // Replacement callback for a landline telephone number match.
    static std::string replace_phone(const std::smatch& match);

    // Replacement callback for a mobile phone number match.
    static std::string replace_mobile(const std::smatch& match);


    // Mobile phone numbers
    // Matches: 13812345678, +8613812345678, +86 13812345678
    static inline auto RE_MOBILE_PHONE  = std::regex(
    R"((\+?86 ?)?1([38]\d|5[0-35-9]|7[678]|9[89])\d{8})"
    );

    // Telephone numbers
    // Matches: 010-1234567, 0512-1234567, 12345678
     static inline auto RE_TELEPHONE  = std::regex(
     R"((0(10|2[1-3]|[3-9]\d{2})-?)?[1-9]\d{6,7})"
    );

    // National uniform numbers (400 numbers)
    // Matches: 400-123-4567, 4001234567
    static inline auto RE_NATIONAL_UNIFORM_NUMBER = std::regex(
    R"(400(-)?\d{3}(-)?\d{4})"
    );

    // Instances must not be created.
    PhoneNormalizer() = delete;
};
40 | #endif // PHONE_NORMALIZER_HPP


--------------------------------------------------------------------------------
/src/3rd_include/zh_normalization/quantifier.h:
--------------------------------------------------------------------------------
 1 | #ifndef QUANTIFIER_H
 2 | #define QUANTIFIER_H
 3 | 
 4 | #include <string>
 5 | #include <map>
 6 | #include <regex>
 7 | 
// Static-only helpers for temperatures and measurement units.
// NOTE(review): only declarations are visible here; behaviour described below
// is inferred from the names and types — confirm against the implementation.
class Quantifier {
public:
    // Replacement callback for one RE_TEMPERATURE match.
    static std::string replace_temperature(const std::smatch& match);
    // Returns the sentence with measurement units replaced. Takes `sentence`
    // by non-const reference, so it may also modify it in place — TODO confirm.
    static std::string replace_measure(std::string& sentence);
    // Optional sign, a number with optional decimals, then a temperature unit.
    static inline auto RE_TEMPERATURE = std::regex(R"((-?)(\d+(\.\d+)?)(°C|℃|度|摄氏度))");
    // Declared here, defined elsewhere.
    static const std::regex RE_PUNCTUATION;

private:
    // Presumably maps a unit symbol to its spoken form — TODO confirm.
    static const std::map<std::string, std::string> measure_dict;
};
18 | 
19 | #endif //QUANTIFIER_H 


--------------------------------------------------------------------------------
/src/cpp-pinyin/CanTone.cpp:
--------------------------------------------------------------------------------
1 | #include <cpp-pinyin/CanTone.h>
2 | 
3 | namespace Pinyin
4 | {
5 |     std::u16string CanTone::tone3ToNormal(const std::u16string &pinyin, bool v_to_u, bool neutral_tone_with_five) {
6 |         return {pinyin.begin(), pinyin.end() - 1};
7 |     }
8 | } // Pinyin
9 | 


--------------------------------------------------------------------------------
/src/cpp-pinyin/DictUtil.cpp:
--------------------------------------------------------------------------------
  1 | #include <filesystem>
  2 | #include <iostream>
  3 | #include <fstream>
  4 | #include <sstream>
  5 | #include <functional>
  6 | #include <vector>
  7 | #include <string>
  8 | 
  9 | #include "cpp-pinyin/DictUtil.h"
 10 | #include "cpp-pinyin/U16Str.h"
 11 | 
 12 | namespace Pinyin
 13 | {
 14 |     // Helper function to read and open file
 15 |     static std::ifstream openFile(const std::filesystem::path &dict_dir) {
 16 | #ifdef _WIN32
 17 |         const std::wstring wdict_dir = dict_dir.wstring();
 18 |         return std::ifstream(wdict_dir.c_str());
 19 | #else
 20 |         return std::ifstream(dict_dir.c_str());
 21 | #endif
 22 |     }
 23 | 
 24 |     // Helper function to trim whitespace from a string
 25 |     static void trim(std::string &str) {
 26 |         str.erase(0, str.find_first_not_of(" \t\r\n"));
 27 |         str.erase(str.find_last_not_of(" \t\r\n") + 1);
 28 |     }
 29 | 
 30 |     // Common function for reading lines and processing key-value pairs
 31 |     template <typename K, typename V, typename KeyFunc, typename ValueFunc>
 32 |     static bool processFile(std::ifstream &file, std::unordered_map<K, V> &resultMap,
 33 |                             const char &sep1, KeyFunc keyProcessor, ValueFunc valueProcessor) {
 34 |         if (!file.is_open()) {
 35 |             std::cerr << "Error: Unable to open file" << std::endl;
 36 |             return false;
 37 |         }
 38 | 
 39 |         std::string line;
 40 |         while (std::getline(file, line)) {
 41 |             trim(line);
 42 |             std::istringstream iss(line);
 43 |             std::string key, value;
 44 |             if (std::getline(iss, key, sep1) && std::getline(iss, value)) {
 45 |                 resultMap[keyProcessor(key)] = valueProcessor(value);
 46 |             }
 47 |         }
 48 |         return true;
 49 |     }
 50 | 
 51 |     static std::vector<std::string> split(const std::string &s, const std::string &delimiter) {
 52 |         std::vector<std::string> tokens;
 53 |         if (delimiter.empty()) {
 54 |             for (char c : s) {
 55 |                 tokens.emplace_back(1, c);
 56 |             }
 57 |         } else {
 58 |             std::string::size_type start = 0;
 59 |             std::string::size_type end = s.find(delimiter);
 60 |             while (end != std::string::npos) {
 61 |                 tokens.push_back(s.substr(start, end - start));
 62 |                 start = end + delimiter.size();
 63 |                 end = s.find(delimiter, start);
 64 |             }
 65 |             tokens.push_back(s.substr(start));
 66 |         }
 67 |         return tokens;
 68 |     }
 69 | 
 70 |     bool loadDict(const std::filesystem::path &dict_dir,
 71 |                   std::unordered_map<char16_t, char16_t> &resultMap, const char &sep1) {
 72 |         std::ifstream file = openFile(dict_dir);
 73 |         return processFile(file, resultMap, sep1,
 74 |                            [](const std::string &key) { return utf8strToU16str(key)[0]; },
 75 |                            [](const std::string &value) { return utf8strToU16str(value)[0]; });
 76 |     }
 77 | 
 78 |     bool loadDict(const std::filesystem::path &dict_dir,
 79 |                   std::unordered_map<char16_t, std::u16string> &resultMap, const char &sep1) {
 80 |         std::ifstream file = openFile(dict_dir);
 81 |         return processFile(file, resultMap, sep1,
 82 |                            [](const std::string &key) { return utf8strToU16str(key)[0]; },
 83 |                            [](const std::string &value) { return utf8strToU16str(value); });
 84 |     }
 85 | 
 86 |     bool loadDict(const std::filesystem::path &dict_dir,
 87 |                   std::unordered_map<char16_t, std::vector<std::u16string>> &resultMap, const char &sep1,
 88 |                   const std::string &sep2) {
 89 |         std::ifstream file = openFile(dict_dir);
 90 |         return processFile(file, resultMap, sep1,
 91 |                            [](const std::string &key) { return utf8strToU16str(key)[0]; },
 92 |                            [&sep2](const std::string &value)
 93 |                            {
 94 |                                std::vector<std::u16string> u8strlist;
 95 |                                for (const auto &str : split(value, sep2)) {
 96 |                                    if (!str.empty())
 97 |                                        u8strlist.emplace_back(utf8strToU16str(str));
 98 |                                }
 99 |                                return u8strlist;
100 |                            });
101 |     }
102 | 
103 |     bool loadDict(const std::filesystem::path &dict_dir,
104 |                   std::unordered_map<std::u16string, std::vector<std::u16string>> &resultMap, const char &sep1,
105 |                   const std::string &sep2) {
106 |         std::ifstream file = openFile(dict_dir);
107 |         return processFile(file, resultMap, sep1,
108 |                            [](const std::string &key) { return utf8strToU16str(key); },
109 |                            [&sep2](const std::string &value)
110 |                            {
111 |                                std::vector<std::u16string> u8strlist;
112 |                                for (const auto &str : split(value, sep2)) {
113 |                                    if (!str.empty())
114 |                                        u8strlist.emplace_back(utf8strToU16str(str));
115 |                                }
116 |                                return u8strlist;
117 |                            });
118 |     }
119 | 
120 |     bool loadAdditionalDict(const std::filesystem::path &dict_dir,
121 |                             std::unordered_map<std::u16string, std::vector<std::u16string>> &resultMap,
122 |                             const char &sep1,
123 |                             const std::string &sep2,
124 |                             const std::function<std::u16string(const std::u16string &pinyin)> &
125 |                             converterForDefaultPinyin) {
126 |         std::ifstream file = openFile(dict_dir);
127 |         return processFile(file, resultMap, sep1,
128 |                            [](const std::string &key) { return utf8strToU16str(key); },
129 |                            [&sep2, &converterForDefaultPinyin](const std::string &value)
130 |                            {
131 |                                std::vector<std::u16string> u8strlist;
132 |                                for (const auto &str : split(value, sep2)) {
133 |                                    if (!str.empty())
134 |                                        u8strlist.emplace_back(converterForDefaultPinyin(utf8strToU16str(str)));
135 |                                }
136 |                                return u8strlist;
137 |                            });
138 |     }
139 | } // namespace Pinyin
140 | 


--------------------------------------------------------------------------------
/src/cpp-pinyin/G2pglobal.cpp:
--------------------------------------------------------------------------------
 1 | #include <cpp-pinyin/G2pglobal.h>
 2 | 
 3 | #include <memory>
 4 | #include <unordered_set>
 5 | 
 6 | namespace Pinyin
 7 | {
 8 |     class G2pGlobal {
 9 |     public:
10 |         std::filesystem::path path;
11 |     };
12 | 
13 |     auto m_global = std::make_unique<G2pGlobal>();
14 | 
15 |     std::filesystem::path dictionaryPath() {
16 |         return m_global->path;
17 |     }
18 | 
19 |     void setDictionaryPath(const std::filesystem::path &dir) {
20 |         m_global->path = dir;
21 |     }
22 | 
23 |     bool isLetter(const char16_t &c) {
24 |         return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
25 |     }
26 | 
27 |     bool isHanzi(const char16_t &c) {
28 |         return c >= 0x4e00 && c <= 0x9fa5;
29 |     }
30 | 
31 |     bool isKana(const char16_t &c) {
32 |         return (c >= 0x3040 && c <= 0x309F) || (c >= 0x30A0 && c <= 0x30FF);
33 |     }
34 | 
35 |     bool isDigit(const char16_t &c) {
36 |         return c >= '0' && c <= '9';
37 |     }
38 | 
39 |     bool isSpace(const char16_t &c) {
40 |         return c == ' ';
41 |     }
42 | 
43 |     bool isSpecialKana(const char16_t &c) {
44 |         static const std::unordered_set<char16_t> specialKana = {
45 |             u'ャ', u'ュ', u'ョ', u'ゃ', u'ゅ', u'ょ',
46 |             u'ァ', u'ィ', u'ゥ', u'ェ', u'ォ', u'ぁ', u'ぃ', u'ぅ', u'ぇ', u'ぉ'
47 |         };
48 |         return specialKana.find(c) != specialKana.end();
49 |     }
50 | }
51 | 


--------------------------------------------------------------------------------
/src/cpp-pinyin/Jyutping.cpp:
--------------------------------------------------------------------------------
 1 | #include <cpp-pinyin/Jyutping.h>
 2 | 
 3 | namespace Pinyin
 4 | {
 5 |     PinyinResVector Jyutping::hanziToPinyin(const std::string &hans, CanTone::Style style, Error error,
 6 |                                             bool candidates) const {
 7 |         /*
 8 |             @param hans : raw utf-8 std::string.
 9 |             @param ManTone::Style : Preserve the pinyin tone.
10 |             @param errorType : Ignore words that have failed conversion. Default: Keep original.
11 |             @param candidates : Return all possible pinyin candidates. Default: true.
12 |             @return PinyinResVector.
13 |         */
14 |         return ChineseG2p::hanziToPinyin(hans, static_cast<int>(style), error, candidates, false, false);
15 |     }
16 | 
17 |     PinyinResVector Jyutping::hanziToPinyin(const std::vector<std::string> &hans, CanTone::Style style,
18 |                                             Error error, bool candidates) const {
19 |         /*
20 |             @param hans : raw utf-8 std::string vector, each element of the vector is a character.
21 |             @param ManTone::Style : Preserve the pinyin tone.
22 |             @param errorType : Ignore words that have failed conversion. Default: Keep original.
23 |             @param candidates : Return all possible pinyin candidates. Default: true.
24 |             @return PinyinResVector.
25 |         */
26 |         return ChineseG2p::hanziToPinyin(hans, static_cast<int>(style), error, candidates, false, false);
27 |     }
28 | 
29 |     //  Convert to Simplified Chinese.  utf-8 std::string
30 |     std::vector<std::string> Jyutping::getDefaultPinyin(const std::string &hanzi, CanTone::Style style) const {
31 |         return ChineseG2p::getDefaultPinyin(hanzi, static_cast<int>(style), false, false);
32 |     }
33 | } // Pinyin
34 | 


--------------------------------------------------------------------------------
/src/cpp-pinyin/ManTone.cpp:
--------------------------------------------------------------------------------
  1 | #include <algorithm>
  2 | #include <unordered_map>
  3 | 
  4 | #include <cpp-pinyin/G2pglobal.h>
  5 | #include <cpp-pinyin/ManTone.h>
  6 | 
  7 | namespace Pinyin
  8 | {
    // Lookup table: tone-marked letter -> (same letter without the mark, tone digit).
    // The marked forms of ü map to the plain letter 'v'. Only digits '1'..'4' occur
    // as tone values; there are no first-tone entries for n and m, and m has only
    // the second tone.
    static const std::unordered_map<char16_t, std::pair<char16_t, char16_t>> toneToNum = {
        {u'ā', {u'a', u'1'}}, {u'á', {u'a', u'2'}}, {u'ǎ', {u'a', u'3'}}, {u'à', {u'a', u'4'}},
        {u'ō', {u'o', u'1'}}, {u'ó', {u'o', u'2'}}, {u'ǒ', {u'o', u'3'}}, {u'ò', {u'o', u'4'}},
        {u'ē', {u'e', u'1'}}, {u'é', {u'e', u'2'}}, {u'ě', {u'e', u'3'}}, {u'è', {u'e', u'4'}},
        {u'ī', {u'i', u'1'}}, {u'í', {u'i', u'2'}}, {u'ǐ', {u'i', u'3'}}, {u'ì', {u'i', u'4'}},
        {u'ū', {u'u', u'1'}}, {u'ú', {u'u', u'2'}}, {u'ǔ', {u'u', u'3'}}, {u'ù', {u'u', u'4'}},
        {u'ǖ', {u'v', u'1'}}, {u'ǘ', {u'v', u'2'}}, {u'ǚ', {u'v', u'3'}}, {u'ǜ', {u'v', u'4'}},
        {u'ń', {u'n', u'2'}}, {u'ň', {u'n', u'3'}}, {u'ǹ', {u'n', u'4'}},
        {u'ḿ', {u'm', u'2'}}
    };
 20 | 
 21 |     std::u16string ManTone::toneToNormal(const std::u16string &pinyin, bool v_to_u, bool neutral_tone_with_five) {
 22 |         std::u16string result;
 23 |         result.reserve(pinyin.size());
 24 | 
 25 |         for (const char16_t &ch : pinyin) {
 26 |             if (isLetter(ch)) {
 27 |                 result += ch;
 28 |             } else {
 29 |                 const auto &it = toneToNum.find(ch);
 30 |                 result += it != toneToNum.end() ? it->second.first : ch;
 31 |             }
 32 |         }
 33 | 
 34 |         if (!v_to_u)
 35 |             std::replace(result.begin(), result.end(), u'ü', u'v');
 36 | 
 37 |         return result;
 38 |     }
 39 | 
 40 |     std::u16string ManTone::toneToTone(const std::u16string &pinyin, bool v_to_u, bool neutral_tone_with_five) {
 41 |         if (v_to_u)
 42 |             return pinyin;
 43 | 
 44 |         std::u16string result;
 45 |         result.reserve(pinyin.size());
 46 | 
 47 |         for (const char16_t &ch : pinyin) {
 48 |             if (isLetter(ch)) {
 49 |                 result += ch;
 50 |             } else {
 51 |                 result += ch == u'ü' ? u'v' : ch;
 52 |             }
 53 |         }
 54 | 
 55 |         return result;
 56 |     }
 57 | 
 58 |     std::u16string ManTone::toneToTone2(const std::u16string &pinyin, bool v_to_u, bool neutral_tone_with_five) {
 59 |         std::u16string result;
 60 |         result.reserve(pinyin.size() + 1);
 61 | 
 62 |         for (const char16_t &ch : pinyin) {
 63 |             if (isLetter(ch)) {
 64 |                 result += ch;
 65 |             } else {
 66 |                 const auto &it = toneToNum.find(ch);
 67 |                 if (it != toneToNum.end()) {
 68 |                     result += it->second.first;
 69 |                     const char16_t &toneNumber = it->second.second;
 70 |                     if (!(!neutral_tone_with_five && toneNumber == u'5'))
 71 |                         result += toneNumber;
 72 |                 } else {
 73 |                     if (!v_to_u && ch == u'ü') {
 74 |                         result += u'v';
 75 |                         continue;
 76 |                     }
 77 |                     result += ch;
 78 |                 }
 79 |             }
 80 |         }
 81 |         return result;
 82 |     }
 83 | 
 84 | 
 85 |     std::u16string ManTone::toneToTone3(const std::u16string &pinyin, bool v_to_u, bool neutral_tone_with_five) {
 86 |         std::u16string result;
 87 |         result.reserve(pinyin.size() + 1);
 88 | 
 89 |         char16_t toneNumber = u'5';
 90 | 
 91 |         for (const char16_t &ch : pinyin) {
 92 |             if (isLetter(ch)) {
 93 |                 result += ch;
 94 |             } else {
 95 |                 const auto &it = toneToNum.find(ch);
 96 |                 if (it != toneToNum.end()) {
 97 |                     result += it->second.first;
 98 |                     toneNumber = it->second.second;
 99 |                 } else {
100 |                     if (!v_to_u && ch == u'ü') {
101 |                         result += u'v';
102 |                         continue;
103 |                     }
104 |                     result += ch;
105 |                 }
106 |             }
107 |         }
108 | 
109 |         result += toneNumber;
110 | 
111 |         if (!neutral_tone_with_five && toneNumber == u'5')
112 |             result = result.substr(0, result.length() - 1);
113 |         return result;
114 |     }
115 | } // Pinyin
116 | 


--------------------------------------------------------------------------------
/src/cpp-pinyin/ManToneUtil.cpp:
--------------------------------------------------------------------------------
  1 | #include <algorithm>
  2 | #include <unordered_map>
  3 | #include <vector>
  4 | 
  5 | #include "cpp-pinyin/ManToneUtil.h"
  6 | 
  7 | namespace Pinyin
  8 | {
  9 |     // 定义 phonetic_symbol_reverse 映射表
 10 |     static const std::unordered_map<std::u16string, char16_t> phoneticSymbolReverse = {
 11 |         {u"a1", u'ā'}, {u"a2", u'á'}, {u"a3", u'ǎ'}, {u"a4", u'à'},
 12 |         {u"e1", u'ē'}, {u"e2", u'é'}, {u"e3", u'ě'}, {u"e4", u'è'},
 13 |         {u"i1", u'ī'}, {u"i2", u'í'}, {u"i3", u'ǐ'}, {u"i4", u'ì'},
 14 |         {u"o1", u'ō'}, {u"o2", u'ó'}, {u"o3", u'ǒ'}, {u"o4", u'ò'},
 15 |         {u"u1", u'ū'}, {u"u2", u'ú'}, {u"u3", u'ǔ'}, {u"u4", u'ù'},
 16 |         {u"v1", u'ǖ'}, {u"v2", u'ǘ'}, {u"v3", u'ǚ'}, {u"v4", u'ǜ'},
 17 |     };
 18 | 
 19 |     // https://github.com/mozillazg/python-pinyin/blob/master/pypinyin/style/_tone_rule.py
 20 |     int rightMarkIndex(const std::u16string &pinyin_no_tone) {
 21 |         // 'iou', 'uei', 'uen': 根据还原前的拼音进行标记
 22 |         if (pinyin_no_tone.find(u"iou") != std::string::npos) {
 23 |             return pinyin_no_tone.find('u');
 24 |         }
 25 |         if (pinyin_no_tone.find(u"uei") != std::string::npos) {
 26 |             return pinyin_no_tone.find('i');
 27 |         }
 28 |         if (pinyin_no_tone.find(u"uen") != std::string::npos) {
 29 |             return pinyin_no_tone.find('u');
 30 |         }
 31 | 
 32 |         // 有 'a' 不放过, 没 'a' 找 'o'、'e'
 33 |         static const std::vector<char16_t> vowels = {u'a', u'o', u'e'};
 34 |         for (const char16_t c : vowels) {
 35 |             const auto pos = pinyin_no_tone.find(c);
 36 |             if (pos != std::u16string::npos) {
 37 |                 return pos;
 38 |             }
 39 |         }
 40 | 
 41 |         // 'i'、'u' 若是连在一起，谁在后面就标谁
 42 |         static const std::vector<std::u16string> combos = {u"iu", u"ui"};
 43 |         for (const std::u16string &combo : combos) {
 44 |             const auto pos = pinyin_no_tone.find(combo);
 45 |             if (pos != std::u16string::npos) {
 46 |                 return pos + 1;
 47 |             }
 48 |         }
 49 | 
 50 |         // 'i'、'u'、'v'、'ü'
 51 |         static const std::vector<char16_t> other_vowels = {u'i', u'u', u'v', u'ü'};
 52 |         for (const char16_t c : other_vowels) {
 53 |             const auto pos = pinyin_no_tone.find(c);
 54 |             if (pos != std::u16string::npos) {
 55 |                 return pos;
 56 |             }
 57 |         }
 58 | 
 59 |         // 'n', 'm', 'ê'
 60 |         static const std::vector<char16_t> final_chars = {u'n', u'm', u'ê'};
 61 |         for (const char16_t c : final_chars) {
 62 |             const auto pos = pinyin_no_tone.find(c);
 63 |             if (pos != std::u16string::npos) {
 64 |                 return pos;
 65 |             }
 66 |         }
 67 | 
 68 |         // 如果没有找到合适的位置，则返回-1表示没有可以标记的位置
 69 |         return -1;
 70 |     }
 71 | 
 72 |     static bool isToneNumber(const char16_t c) {
 73 |         return c >= u'1' && c <= u'5';
 74 |     }
 75 | 
 76 |     static bool isPhoneticSymbol(const char16_t c) {
 77 |         return std::u16string(u"aeiouüv").find(c) != std::u16string::npos;
 78 |     }
 79 | 
 80 |     static std::u16string toneToTone(const std::u16string &tone2) {
 81 |         // 替换 "ü" 为 "v" 并去掉 5 和 0
 82 |         std::u16string string;
 83 |         for (const char16_t c : tone2)
 84 |             string += c == u'ü' ? u'v' : c;
 85 | 
 86 |         string.erase(std::remove(string.begin(), string.end(), u'5'), string.end());
 87 |         string.erase(std::remove(string.begin(), string.end(), u'0'), string.end());
 88 | 
 89 |         std::vector<char16_t> result;
 90 | 
 91 |         int pos = 0;
 92 |         while (pos < string.size()) {
 93 |             const char16_t &currentChar = string[pos];
 94 |             if (isPhoneticSymbol(currentChar)) {
 95 |                 if (pos + 1 < string.length() && isToneNumber(string[pos + 1])) {
 96 |                     const auto str = string.substr(pos, 2);
 97 |                     const auto it = phoneticSymbolReverse.find(str);
 98 |                     if (it != phoneticSymbolReverse.end()) {
 99 |                         result.emplace_back(it->second);
100 |                         pos += 2;
101 |                     } else {
102 |                         result.emplace_back(currentChar);
103 |                         pos++;
104 |                     }
105 |                 } else {
106 |                     result.emplace_back(currentChar);
107 |                     pos++;
108 |                 }
109 |             } else {
110 |                 result.emplace_back(currentChar);
111 |                 pos++;
112 |             }
113 |         }
114 | 
115 |         std::u16string result_str;
116 |         for (const char16_t c : result)
117 |             result_str += c == u'ü' ? u'v' : c;
118 | 
119 |         return result_str;
120 |     }
121 | 
122 |     static std::u16string tone3ToTone2(const std::u16string &pinyin) {
123 |         const auto no_number_tone3 = pinyin.size() > 1 && isToneNumber(pinyin.back())
124 |             ? pinyin.substr(0, pinyin.size() - 1)
125 |             : pinyin;
126 |         auto mark_index = rightMarkIndex(no_number_tone3);
127 |         if (mark_index == -1)
128 |             mark_index = no_number_tone3.size() - 1;
129 | 
130 |         const std::u16string before = no_number_tone3.substr(0, mark_index + 1);
131 |         const std::u16string after = no_number_tone3.substr(mark_index + 1);
132 |         const std::u16string number = pinyin.substr(pinyin.size() - 1);
133 | 
134 |         return before + number + after;
135 |     }
136 | 
137 |     std::u16string tone3ToTone(const std::u16string &pinyin) {
138 |         const auto tone2 = tone3ToTone2(pinyin);
139 |         return toneToTone(tone2);
140 |     }
141 | 
142 | 
143 | } // Pinyin
144 | 


--------------------------------------------------------------------------------
/src/cpp-pinyin/Pinyin.cpp:
--------------------------------------------------------------------------------
 1 | #include <cpp-pinyin/Pinyin.h>
 2 | 
 3 | namespace Pinyin
 4 | {
 5 |     PinyinResVector Pinyin::hanziToPinyin(const std::string &hans, ManTone::Style style, Error error, bool candidates,
 6 |                                           bool v_to_u, bool neutral_tone_with_five) const {
 7 |         /*
 8 |             @param hans : raw utf-8 std::string.
 9 |             @param ManTone::Style : Preserve the pinyin tone.
10 |             @param errorType : Ignore words that have failed conversion. Default: Keep original.
11 |             @param candidates : Return all possible pinyin candidates. Default: true.
12 |             @param v_to_u : Convert v to ü. Default: false.
13 |             @param neutral_tone_with_five : Use 5 as neutral tone. Default: false.
14 |             @return PinyinResVector.
15 |         */
16 |         return ChineseG2p::hanziToPinyin(hans, static_cast<int>(style), error, candidates, v_to_u,
17 |                                          neutral_tone_with_five);
18 |     }
19 | 
20 |     PinyinResVector Pinyin::hanziToPinyin(const std::vector<std::string> &hans, ManTone::Style style,
21 |                                           Error error, bool candidates, bool v_to_u,
22 |                                           bool neutral_tone_with_five) const {
23 |         /*
24 |             @param hans : raw utf-8 std::string vector, each element of the vector is a character.
25 |             @param ManTone::Style : Preserve the pinyin tone.
26 |             @param errorType : Ignore words that have failed conversion. Default: Keep original.
27 |             @param candidates : Return all possible pinyin candidates. Default: true.
28 |             @param v_to_u : Convert v to ü. Default: false.
29 |             @param neutral_tone_with_five : Use 5 as neutral tone. Default: false.
30 |             @return PinyinResVector.
31 |         */
32 |         return ChineseG2p::hanziToPinyin(hans, static_cast<int>(style), error, candidates, v_to_u,
33 |                                          neutral_tone_with_five);
34 |     }
35 | 
36 |     //  Convert to Simplified Chinese.  utf-8 std::string
37 |     std::vector<std::string> Pinyin::getDefaultPinyin(const std::string &hanzi, ManTone::Style style,
38 |                                                       bool v_to_u, bool neutral_tone_with_five) const {
39 |         return ChineseG2p::getDefaultPinyin(hanzi, static_cast<int>(style), v_to_u, neutral_tone_with_five);
40 |     }
41 | 
42 | }
43 | 


--------------------------------------------------------------------------------
/src/cpp-pinyin/PinyinRes.cpp:
--------------------------------------------------------------------------------
 1 | #include <cpp-pinyin/PinyinRes.h>
 2 | 
 3 | namespace Pinyin
 4 | {
 5 |     // Convert PinyinResVector to std::vector<std::string>
 6 |     std::vector<std::string> PinyinResVector::toStdVector() const {
 7 |         std::vector<std::string> result;
 8 |         result.reserve(this->size());
 9 |         for (const auto &res : *this) {
10 |             result.emplace_back(res.error ? res.hanzi : res.pinyin);
11 |         }
12 |         return result;
13 |     }
14 | 
15 |     // Convert PinyinResVector to std::string with delimiter
16 |     std::string PinyinResVector::toStdStr(const std::string &delimiter) const {
17 |         std::string result;
18 |         bool first = true;
19 | 
20 |         for (const auto &res : *this) {
21 |             if (!first) {
22 |                 result += delimiter;
23 |             }
24 |             result += res.error ? res.hanzi : res.pinyin;
25 |             first = false;
26 |         }
27 | 
28 |         return result;
29 |     }
30 | }
31 | 


--------------------------------------------------------------------------------
/src/cpp-pinyin/ToneConverter.cpp:
--------------------------------------------------------------------------------
 1 | #include <cpp-pinyin/ToneConverter.h>
 2 | 
 3 | namespace Pinyin
 4 | {
 5 |     std::u16string ToneConverter::convert(std::u16string str, int style, bool v_to_u,
 6 |                                           bool neutral_tone_with_five) const {
 7 |         const auto it = m_converts.find(style);
 8 | 
 9 |         if (it == m_converts.end()) {
10 |             return str;
11 |         }
12 |         return it->second(str, v_to_u, neutral_tone_with_five);
13 |     }
14 | }
15 | 


--------------------------------------------------------------------------------
/src/cpp-pinyin/U16Str.cpp:
--------------------------------------------------------------------------------
  1 | #include <cstdint>
  2 | #include <cpp-pinyin/U16Str.h>
  3 | 
  4 | #include <string>
  5 | #include <stdexcept>
  6 | 
  7 | namespace Pinyin
  8 | {
  9 |     std::string u16strToUtf8str(const char16_t &ch16) {
 10 |         std::string utf8str;
 11 |         utf8str.reserve(3); // UTF-16 characters could expand into 3 bytes in UTF-8
 12 |         if (ch16 <= 0x7F) {
 13 |             // 1-byte UTF-8
 14 |             utf8str.push_back(static_cast<char>(ch16));
 15 |         } else if (ch16 <= 0x7FF) {
 16 |             // 2-byte UTF-8
 17 |             utf8str.push_back(static_cast<char>(0xC0 | ((ch16 >> 6) & 0x1F)));
 18 |             utf8str.push_back(static_cast<char>(0x80 | (ch16 & 0x3F)));
 19 |         } else {
 20 |             // 3-byte UTF-8
 21 |             utf8str.push_back(static_cast<char>(0xE0 | ((ch16 >> 12) & 0x0F)));
 22 |             utf8str.push_back(static_cast<char>(0x80 | ((ch16 >> 6) & 0x3F)));
 23 |             utf8str.push_back(static_cast<char>(0x80 | (ch16 & 0x3F)));
 24 |         }
 25 |         return utf8str;
 26 |     }
 27 | 
 28 |     std::string u16strToUtf8str(const std::u16string &u16str) {
 29 |         std::string utf8str;
 30 |         utf8str.reserve(u16str.size() * 3); // UTF-16 characters could expand into 3 bytes in UTF-8
 31 | 
 32 |         for (size_t i = 0; i < u16str.size(); ++i) {
 33 |             const uint16_t ch = u16str[i];
 34 | 
 35 |             if (ch < 0x80) {
 36 |                 // 1-byte sequence
 37 |                 utf8str.push_back(static_cast<char>(ch));
 38 |             } else if (ch < 0x800) {
 39 |                 // 2-byte sequence
 40 |                 utf8str.push_back(static_cast<char>(0xC0 | (ch >> 6)));
 41 |                 utf8str.push_back(static_cast<char>(0x80 | (ch & 0x3F)));
 42 |             } else if (ch >= 0xD800 && ch <= 0xDBFF) {
 43 |                 // High surrogate (part of a 4-byte UTF-16 character)
 44 |                 if (i + 1 >= u16str.size())
 45 |                     throw std::invalid_argument("Invalid UTF-16 surrogate pair");
 46 | 
 47 |                 const uint16_t low = u16str[i + 1];
 48 |                 if (low < 0xDC00 || low > 0xDFFF)
 49 |                     throw std::invalid_argument("Invalid UTF-16 surrogate pair");
 50 | 
 51 |                 const uint32_t codepoint = ((ch - 0xD800) << 10) + (low - 0xDC00) + 0x10000;
 52 |                 utf8str.push_back(static_cast<char>(0xF0 | (codepoint >> 18)));
 53 |                 utf8str.push_back(static_cast<char>(0x80 | ((codepoint >> 12) & 0x3F)));
 54 |                 utf8str.push_back(static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F)));
 55 |                 utf8str.push_back(static_cast<char>(0x80 | (codepoint & 0x3F)));
 56 |                 ++i; // Skip next low surrogate
 57 |             } else {
 58 |                 // 3-byte sequence
 59 |                 utf8str.push_back(static_cast<char>(0xE0 | (ch >> 12)));
 60 |                 utf8str.push_back(static_cast<char>(0x80 | ((ch >> 6) & 0x3F)));
 61 |                 utf8str.push_back(static_cast<char>(0x80 | (ch & 0x3F)));
 62 |             }
 63 |         }
 64 | 
 65 |         return utf8str;
 66 |     }
 67 | 
 68 |     std::u16string utf8strToU16str(const std::string &utf8str) {
 69 |         std::u16string u16str;
 70 |         u16str.reserve(utf8str.size());
 71 | 
 72 |         size_t i = 0;
 73 |         while (i < utf8str.size()) {
 74 |             const unsigned char c = utf8str[i];
 75 | 
 76 |             if (c < 0x80) {
 77 |                 // 1-byte sequence
 78 |                 u16str.push_back(c);
 79 |                 ++i;
 80 |             } else if (c < 0xE0) {
 81 |                 // 2-byte sequence
 82 |                 if (i + 1 >= utf8str.size())
 83 |                     throw std::invalid_argument("Invalid UTF-8 sequence");
 84 |                 u16str.push_back(((c & 0x1F) << 6) | (utf8str[i + 1] & 0x3F));
 85 |                 i += 2;
 86 |             } else if (c < 0xF0) {
 87 |                 // 3-byte sequence
 88 |                 if (i + 2 >= utf8str.size())
 89 |                     throw std::invalid_argument("Invalid UTF-8 sequence");
 90 |                 u16str.push_back(((c & 0x0F) << 12) | ((utf8str[i + 1] & 0x3F) << 6) | (utf8str[i + 2] & 0x3F));
 91 |                 i += 3;
 92 |             } else {
 93 |                 // 4-byte sequence (assuming UTF-32 character, but storing in UTF-16)
 94 |                 if (i + 3 >= utf8str.size())
 95 |                     throw std::invalid_argument("Invalid UTF-8 sequence");
 96 |                 uint32_t codepoint = ((c & 0x07) << 18) | ((utf8str[i + 1] & 0x3F) << 12) |
 97 |                     ((utf8str[i + 2] & 0x3F) << 6) | (utf8str[i + 3] & 0x3F);
 98 |                 codepoint -= 0x10000;
 99 |                 u16str.push_back(0xD800 | (codepoint >> 10)); // High surrogate
100 |                 u16str.push_back(0xDC00 | (codepoint & 0x3FF)); // Low surrogate
101 |                 i += 4;
102 |             }
103 |         }
104 | 
105 |         return u16str;
106 |     }
107 | }
108 | 


--------------------------------------------------------------------------------
/src/tokenizer.hpp:
--------------------------------------------------------------------------------
  1 | //
  2 | //  tokenizer.hpp
  3 | //
  4 | //  Created by MNN on 2023/09/25.
  5 | //  ZhaodeWang
  6 | //
  7 | 
  8 | #ifndef TOKENIZER_hpp
  9 | #define TOKENIZER_hpp
 10 | 
 11 | #include <vector>
 12 | #include <memory>
 13 | #include <string>
 14 | #include <unordered_map>
 15 | #include <iostream>
 16 | // #include <string_view>
 17 | #include <cstring>
 18 | class string_view_ {
 19 | public:
 20 |     string_view_() : data_(nullptr), size_(0) {}
 21 |     string_view_(const char* data) : data_(data), size_(std::strlen(data)) {}
 22 |     string_view_(const char* data, std::size_t size) : data_(data), size_(size) {}
 23 |     string_view_(const std::string& str) : data_(str.data()), size_(str.size()) {}
 24 |     constexpr string_view_(const string_view_&) noexcept = default;
 25 |     string_view_& operator=(const string_view_&) noexcept = default;
 26 |     const char& operator[](size_t pos) const { return data_[pos]; }
 27 |     constexpr const char* data() const noexcept { return data_; }
 28 |     constexpr std::size_t size() const noexcept { return size_; }
 29 |     constexpr bool empty() const { return size_ == 0; }
 30 |     std::string to_string() const { return std::string(data_, size_); }
 31 |     bool operator==(const string_view_& other) const noexcept {
 32 |         return size_ == other.size_ && strncmp(data_, other.data_, size_) == 0;
 33 |     }
 34 |     void remove_prefix(size_t n) {
 35 |         if (n < size_) {
 36 |             data_ += n;
 37 |             size_ -= n;
 38 |         } else {
 39 |             data_ = "";
 40 |             size_ = 0;
 41 |         }
 42 |     }
 43 | private:
 44 |     const char* data_;
 45 |     std::size_t size_ = 0;
 46 | };
 47 | // std::string_view impl in c++11 end
 48 | 
 49 | namespace std {
 50 |     template<>
 51 |     class hash<string_view_> {
 52 |     public:
 53 |         size_t operator()(const string_view_& sv) const {
 54 |             size_t result = 0;
 55 |             for (size_t i = 0; i < sv.size(); ++i) {
 56 |                 result = (result * 31) + static_cast<size_t>(sv[i]);
 57 |             }
 58 |             return result;
 59 |         }
 60 |     };
 61 | }
 62 | namespace MNN {
 63 | namespace Transformer {
 64 | // std::string_view impl in c++11 start
 65 | 
// Abstract base class of the tokenizers declared in this header. Subclasses
// provide vocabulary loading, the low-level encode and the per-id decode.
class Tokenizer {
public:
    // NOTE(review): presumably a marker value checked when reading a tokenizer
    // file — confirm against the implementation.
    static constexpr int MAGIC_NUMBER = 430;
    // Identifiers of the supported tokenizer kinds.
    enum TokenizerType {
        SENTENCEPIECE = 0,
        TIKTOIKEN = 1, // (sic) the spelling is part of the public interface
        BERT = 2,
        HUGGINGFACE = 3
    };
    Tokenizer() = default;
    virtual ~Tokenizer() = default;
    // Factory: builds a tokenizer from the file named `filename` and returns a
    // raw pointer. NOTE(review): ownership presumably passes to the caller — confirm.
    static Tokenizer* createTokenizer(const std::string& filename);
    // NOTE(review): presumably tests membership in stop_tokens_ — confirm.
    bool is_stop(int token);
    // NOTE(review): presumably tests membership in special_tokens_ — confirm.
    bool is_special(int token);
    // Public entry point: converts `str` to a vector of token ids.
    std::vector<int> encode(const std::string& str);
    // Converts a single token id back to its text.
    virtual std::string decode(int id) = 0;
protected:
    // Hooks used while reading a tokenizer file from `file`.
    virtual void load_special(std::ifstream& file);
    virtual bool load_vocab(std::ifstream& file) = 0;
    // Tokenizer-specific encoding; results are written to `ids`.
    virtual void encode(const std::string& str, std::vector<int>& ids) = 0;
    std::vector<int> special_tokens_;
    std::vector<int> stop_tokens_;
    std::vector<int> prefix_tokens_;
private:
    std::string mTemplate;
};
 92 | 
 93 | class Sentencepiece : public Tokenizer {
 94 | public:
 95 |     Sentencepiece() = default;
 96 |     virtual std::string decode(int id) override;
 97 | protected:
 98 |     virtual bool load_vocab(std::ifstream& file) override;
 99 |     virtual void encode(const std::string& str, std::vector<int>& ids) override;
100 | private:
101 |     enum ModelType {
102 |         UNIGRAM = 1,
103 |         BPE = 2,
104 |         WORD = 3,
105 |         CHAR = 4
106 |     };
107 |     enum PieceType {
108 |         NORMAL = 1,
109 |         UNKNOWN = 2,
110 |         CONTROL = 3,
111 |         USER_DEFINED = 4,
112 |         UNUSED = 5,
113 |         BYTE = 6
114 |     };
115 |     struct SentencePiece {
116 |         std::string piece;
117 |         float score;
118 |         PieceType type = PieceType::NORMAL;
119 |         SentencePiece() {}
120 |         SentencePiece(const std::string& p, float s, PieceType t) : piece(p), score(s), type(t) {}
121 |     };
122 |     using EncodeResult = std::vector<std::pair<string_view_, int>>;
123 | private:
124 |     // model train type
125 |     ModelType type_ = BPE;
126 |     // byte fall back enable
127 |     bool byte_fall_back_ = true;
128 |     // unknown id.
129 |     int unk_id_ = 0;
130 |     // pieces from model
131 |     std::vector<SentencePiece> sentence_pieces_;
132 |     // piece -> id map for normal pieces
133 |     std::unordered_map<std::string, int> pieces_;
134 |     // piece -> id map for control, unknown, and byte pieces
135 |     std::unordered_map<std::string, int> reserved_id_map_;
136 | private:
137 |     float get_score(int id) const;
138 |     bool is_unused(int id) const;
139 |     bool is_control(int id) const;
140 |     int piece_to_id(const std::string& w) const;
141 |     std::string byte_to_piece(unsigned char c) const;
142 |     EncodeResult bpe_encode(string_view_ str, float alpha = 0.f);
143 | };
144 | 
// Tokenizer that keeps its vocabulary in a string -> id map and an
// id-indexed string table.
class Tiktoken : public Tokenizer {
public:
    Tiktoken() = default;
    virtual std::string decode(int id) override;
protected:
    virtual bool load_vocab(std::ifstream& file) override;
    virtual void encode(const std::string& str, std::vector<int>& ids) override;
    // NOTE(review): presumably token text -> id — confirm against load_vocab.
    std::unordered_map<std::string, int> encoder_;
    // NOTE(review): presumably id -> token text — confirm against load_vocab.
    std::vector<std::string> decoder_;
};
155 | 
// Tokenizer that reuses Tiktoken's vocabulary storage and decode, but
// overrides encode and adds a word_piece helper.
class BertTokenizer : public Tiktoken {
public:
    BertTokenizer() = default;
protected:
    virtual void encode(const std::string& str, std::vector<int>& ids) override;
private:
    // Returns the ids for a single `token`.
    // NOTE(review): presumably WordPiece sub-word splitting — confirm in the implementation.
    std::vector<int> word_piece(const std::string& token);
};
164 | 
165 | class HuggingfaceTokenizer : public Tokenizer {
166 | struct hash_pair_wstring {
167 |     size_t operator()(const std::pair<std::wstring, std::wstring>& p) const {
168 |         auto hash1 = std::hash<std::wstring>{}(p.first);
169 |         auto hash2 = std::hash<std::wstring>{}(p.second);
170 |         // If hash1 == hash2, their XOR is zero.
171 |         return (hash1 != hash2) ? hash1 ^ hash2 : hash1;
172 |     }
173 | };
174 | using BPERanks = std::unordered_map<std::pair<std::wstring, std::wstring>, int, hash_pair_wstring>;
175 | public:
176 |     HuggingfaceTokenizer() = default;
177 |     virtual std::string decode(int id) override;
178 | protected:
179 |     virtual bool load_vocab(std::ifstream& file) override;
180 |     virtual void encode(const std::string& str, std::vector<int>& ids) override;
181 | private:
182 |     void bpe(const std::wstring& token, const BPERanks& bpe_ranks, std::vector<std::wstring>* result);
183 |     BPERanks bpe_ranks_;
184 |     std::unordered_map<uint8_t, wchar_t> b2u_;
185 |     std::unordered_map<wchar_t, uint8_t> u2b_;
186 |     std::unordered_map<std::string, int> encoder_;
187 |     std::vector<std::string> decoder_;
188 | };
189 | };
190 | };
191 | 
192 | #endif // TOKENIZER_hpp
193 | 


--------------------------------------------------------------------------------
/src/tts.cpp:
--------------------------------------------------------------------------------
 1 | //
 2 | //  tts.cpp
 3 | //
 4 | //  Created by MNN on 2025/2/20.
 5 | //  ZhaodeWang
 6 | //
 7 | 
 8 | #include "tts.hpp"
 9 | #include "ttsconfig.hpp"
10 | #include "tokenizer.hpp"
11 | #include "zhg2p.hpp"
12 | #include <audio/audio.hpp>
13 | 
14 | #include <cmath>
15 | #include <fstream>
16 | #include <iostream>
17 | #include <sstream>
18 | #include <memory>
19 | 
20 | using namespace MNN::Express;
21 | namespace MNN {
22 | namespace Transformer {
23 | 
24 | Tts *Tts::createTTS(const std::string &config_path) {
25 |     std::shared_ptr<TtsConfig> config(new TtsConfig(config_path));
26 |     return new Tts(config);
27 | }
28 | 
29 | Tts::~Tts() {}
30 | 
31 | void Tts::load() {
32 |     g2p_.reset(new Zhg2p());
33 |     tokenizer_.reset(Tokenizer::createTokenizer(config_->tokenizer_file()));
34 |     {
35 |         ScheduleConfig config;
36 |         BackendConfig cpuBackendConfig;
37 |         config.type          = MNN_FORWARD_CPU;
38 |         config.numThread     = 4;
39 |         // cpuBackendConfig.power = BackendConfig::Power_Low;
40 |         cpuBackendConfig.memory = BackendConfig::Memory_Low;
41 |         // cpuBackendConfig.precision = BackendConfig::Precision_Low;
42 |         config.backendConfig = &cpuBackendConfig;
43 |         // ExecutorScope::Current()->setGlobalExecutorConfig(config.type, cpuBackendConfig, config.numThread);
44 |         runtime_manager_.reset(Executor::RuntimeManager::createRuntimeManager(config));
45 |     }
46 |     Module::Config module_config;
47 |     module_config.shapeMutable = true;
48 |     module_config.rearrange = true;
49 |     module_.reset(Module::load({"style", "input_ids", "speed"}, {"waveform"},
50 |                                 config_->tts_model().c_str(), runtime_manager_, &module_config));
51 |     voices_ = Express::Variable::load(config_->voices().c_str());
52 | }
53 | 
54 | template <typename T>
55 | static inline VARP _var(std::vector<T> vec, const std::vector<int> &dims) {
56 |     return _Const(vec.data(), dims, NCHW, halide_type_of<T>());
57 | }
58 | 
59 | Express::VARP Tts::generate(const std::string &text, float speed) {
60 |     std::string phonemes = g2p_->g2p(text);
61 |     auto ids = tokenizer_->encode(phonemes);
62 |     int len = ids.size();
63 |     ids.push_back(0);
64 |     ids.insert(ids.begin(), 0);
65 |     auto style = Express::_Squeeze(Express::_GatherV2(voices_[0], _var<int>({len}, {1}), _var<int>({0}, {1})), {0});
66 |     auto input_ids = _var<int>(ids, {1, len + 2});
67 |     auto speed_var = _var<float>({speed}, {1});
68 |     auto ouputs = module_->onForward({style, input_ids, speed_var});
69 |     auto wavform = ouputs[0];
70 |     return wavform;
71 | }
72 | 
// Writes `wavform` to `path` through AUDIO::save.
// NOTE(review): 24000 is presumably the sample rate in Hz — confirm against AUDIO::save.
void Tts::save(const std::string &path, Express::VARP wavform) {
    AUDIO::save(path, wavform, 24000);
}
76 | 
77 | } // namespace Transformer
78 | } // namespace MNN
79 | 


--------------------------------------------------------------------------------
/src/tts_demo.cpp:
--------------------------------------------------------------------------------
 1 | //
 2 | //  tts_demo.cpp
 3 | //
 4 | //  Created by MNN on 2024/2/20.
 5 | //  ZhaodeWang
 6 | //
 7 | 
 8 | #include "tts.hpp"
 9 | 
10 | using namespace MNN::Transformer;
11 | 
12 | int main(int argc, const char* argv[]) {
13 |     if (argc < 3) {
14 |         std::cout << "Usage: " << argv[0] << " config.json text <file_name>" << std::endl;
15 |         return 0;
16 |     }
17 | 
18 |     std::string config_path = argv[1];
19 |     std::unique_ptr<Tts> tts(Tts::createTTS(config_path));
20 |     std::string text = argv[2];
21 |     tts->load();
22 |     auto wavform = tts->generate(text, 1.0);
23 |     std::string filename = "output.wav";
24 |     if (argc > 4) {
25 |         filename = argv[3];
26 |     }
27 |     Tts::save(filename, wavform);
28 |     return 0;
29 | }


--------------------------------------------------------------------------------
/src/ttsconfig.hpp:
--------------------------------------------------------------------------------
  1 | //
  2 | //  ttsconfig.hpp
  3 | //
  4 | //  Created by MNN on 2024/2/20.
  5 | //  ZhaodeWang
  6 | //
  7 | 
  8 | #include "rapidjson/document.h"
  9 | #include <rapidjson/writer.h>
 10 | #include <rapidjson/stringbuffer.h>
 11 | 
 12 | namespace MNN {
 13 | namespace Transformer {
 14 | 
 15 | static inline bool has_suffix(const std::string& str, const std::string& suffix) {
 16 |     return str.size() >= suffix.size() &&
 17 |     str.compare(str.size() - suffix.size(), suffix.size(), suffix) == 0;
 18 | }
 19 | 
 20 | static inline std::string base_dir(const std::string& path) {
 21 |     size_t pos = path.find_last_of("/\\");
 22 |     if (pos == std::string::npos) {
 23 |         return "./";
 24 |     } else {
 25 |         return path.substr(0, pos + 1);
 26 |     }
 27 | }
 28 | 
 29 | static inline std::string file_name(const std::string& path) {
 30 |     size_t pos = path.find_last_of("/\\");
 31 |     if (pos == std::string::npos) {
 32 |         return path;
 33 |     } else {
 34 |         return path.substr(pos + 1);
 35 |     }
 36 | }
 37 | 
 38 | bool merge_json(rapidjson::Value& destination, const rapidjson::Value& source,
 39 |                 rapidjson::Document::AllocatorType& allocator) {
 40 |     if (!source.IsObject() || !destination.IsObject()) {
 41 |         return false;
 42 |     }
 43 | 
 44 |     for (auto it = source.MemberBegin(); it != source.MemberEnd(); ++it) {
 45 |         const char* key = it->name.GetString();
 46 |         if (destination.HasMember(key)) {
 47 |             if (destination[key].IsObject() && it->value.IsObject()) {
 48 |                 // Recursively merge the two JSON objects
 49 |                 merge_json(destination[key], it->value, allocator);
 50 |             } else {
 51 |                 // Overwrite the value in the destination
 52 |                 destination[key].CopyFrom(it->value, allocator);
 53 |             }
 54 |         } else {
 55 |             // Add the value to the destination
 56 |             rapidjson::Value newKey(key, allocator);
 57 |             rapidjson::Value newValue;
 58 |             newValue.CopyFrom(it->value, allocator);
 59 |             destination.AddMember(newKey, newValue, allocator);
 60 |         }
 61 |     }
 62 |     return true;
 63 | }
 64 | 
 65 | class rapid_json_wrapper {
 66 | public:
 67 |     rapidjson::Document document;
 68 |     rapid_json_wrapper() {}
 69 |     rapid_json_wrapper(rapidjson::Document doc) : document(std::move(doc)) {}
 70 |     static rapid_json_wrapper parse(const std::ifstream& ifile) {
 71 |         std::ostringstream ostr;
 72 |         ostr << ifile.rdbuf();
 73 |         rapidjson::Document document;
 74 |         document.Parse(ostr.str().c_str());
 75 |         rapid_json_wrapper json_wrapper(std::move(document));
 76 |         return json_wrapper;
 77 |     }
 78 |     static rapid_json_wrapper parse(const char* str) {
 79 |         rapidjson::Document document;
 80 |         document.Parse(str);
 81 |         rapid_json_wrapper json_wrapper(std::move(document));
 82 |         return json_wrapper;
 83 |     }
 84 |     bool merge(const char* str) {
 85 |         rapidjson::Document input_doc;
 86 |         input_doc.Parse(str);
 87 |         if (input_doc.HasParseError()) {
 88 |             return false;
 89 |         }
 90 |         // merge
 91 |         rapidjson::Document::AllocatorType& allocator = document.GetAllocator();
 92 |         return merge_json(document, input_doc, allocator);
 93 |     }
 94 |     std::string dump() {
 95 |         rapidjson::StringBuffer buffer;
 96 |         rapidjson::Writer<rapidjson::StringBuffer> writer(buffer);
 97 |         document.Accept(writer);
 98 |         return buffer.GetString();
 99 |     }
100 |     // read value
101 |     int value(const char* key, const int& default_value) const {
102 |         if (document.HasMember(key)) {
103 |             const auto& value = document[key];
104 |             if (value.IsInt()) return value.GetInt();
105 |         }
106 |         return default_value;
107 |     }
108 |     float value(const char* key, const float& default_value) const {
109 |         if (document.HasMember(key)) {
110 |             const auto& value = document[key];
111 |             if (value.IsFloat()) return value.GetFloat();
112 |         }
113 |         return default_value;
114 |     }
115 |     bool value(const char* key, const bool& default_value) const {
116 |         if (document.HasMember(key)) {
117 |             const auto& value = document[key];
118 |             if (value.IsBool()) return value.GetBool();
119 |         }
120 |         return default_value;
121 |     }
122 |     std::string value(const char* key, const std::string& default_value) const {
123 |         if (document.HasMember(key)) {
124 |             const auto& value = document[key];
125 |             if (value.IsString()) return value.GetString();
126 |         }
127 |         return default_value;
128 |     }
129 |     std::vector<int> value(const char* key, const std::vector<int>& default_value) const {
130 |         if (document.HasMember(key)) {
131 |             const auto& value = document[key];
132 |             if (value.IsArray()) {
133 |                 std::vector<int> result;
134 |                 for (auto& v : value.GetArray()) {
135 |                     if (v.IsInt()) {
136 |                         result.push_back(v.GetInt());
137 |                     }
138 |                 }
139 |                 return result;
140 |             }
141 |         }
142 |         return default_value;
143 |     }
144 |     std::vector<float> value(const char* key, const std::vector<float>& default_value) const {
145 |         if (document.HasMember(key)) {
146 |             const auto& value = document[key];
147 |             if (value.IsArray()) {
148 |                 std::vector<float> result;
149 |                 for (auto& v : value.GetArray()) {
150 |                     if (v.IsFloat()) {
151 |                         result.push_back(v.GetFloat());
152 |                     }
153 |                 }
154 |                 return result;
155 |             }
156 |         }
157 |         return default_value;
158 |     }
159 |     std::vector<std::string> value(const char* key, const std::vector<std::string>& default_value) const {
160 |         if (document.HasMember(key)) {
161 |             const auto& value = document[key];
162 |             if (value.IsArray()) {
163 |                 std::vector<std::string> result;
164 |                 for (auto& v : value.GetArray()) {
165 |                     if (v.IsString()) {
166 |                         result.push_back(v.GetString());
167 |                     }
168 |                 }
169 |                 return result;
170 |             }
171 |         }
172 |         return default_value;
173 |     }
174 |     std::string value(const char key[], const char default_value[]) const {
175 |         return value(key, std::string(default_value));
176 |     }
177 | };
178 | 
179 | class TtsConfig {
180 | public:
181 |     std::string base_dir_;
182 |     rapid_json_wrapper config_, tts_config_;
183 |     TtsConfig() {}
184 |     TtsConfig(const std::string& path) {
185 |         // load config
186 |         if (has_suffix(path, ".json")) {
187 |             std::ifstream config_file(path);
188 |             if (config_file.is_open()) {
189 |                 config_ = rapid_json_wrapper::parse(config_file);
190 |             } else {
191 |                 std::cerr << "Unable to open config file: " << path << std::endl;
192 |             }
193 |             base_dir_ = base_dir(path);
194 |         }
195 |         // using config's base_dir
196 |         base_dir_ = config_.value("base_dir", base_dir_);
197 |         // load llm_config for model info
198 |         std::ifstream tts_config_file(tts_config());
199 |         if (tts_config_file.is_open()) {
200 |             tts_config_ = rapid_json_wrapper::parse(tts_config_file);
201 |         } else {
202 |             std::cerr << "Unable to open asr_config file: " << tts_config() << std::endl;
203 |         }
204 |     }
205 | 
206 |     // < model file config start
207 |     std::string tts_config() const {
208 |         return base_dir_ + config_.value("tts_config", "tts_config.json");
209 |     }
210 | 
211 |     std::string tts_model() const {
212 |         return base_dir_ + config_.value("tts_model", "tts.mnn");
213 |     }
214 | 
215 |     std::string voices() const {
216 |         return base_dir_ + config_.value("voices", "voices.mnn");
217 |     }
218 | 
219 |     std::string tokenizer_file() const {
220 |         return base_dir_ + config_.value("tokenizer_file", "tokenizer.txt");
221 |     }
222 |     // model file config end >
223 | 
224 |     // < backend config start
225 |     std::string backend_type() const {
226 |         return config_.value("backend_type", "cpu");
227 |     }
228 | 
229 |     int thread_num() const {
230 |         return config_.value("thread_num", 4);
231 |     }
232 | 
233 |     std::string precision() const {
234 |         return config_.value("precision", "low");
235 |     }
236 |     std::string power() const {
237 |         return config_.value("power", "normal");
238 |     }
239 | 
240 |     std::string memory() const {
241 |         return config_.value("memory", "low");
242 |     }
243 |     // backend config end >
244 | 
245 |     // < tts model config start
246 |     std::vector<float> var() const {
247 |         return tts_config_.value("var", std::vector<float>{});
248 |     }
249 |     // tts model config end >
250 | };
251 | } // Transformer
252 | } // MNN
253 | 


--------------------------------------------------------------------------------
/src/zh_normalization/chinese_converter.cpp:
--------------------------------------------------------------------------------
  1 | 
  2 | #include <zh_normalization/chinese_converter.h>
  3 | #include <codecvt>
  4 | #include <locale>
  5 | #include <string>
  6 | 
// Definitions of ChineseConverter's static data members: the two
// character-mapping tables and the flag that records whether they are filled.
std::unordered_map<char32_t, char32_t> ChineseConverter::t2s_dict;
std::unordered_map<char32_t, char32_t> ChineseConverter::s2t_dict;
bool ChineseConverter::is_initialized = false;
 11 | 
// Fills s2t_dict and t2s_dict on the first call; later calls return
// immediately because is_initialized is set at the end.
// NOTE(review): the is_initialized check is not synchronised — confirm callers
// never run this concurrently.
void ChineseConverter::initializeDicts() {
    if (is_initialized) return;
    
    // Only a sample of mappings is listed here; real use needs the complete
    // mapping table.
     s2t_dict= {
        {U'专', U'專'}, {U'业', U'業'}, {U'东', U'東'}, {U'丝', U'絲'},
     {U'丢', U'丟'}, {U'两', U'兩'}, {U'严', U'嚴'}, {U'丧', U'喪'},
     {U'个', U'個'}, {U'临', U'臨'}, {U'为', U'為'}, {U'举', U'舉'},
     {U'义', U'義'}, {U'乐', U'樂'}, {U'习', U'習'}, {U'乡', U'鄉'},
     {U'书', U'書'}, {U'买', U'買'}, {U'乱', U'亂'}, {U'争', U'爭'},
     {U'于', U'於'}, {U'亚', U'亞'}, {U'亲', U'親'}, {U'云', U'雲'},
     {U'产', U'產'}, {U'亩', U'畝'}, {U'亿', U'億'}, {U'仅', U'僅'},
     {U'从', U'從'}, {U'仓', U'倉'}, {U'仪', U'儀'}, {U'们', U'們'},
     {U'价', U'價'}, {U'众', U'眾'}, {U'优', U'優'}, {U'会', U'會'},
     {U'伛', U'傴'}, {U'伞', U'傘'}, {U'伟', U'偉'}, {U'传', U'傳'},
     {U'体', U'體'}, {U'余', U'餘'}, {U'佣', U'傭'}, {U'侠', U'俠'},
     {U'侣', U'侶'}, {U'侥', U'僥'}, {U'侦', U'偵'}, {U'侧', U'側'},
     {U'侨', U'僑'}, {U'侩', U'儈'}, {U'侪', U'儕'}, {U'债', U'債'},
     {U'倾', U'傾'}, {U'假', U'假'}, {U'偿', U'償'}, {U'储', U'儲'},
     {U'军', U'軍'}, {U'农', U'農'}, {U'冯', U'馮'}, {U'决', U'決'},
     {U'况', U'況'}, {U'冲', U'衝'}, {U'净', U'淨'}, {U'准', U'準'},
     {U'凤', U'鳳'}, {U'别', U'別'}, {U'刘', U'劉'}, {U'动', U'動'},
     {U'务', U'務'}, {U'医', U'醫'}, {U'华', U'華'}, {U'协', U'協'},
     {U'单', U'單'}, {U'卖', U'賣'}, {U'占', U'佔'}, {U'卫', U'衛'},
     {U'压', U'壓'}, {U'厅', U'廳'}, {U'历', U'歷'}, {U'县', U'縣'},
     {U'叠', U'疊'}, {U'发', U'發'}, {U'变', U'變'}, {U'台', U'臺'},
     {U'叶', U'葉'}, {U'号', U'號'}, {U'后', U'後'}, {U'向', U'向'},
     {U'吓', U'嚇'}, {U'吗', U'嗎'}, {U'听', U'聽'}, {U'启', U'啟'},
     {U'员', U'員'}, {U'周', U'週'}, {U'响', U'響'}, {U'国', U'國'}
    };

     // Reverse table: the same pairs as above with key and value swapped.
     t2s_dict = {
        {U'專', U'专'}, {U'業', U'业'}, {U'東', U'东'}, {U'絲', U'丝'},
          {U'丟', U'丢'}, {U'兩', U'两'}, {U'嚴', U'严'}, {U'喪', U'丧'},
          {U'個', U'个'}, {U'臨', U'临'}, {U'為', U'为'}, {U'舉', U'举'},
          {U'義', U'义'}, {U'樂', U'乐'}, {U'習', U'习'}, {U'鄉', U'乡'},
          {U'書', U'书'}, {U'買', U'买'}, {U'亂', U'乱'}, {U'爭', U'争'},
          {U'於', U'于'}, {U'亞', U'亚'}, {U'親', U'亲'}, {U'雲', U'云'},
          {U'產', U'产'}, {U'畝', U'亩'}, {U'億', U'亿'}, {U'僅', U'仅'},
          {U'從', U'从'}, {U'倉', U'仓'}, {U'儀', U'仪'}, {U'們', U'们'},
          {U'價', U'价'}, {U'眾', U'众'}, {U'優', U'优'}, {U'會', U'会'},
          {U'傴', U'伛'}, {U'傘', U'伞'}, {U'偉', U'伟'}, {U'傳', U'传'},
          {U'體', U'体'}, {U'餘', U'余'}, {U'傭', U'佣'}, {U'俠', U'侠'},
          {U'侶', U'侣'}, {U'僥', U'侥'}, {U'偵', U'侦'}, {U'側', U'侧'},
          {U'僑', U'侨'}, {U'儈', U'侩'}, {U'儕', U'侪'}, {U'債', U'债'},
          {U'傾', U'倾'}, {U'假', U'假'}, {U'償', U'偿'}, {U'儲', U'储'},
          {U'軍', U'军'}, {U'農', U'农'}, {U'馮', U'冯'}, {U'決', U'决'},
          {U'況', U'况'}, {U'衝', U'冲'}, {U'淨', U'净'}, {U'準', U'准'},
          {U'鳳', U'凤'}, {U'別', U'别'}, {U'劉', U'刘'}, {U'動', U'动'},
          {U'務', U'务'}, {U'醫', U'医'}, {U'華', U'华'}, {U'協', U'协'},
          {U'單', U'单'}, {U'賣', U'卖'}, {U'佔', U'占'}, {U'衛', U'卫'},
          {U'壓', U'压'}, {U'廳', U'厅'}, {U'歷', U'历'}, {U'縣', U'县'},
          {U'疊', U'叠'}, {U'發', U'发'}, {U'變', U'变'}, {U'臺', U'台'},
          {U'葉', U'叶'}, {U'號', U'号'}, {U'後', U'后'}, {U'向', U'向'},
          {U'嚇', U'吓'}, {U'嗎', U'吗'}, {U'聽', U'听'}, {U'啟', U'启'},
          {U'員', U'员'}, {U'週', U'周'}, {U'響', U'响'}, {U'國', U'国'}
    };

    is_initialized = true;
}
 72 | 
 73 | std::string ChineseConverter::traditionalToSimplified(const std::string& text) {
 74 |     initializeDicts();
 75 |     
 76 |     std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> converter;
 77 |     std::u32string u32str = converter.from_bytes(text);
 78 |     
 79 |     std::u32string result;
 80 |     for (char32_t c : u32str) {
 81 |         auto it = t2s_dict.find(c);
 82 |         if (it != t2s_dict.end()) {
 83 |             result += it->second;
 84 |         } else {
 85 |             result += c;
 86 |         }
 87 |     }
 88 |     
 89 |     return converter.to_bytes(result);
 90 | }
 91 | 
 92 | std::string ChineseConverter::simplifiedToTraditional(const std::string& text) {
 93 |     initializeDicts();
 94 |     
 95 |     std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> converter;
 96 |     std::u32string u32str = converter.from_bytes(text);
 97 |     
 98 |     std::u32string result;
 99 |     for (char32_t c : u32str) {
100 |         auto it = s2t_dict.find(c);
101 |         if (it != s2t_dict.end()) {
102 |             result += it->second;
103 |         } else {
104 |             result += c;
105 |         }
106 |     }
107 |     
108 |     return converter.to_bytes(result);
109 | } 


--------------------------------------------------------------------------------
/src/zh_normalization/chronology.cpp:
--------------------------------------------------------------------------------
  1 | #include <zh_normalization/chronology.h>
  2 | #include <iostream>
  3 | 
  4 | std::string Chronology::time_num2str(const std::string& num_string) {
  5 |     // 去除前导零后转换
  6 |     size_t start_pos = num_string.find_first_not_of('0');
  7 |     std::string stripped;
  8 |     if (start_pos != std::string::npos) {
  9 |         stripped = num_string.substr(start_pos);
 10 |     } else {
 11 |         stripped = "";
 12 |     }
 13 | 
 14 |     std::string result = Num::num2str(stripped.empty() ? "0" : stripped);
 15 | 
 16 |     // 如果原始数字以0开头，添加"零"
 17 |     if (!num_string.empty() && num_string[0] == '0') {
 18 |         result = Num::DIGITS.at("0") + result;
 19 |     }
 20 | 
 21 |     return result;
 22 | }
 23 | 
 24 | std::string Chronology::replace_time(const std::smatch& match) {
 25 |     bool is_range = match.size() > 6;  // 判断是否为时间范围
 26 | 
 27 |     std::string hour = match[1].str();
 28 |     std::string minute = match[2].str();
 29 |     std::string second = match[4].str();
 30 | 
 31 |     std::string hour_2, minute_2, second_2;
 32 |     if (is_range) {
 33 |         hour_2 = match[6].str();
 34 |         minute_2 = match[7].str();
 35 |         second_2 = match[9].str();
 36 |     }
 37 | 
 38 |     std::string result = Num::num2str(hour) + "点";
 39 | 
 40 |     // 处理分钟
 41 |     if (!minute.empty()) {
 42 |         size_t start_pos = minute.find_first_not_of('0');
 43 |         std::string stripped_minute;
 44 |         if (start_pos != std::string::npos) {
 45 |             stripped_minute = minute.substr(start_pos);
 46 |         } else {
 47 |             stripped_minute = "";
 48 |         }
 49 | 
 50 |         if (!stripped_minute.empty()) {
 51 |             if (std::stoi(minute) == 30) {
 52 |                 result += "半";
 53 |             } else {
 54 |                 result += time_num2str(minute) + "分";
 55 |             }
 56 |         }
 57 |     }
 58 | 
 59 |     // 处理秒数
 60 |     if (!second.empty()) {
 61 |         size_t start_pos = second.find_first_not_of('0');
 62 |         std::string stripped_second;
 63 |         if (start_pos != std::string::npos) {
 64 |             stripped_second = second.substr(start_pos);
 65 |         } else {
 66 |             stripped_second = "";
 67 |         }
 68 | 
 69 |         if (!stripped_second.empty()) {
 70 |             result += time_num2str(second) + "秒";
 71 |         }
 72 |     }
 73 | 
 74 |     // 如果是时间范围，处理第二部分
 75 |     if (is_range) {
 76 |         result += "至";
 77 |         result += Num::num2str(hour_2) + "点";
 78 | 
 79 |         if (!minute_2.empty()) {
 80 |             size_t start_pos = minute_2.find_first_not_of('0');
 81 |             std::string stripped_minute;
 82 |             if (start_pos != std::string::npos) {
 83 |                 stripped_minute = minute_2.substr(start_pos);
 84 |             } else {
 85 |                 stripped_minute = "";
 86 |             }
 87 | 
 88 |             if (!stripped_minute.empty()) {
 89 |                 if (std::stoi(minute_2) == 30) {
 90 |                     result += "半";
 91 |                 } else {
 92 |                     result += time_num2str(minute_2) + "分";
 93 |                 }
 94 |             }
 95 |         }
 96 | 
 97 |         if (!second_2.empty()) {
 98 |             size_t start_pos = second_2.find_first_not_of('0');
 99 |             std::string stripped_second;
100 |             if (start_pos != std::string::npos) {
101 |                 stripped_second = second_2.substr(start_pos);
102 |             } else {
103 |                 stripped_second = "";
104 |             }
105 | 
106 |             if (!stripped_second.empty()) {
107 |                 result += time_num2str(second_2) + "秒";
108 |             }
109 |         }
110 |     }
111 | 
112 |     return result;
113 | }
114 | 
115 | std::string Chronology::replace_date(const std::smatch& match) {
116 |     // xx年
117 |     std::string year = match[1].str();
118 |     // xx月
119 |     std::string month = match[3].str();
120 |     // xx[日号]
121 |     std::string day = match[6].str();
122 | 
123 |     std::string tmp = match[9].str();
124 | 
125 |     std::string result;
126 | 
127 |     if (!year.empty()) {
128 |         year = Num::get_digit(year);
129 |         result += Num::verbalize_digit(year) + "年";
130 |     }
131 | 
132 |     if (!month.empty()) {
133 |         month = Num::get_digit(month);
134 |         result += Num::verbalize_cardinal(month) + "月";
135 |     }
136 | 
137 |     if (!day.empty()) {
138 |         day = Num::get_digit(day);
139 |         result += Num::verbalize_cardinal(day) + match[9].str(); // 日或号
140 |     }
141 | 
142 |     return result;
143 | }
144 | 
145 | std::string Chronology::replace_date2(const std::smatch& match) {
146 |     std::string year = match[1].str();
147 |     std::string month = match[3].str();
148 |     std::string day = match[4].str();
149 | 
150 |     std::string result = "";
151 | 
152 |     if (!year.empty()) {
153 |         result += Num::verbalize_digit(year) + "年";
154 |     }
155 | 
156 |     if (!month.empty()) {
157 |         result += Num::verbalize_cardinal(month) + "月";
158 |     }
159 | 
160 |     if (!day.empty()) {
161 |         result += Num::verbalize_cardinal(day) + "日";
162 |     }
163 | 
164 |     return result;
165 | }


--------------------------------------------------------------------------------
/src/zh_normalization/constants.cpp:
--------------------------------------------------------------------------------
  1 | #include <zh_normalization/constants.h>
  2 | #include <codecvt>
  3 | #include <locale>
  4 | 
// Definitions of the static string constants.
const std::string Constants::ASCII_LETTERS = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";
const std::string Constants::DIGITS = "0123456789";
const std::string Constants::PUNCTUATIONS = "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~";

// Pattern matching one or more characters outside the listed ranges.
// Exactly one of the two definitions is compiled, depending on SUPPORT_UCS4.
// NOTE(review): these are narrow string literals, so each \u escape is stored
// in the execution character set (presumably UTF-8); a char-based std::regex
// would then treat the ranges per byte rather than per code point — confirm.
#ifdef SUPPORT_UCS4
const std::string Constants::NSW_PATTERN = 
    "(?:[^"
    "\u3007"  // 〇 (U+3007)
    "\u3400-\u4dbf"  // CJK Extension A: [3400-4DBF]
    "\u4e00-\u9fff"  // CJK basic block: [4E00-9FFF]
    "\uf900-\ufaff"  // CJK Compatibility: [F900-FAFF]
    "\U00020000-\U0002A6DF"  // CJK Extension B: [20000-2A6DF]
    "\U0002A703-\U0002B73F"  // CJK Extension C: [2A700-2B73F]; NOTE(review): literal starts at 2A703 — confirm
    "\U0002B740-\U0002B81D"  // CJK Extension D: [2B740-2B81D]
    "\U0002F80A-\U0002FA1F"  // CJK Compatibility Supplement: [2F800-2FA1F]; NOTE(review): literal starts at 2F80A — confirm
    "])+";
#else
const std::string Constants::NSW_PATTERN_NO_UCS4 = 
    "(?:[^"
    "\u3007"  // 〇 (U+3007)
    "\u3400-\u4dbf"  // CJK Extension A: [3400-4DBF]
    "\u4e00-\u9fff"  // CJK basic block: [4E00-9FFF]
    "\uf900-\ufaff"  // CJK Compatibility: [F900-FAFF]
    "])+";
#endif
 31 | 
 32 | Constants& Constants::getInstance() {
 33 |     static Constants instance;
 34 |     return instance;
 35 | }
 36 | 
 37 | Constants::Constants() : is_initialized(false) {
 38 |     initializeMaps();
 39 |     #ifdef SUPPORT_UCS4
 40 |     re_nsw = std::regex(NSW_PATTERN, std::regex::ECMAScript | std::regex::optimize);
 41 |     #else
 42 |     re_nsw = std::regex(NSW_PATTERN_NO_UCS4, std::regex::ECMAScript | std::regex::optimize);
 43 |     #endif
 44 | }
 45 | 
 46 | void Constants::initializeMaps() {
 47 |     if (is_initialized) return;
 48 |     
 49 |     // 初始化ASCII字母映射
 50 |     for (char c : ASCII_LETTERS) {
 51 |         f2h_ascii_letters[static_cast<char32_t>(c) + 65248] = static_cast<char32_t>(c);
 52 |         h2f_ascii_letters[static_cast<char32_t>(c)] = static_cast<char32_t>(c) + 65248;
 53 |     }
 54 |     
 55 |     // 初始化数字映射
 56 |     for (char c : DIGITS) {
 57 |         f2h_digits[static_cast<char32_t>(c) + 65248] = static_cast<char32_t>(c);
 58 |         h2f_digits[static_cast<char32_t>(c)] = static_cast<char32_t>(c) + 65248;
 59 |     }
 60 |     
 61 |     // 初始化标点符号映射
 62 |     for (char c : PUNCTUATIONS) {
 63 |         f2h_punctuations[static_cast<char32_t>(c) + 65248] = static_cast<char32_t>(c);
 64 |         h2f_punctuations[static_cast<char32_t>(c)] = static_cast<char32_t>(c) + 65248;
 65 |     }
 66 |     
 67 |     // 初始化空格映射
 68 |     f2h_space[0x3000] = 0x0020;  // 全角空格到半角空格
 69 |     h2f_space[0x0020] = 0x3000;  // 半角空格到全角空格
 70 |     
 71 |     is_initialized = true;
 72 | }
 73 | 
 74 | std::string Constants::fullToHalf(const std::string& text) {
 75 |     std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> converter;
 76 |     std::u32string u32str = converter.from_bytes(text);
 77 |     
 78 |     std::u32string result;
 79 |     result.reserve(u32str.length());
 80 |     
 81 |     for (char32_t c : u32str) {
 82 |         // 检查ASCII字母
 83 |         auto it_letter = f2h_ascii_letters.find(c);
 84 |         if (it_letter != f2h_ascii_letters.end()) {
 85 |             result += it_letter->second;
 86 |             continue;
 87 |         }
 88 |         
 89 |         // 检查数字
 90 |         auto it_digit = f2h_digits.find(c);
 91 |         if (it_digit != f2h_digits.end()) {
 92 |             result += it_digit->second;
 93 |             continue;
 94 |         }
 95 |         
 96 |         // 检查标点符号
 97 |         auto it_punct = f2h_punctuations.find(c);
 98 |         if (it_punct != f2h_punctuations.end()) {
 99 |             result += it_punct->second;
100 |             continue;
101 |         }
102 |         
103 |         // 检查空格
104 |         if (c == 0x3000) {
105 |             result += 0x0020;
106 |             continue;
107 |         }
108 |         
109 |         // 其他字符保持不变
110 |         result += c;
111 |     }
112 |     
113 |     return converter.to_bytes(result);
114 | }
115 | 
116 | std::string Constants::halfToFull(const std::string& text) {
117 |     std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> converter;
118 |     std::u32string u32str = converter.from_bytes(text);
119 |     
120 |     std::u32string result;
121 |     result.reserve(u32str.length());
122 |     
123 |     for (char32_t c : u32str) {
124 |         // 检查ASCII字母
125 |         auto it_letter = h2f_ascii_letters.find(c);
126 |         if (it_letter != h2f_ascii_letters.end()) {
127 |             result += it_letter->second;
128 |             continue;
129 |         }
130 |         
131 |         // 检查数字
132 |         auto it_digit = h2f_digits.find(c);
133 |         if (it_digit != h2f_digits.end()) {
134 |             result += it_digit->second;
135 |             continue;
136 |         }
137 |         
138 |         // 检查标点符号
139 |         auto it_punct = h2f_punctuations.find(c);
140 |         if (it_punct != h2f_punctuations.end()) {
141 |             result += it_punct->second;
142 |             continue;
143 |         }
144 |         
145 |         // 检查空格
146 |         if (c == 0x0020) {
147 |             result += 0x3000;
148 |             continue;
149 |         }
150 |         
151 |         // 其他字符保持不变
152 |         result += c;
153 |     }
154 |     
155 |     return converter.to_bytes(result);
156 | }
157 | 
158 | std::vector<std::string> Constants::getNSWs(const std::string& text) {
159 |     std::vector<std::string> results;
160 |     std::string::const_iterator searchStart(text.cbegin());
161 |     std::smatch matches;
162 |     
163 |     while (std::regex_search(searchStart, text.cend(), matches, re_nsw)) {
164 |         results.push_back(matches[0].str());
165 |         searchStart = matches.suffix().first;
166 |     }
167 |     
168 |     return results;
169 | } 


--------------------------------------------------------------------------------
/src/zh_normalization/num.cpp:
--------------------------------------------------------------------------------
  1 | //
  2 | // Created by tao on 25-4-9.
  3 | //
  4 | 
  5 | #include <zh_normalization/num.h>
  6 | 
  7 | // 静态成员变量定义
  8 | const std::map<int, std::string> Num::UNITS = {
  9 |     {1, "十"},
 10 |     {2, "百"},
 11 |     {3, "千"},
 12 |     {4, "万"},
 13 |     {8, "亿"}
 14 | };
 15 | 
 16 | const std::map<std::string, std::string> Num::DIGITS = {
 17 |     {"0", "零"}, {"1", "一"}, {"2", "二"}, {"3", "三"}, {"4", "四"},
 18 |     {"5", "五"}, {"6", "六"}, {"7", "七"}, {"8", "八"}, {"9", "九"}
 19 | };
 20 | 
 21 | const std::string Num::COM_QUANTIFIERS = "(封|艘|把|目|套|段|人|所|朵|匹|张|座|回|场|尾|条|个|首|阙|阵|网|炮|顶|丘|棵|只|支|袭|辆|挑|担|颗|壳|窠|曲|墙|群|腔|砣|座|客|贯|扎|捆|刀|令|打|手|罗|坡|山|岭|江|溪|钟|队|单|双|对|出|口|头|脚|板|跳|枝|件|贴|针|线|管|名|位|身|堂|课|本|页|家|户|层|丝|毫|厘|分|钱|两|斤|担|铢|石|钧|锱|忽|(千|毫|微)克|毫|厘|(公)分|分|寸|尺|丈|里|寻|常|铺|程|(千|分|厘|毫|微)米|米|撮|勺|合|升|斗|石|盘|碗|碟|叠|桶|笼|盆|盒|杯|钟|斛|锅|簋|篮|盘|桶|罐|瓶|壶|卮|盏|箩|箱|煲|啖|袋|钵|年|月|日|季|刻|时|周|天|秒|分|小时|旬|纪|岁|世|更|夜|春|夏|秋|冬|代|伏|辈|丸|泡|粒|颗|幢|堆|条|根|支|道|面|片|张|颗|块|元|(亿|千万|百万|万|千|百)|(亿|千万|百万|万|千|百|美|)元|(亿|千万|百万|万|千|百|十|)吨|(亿|千万|百万|万|千|百|)块|角|毛|分|(公(里|引|丈|尺|寸|分|釐)))";
 22 | 
 23 | // 方法实现
 24 | std::string Num::num2str(const std::string& value_string) {
 25 |     size_t dot_pos = value_string.find('.');
 26 |     std::string integer, decimal;
 27 | 
 28 |     if (dot_pos == std::string::npos) {
 29 |         integer = value_string;
 30 |         decimal = "";
 31 |     } else {
 32 |         integer = value_string.substr(0, dot_pos);
 33 |         decimal = value_string.substr(dot_pos + 1);
 34 |     }
 35 | 
 36 |     std::string result = verbalize_cardinal(integer);
 37 | 
 38 |     // 去除decimal末尾的0
 39 |     while (!decimal.empty() && decimal.back() == '0') {
 40 |         decimal.pop_back();
 41 |     }
 42 | 
 43 |     if (!decimal.empty()) {
 44 |         // '.22' is verbalized as '零点二二'
 45 |         // '3.20' is verbalized as '三点二'
 46 |         if (result.empty()) {
 47 |             result = "零";
 48 |         }
 49 |         result += "点" + verbalize_digit(decimal);
 50 |     }
 51 | 
 52 |     return result;
 53 | }
 54 | 
 55 | // 分数表达式
 56 | std::string Num::replace_frac(const std::smatch& match) {
 57 |     std::string sign = match[1].str();
 58 |     std::string nominator = match[2].str();
 59 |     std::string denominator = match[3].str();
 60 |     sign = sign.empty() ? "" : "负";
 61 |     nominator = num2str(nominator);
 62 |     denominator = num2str(denominator);
 63 |     return sign + denominator + "分之" + nominator;
 64 | }
 65 | 
 66 | std::string Num::replace_percentage(const std::smatch& match) {
 67 |     bool is_negative = !match[1].str().empty();  // Check if '-' exists
 68 |     std::string integer_part = match[2].str();   // Main number part
 69 |     std::string decimal_part = match[3].matched ? match[3].str() : "";  // Decimal part if exists
 70 | 
 71 |     std::string result;
 72 |     if (is_negative) {
 73 |         result += "负";
 74 |     }
 75 |     result += "百分之" + num2str(integer_part + decimal_part);
 76 |     return result;
 77 | }
 78 | 
 79 | // 处理负数
 80 | std::string Num::replace_negative_num(const std::smatch& match) {
 81 |     std::string sign = match[1].str();
 82 |     std::string number = match[2].str();
 83 |     sign = sign.empty() ? "" : "负";
 84 |     number = num2str(number);
 85 |     return sign + number;
 86 | }
 87 | 
 88 | // 编号-无符号整形
 89 | std::string Num::replace_default_num(const std::smatch& match) {
 90 |     std::string number = match[0].str();
 91 |     return verbalize_digit(number, true);
 92 | }
 93 | 
 94 | // 正整数 + 量词
 95 | std::string Num::replace_positive_quantifier(const std::smatch& match) {
 96 |     std::string number = match[1].str();
 97 |     std::string match_2 = match[2].str();
 98 |     if (match_2 == "+") {
 99 |         match_2 = "多";
100 |     }
101 |     std::string quantifiers = match[3].str();
102 |     number = num2str(number);
103 |     return number + match_2 + quantifiers;
104 | }
105 | 
106 | // 数字表达式
107 | std::string Num::replace_number(const std::smatch& match) {
108 |     std::string sign = match[1].str();
109 |     std::string number = match[2].str();
110 |     std::string pure_decimal = match[5].str();
111 | 
112 |     if (!pure_decimal.empty()) {
113 |         return num2str(pure_decimal);
114 |     } else {
115 |         sign = sign.empty() ? "" : "负";
116 |         number = num2str(number);
117 |         return sign + number;
118 |     }
119 | }
120 | 
121 | // 范围表达式
122 | std::string Num::replace_range(const std::smatch& match) {
123 |     std::string first = match[1].str();
124 |     std::string second = match[8].str();
125 | 
126 |     // 使用正则替换first和second
127 |     std::regex re_number(R"((-?)((\d+)(\.\d+)?)|(\.(\d+)))");
128 | 
129 |     // 使用迭代器方式处理first字符串
130 |     std::string first_result;
131 |     std::sregex_iterator first_it(first.begin(), first.end(), re_number);
132 |     std::sregex_iterator end;
133 | 
134 |     size_t last_pos = 0;
135 |     while (first_it != end) {
136 |         std::smatch match = *first_it;
137 |         // 添加未匹配部分
138 |         first_result += first.substr(last_pos, match.position() - last_pos);
139 |         // 添加转换后的数字
140 |         first_result += replace_number(match);
141 |         last_pos = match.position() + match.length();
142 |         ++first_it;
143 |     }
144 |     // 添加剩余部分
145 |     first_result += first.substr(last_pos);
146 | 
147 |     // 使用相同方式处理second字符串
148 |     std::string second_result;
149 |     std::sregex_iterator second_it(second.begin(), second.end(), re_number);
150 | 
151 |     last_pos = 0;
152 |     while (second_it != end) {
153 |         std::smatch match = *second_it;
154 |         second_result += second.substr(last_pos, match.position() - last_pos);
155 |         second_result += replace_number(match);
156 |         last_pos = match.position() + match.length();
157 |         ++second_it;
158 |     }
159 |     second_result += second.substr(last_pos);
160 | 
161 |     return first_result + "到" + second_result;
162 | }
163 | 
164 | inline std::vector<std::string> Num::_get_value(const std::string& value_string, bool use_zero) {
165 |     // 去除前导零
166 |     size_t start_pos = value_string.find_first_not_of('0');
167 |     std::string stripped;
168 |     if (start_pos != std::string::npos) {
169 |         stripped = value_string.substr(start_pos);
170 |     } else {
171 |         stripped = "";
172 |     }
173 | 
174 |     if (stripped.empty()) {
175 |         return {};
176 |     } else if (stripped.length() == 1) {
177 |         if (use_zero && stripped.length() < value_string.length()) {
178 |             return {DIGITS.at("0"), DIGITS.at(stripped)};
179 |         } else {
180 |             return {DIGITS.at(stripped)};
181 |         }
182 |     } else {
183 |         // 找到最大单位
184 |         int largest_unit = 0;
185 |         for (auto it = UNITS.rbegin(); it != UNITS.rend(); ++it) {
186 |             if (it->first < stripped.length()) {
187 |                 largest_unit = it->first;
188 |                 break;
189 |             }
190 |         }
191 | 
192 |         std::string first_part = value_string.substr(0, value_string.length() - largest_unit);
193 |         std::string second_part = value_string.substr(value_string.length() - largest_unit);
194 | 
195 |         std::vector<std::string> result = _get_value(first_part);
196 |         result.push_back(UNITS.at(largest_unit));
197 | 
198 |         std::vector<std::string> second_result = _get_value(second_part);
199 |         result.insert(result.end(), second_result.begin(), second_result.end());
200 | 
201 |         return result;
202 |     }
203 | }
204 | 
205 | std::string Num::verbalize_cardinal(const std::string& value_string) {
206 |     if (value_string.empty()) {
207 |         return "";
208 |     }
209 | 
210 |     // 去除前导零
211 |     size_t start_pos = value_string.find_first_not_of('0');
212 |     std::string stripped;
213 |     if (start_pos != std::string::npos) {
214 |         stripped = value_string.substr(start_pos);
215 |     } else {
216 |         stripped = "";
217 |     }
218 | 
219 |     // 000 -> '零' , 0 -> '零'
220 |     if (stripped.empty()) {
221 |         return DIGITS.at("0");
222 |     }
223 | 
224 |     std::vector<std::string> result_symbols = _get_value(stripped);
225 | 
226 |     // verbalized number starting with '一十*' is abbreviated as `十*`
227 |     if (result_symbols.size() >= 2 && result_symbols[0] == DIGITS.at("1") && result_symbols[1] == UNITS.at(1)) {
228 |         result_symbols.erase(result_symbols.begin());
229 |     }
230 | 
231 |     std::string result;
232 |     for (const auto& s : result_symbols) {
233 |         result += s;
234 |     }
235 | 
236 |     return result;
237 | }
238 | 
239 | std::string Num::verbalize_digit(const std::string& value_string, bool alt_one) {
240 |     std::string result;
241 |     for (char digit : value_string) {
242 |         std::string digit_str(1, digit);
243 |         result += DIGITS.at(digit_str);
244 |     }
245 | 
246 |     if (alt_one) {
247 |         // 将"一"替换为"幺"
248 |         size_t pos = 0;
249 |         while ((pos = result.find("一", pos)) != std::string::npos) {
250 |             result.replace(pos, 3, "幺");  // UTF-8中"一"和"幺"都是3字节
251 |             pos += 3;
252 |         }
253 |     }
254 | 
255 |     return result;
256 | }


--------------------------------------------------------------------------------
/src/zh_normalization/phonecode.cpp:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *     http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | #include "zh_normalization/phonecode.h"
18 | #include "zh_normalization/num.h"
19 | #include <sstream>
20 | #include <algorithm>
21 | #include <unordered_map>
22 | 
23 | std::string PhoneNormalizer::phone2str(const std::string& phone_string, bool mobile) {
24 |     std::string processed_string = phone_string;
25 |     if (mobile) {
26 |         // 处理手机号码
27 |         // 去除开头的'+'号
28 |         if (!processed_string.empty() && processed_string[0] == '+') {
29 |             processed_string = processed_string.substr(1);
30 |         }
31 | 
32 |         // 按空格分割
33 |         std::istringstream iss(processed_string);
34 |         std::vector<std::string> parts;
35 |         std::string part;
36 |         while (iss >> part) {
37 |             parts.push_back(Num::verbalize_digit(part, true));
38 |         }
39 | 
40 |         // 用逗号连接
41 |         std::string result;
42 |         for (size_t i = 0; i < parts.size(); ++i) {
43 |             if (i > 0) result += "，";
44 |             result += parts[i];
45 |         }
46 |         return result;
47 |     } else {
48 |         // 处理固定电话
49 |         std::stringstream ss(processed_string);
50 |         std::string part;
51 |         std::vector<std::string> parts;
52 | 
53 |         // 按'-'分割
54 |         while (std::getline(ss, part, '-')) {
55 |             parts.push_back(Num::verbalize_digit(part, true));
56 |         }
57 | 
58 |         // 用逗号连接
59 |         std::string result;
60 |         for (size_t i = 0; i < parts.size(); ++i) {
61 |             if (i > 0) result += "，";
62 |             result += parts[i];
63 |         }
64 |         return result;
65 |     }
66 | }
67 | 
68 | std::string PhoneNormalizer::replace_phone(const std::smatch& match) {
69 |     return phone2str(match.str(0), false);
70 | }
71 | 
72 | std::string PhoneNormalizer::replace_mobile(const std::smatch& match) {
73 |     return phone2str(match.str(0), true);
74 | } 


--------------------------------------------------------------------------------
/src/zh_normalization/quantifier.cpp:
--------------------------------------------------------------------------------
 1 | #include "zh_normalization/quantifier.h"
 2 | #include "zh_normalization/num.h"
 3 | 
 4 | const std::map<std::string, std::string> Quantifier::measure_dict = {
 5 |     {"cm2", "平方厘米"},
 6 |     {"cm²", "平方厘米"},
 7 |     {"cm3", "立方厘米"},
 8 |     {"cm³", "立方厘米"},
 9 |     {"cm", "厘米"},
10 |     {"db", "分贝"},
11 |     {"ds", "毫秒"},
12 |     {"kg", "千克"},
13 |     {"km", "千米"},
14 |     {"m2", "平方米"},
15 |     {"m²", "平方米"},
16 |     {"m³", "立方米"},
17 |     {"m3", "立方米"},
18 |     {"ml", "毫升"},
19 |     {"m", "米"},
20 |     {"mm", "毫米"},
21 |     {"s", "秒"},
22 |     {"h", "小时"},
23 |     {"mg", "毫克"}
24 | };
25 | 
26 | std::string Quantifier::replace_temperature(const std::smatch& match) {
27 |     std::string sign = match[1].str();
28 |     std::string temperature = match[2].str();
29 |     std::string unit = match[4].str();
30 | 
31 |     sign = sign.empty() ? "" : "零下";
32 |     temperature = Num::num2str(temperature);
33 |     unit = (unit == "摄氏度") ? "摄氏度" : "度";
34 | 
35 |     return sign + temperature + unit;
36 | }
37 | 
38 | std::string Quantifier::replace_measure(std::string &sentence) {
39 |     for (const auto& pair : measure_dict) {
40 |         const std::string& q_notation = pair.first;
41 |         const std::string& replacement = pair.second;
42 | 
43 |         size_t pos = 0;
44 |         while ((pos = sentence.find(q_notation, pos)) != std::string::npos) {
45 |             sentence.replace(pos, q_notation.length(), replacement);
46 |             pos += replacement.length();
47 |         }
48 |     }
49 |     return sentence;
50 | } 


--------------------------------------------------------------------------------
/src/zhg2p.hpp:
--------------------------------------------------------------------------------
 1 | //
 2 | //  zhg2p.hpp
 3 | //
 4 | //  Created by MNN on 2025/2/24.
 5 | //  ZhaodeWang
 6 | //
 7 | 
 8 | #include <iostream>
 9 | #include <memory>
10 | #include <string>
11 | #include <vector>
12 | #include <regex>
13 | #include <cppjieba/Jieba.hpp>
14 | #include <cpp-pinyin/Pinyin.h>
15 | #include <cpp-pinyin/G2pglobal.h>
16 | #include "zh_normalization/TextNormalizer.h"
17 | 
18 | namespace MNN {
19 | namespace Transformer {
20 | 
21 | class Zhg2p {
22 | private:
23 |     std::unique_ptr<cppjieba::Jieba> jieba_;
24 |     std::unique_ptr<Pinyin::Pinyin> pinyin_;
25 | public:
26 |     Zhg2p();
27 |     static std::string pinyinToIPA(const std::string &py);
28 |     static std::vector<std::string> obtainPinyins(const std::string &word);
29 |     static std::string retone(const std::string &p);
30 |     static std::string py2ipa(const std::string &py);
31 |     static std::string preprocess(const std::string &text);
32 |     static std::string get_finals(const std::string &text);
33 |     static std::string get_initials(const std::string &text);
34 |     static std::string pinyin_to_phonemes(const std::string &pinyin_normal,  const std::string &pinyin_initial, const std::string &pinyin_final, int tone_nr);
35 |     static std::string pinyin_to_ipa(const std::string &pinyin);
36 |     std::string word2ipa(const std::string &word);
37 |     std::string g2p(const std::string &text);
38 | 
39 |    static inline  std::map<std::string, std::vector<std::string>> INITIAL_MAPPING = {
40 |         {"b", {"p"}}, {"c", {"ʦʰ"}}, {"ch", {"\uAB67ʰ"}}, {"d", {"t"}}, {"f", {"f"}},
41 |         {"g", {"k"}}, {"h", {"x", "h"}}, {"j", {"ʨ"}}, {"k", {"kʰ"}}, {"l", {"l"}},
42 |         {"m", {"m"}}, {"n", {"n"}}, {"p", {"pʰ"}}, {"q", {"ʨʰ"}}, {"r", {"ɻ", "ʐ"}},
43 |         {"s", {"s"}}, {"sh", {"ʂ"}}, {"t", {"tʰ"}}, {"x", {"ɕ"}}, {"z", {"ʦ"}}, {"zh", {"\uAB67"}},};
44 |    static inline std::map<std::string, std::vector<std::string>> FINAL_MAPPING = {
45 |         {"a", {"a0"}}, {"ai", {"ai̯0"}}, {"an", {"a0", "n"}}, {"ang", {"a0", "ŋ"}}, {"ao", {"au̯0"}},
46 |         {"e", {"ɤ0"}}, {"ei", {"ei̯0"}}, {"en", {"ə0", "n"}}, {"eng", {"ə0", "ŋ"}}, {"i", {"i0"}},
47 |         {"ia", {"j", "a0"}}, {"ian", {"j", "ɛ0", "n"}}, {"iang", {"j", "a0", "ŋ"}}, {"iao", {"j", "au̯0"}},
48 |         {"ie", {"j", "e0"}}, {"in", {"i0", "n"}}, {"iou", {"j", "ou̯0"}}, {"ing", {"i0", "ŋ"}},
49 |         {"iong", {"j", "ʊ0", "ŋ"}}, {"ong", {"ʊ0", "ŋ"}}, {"ou", {"ou̯0"}}, {"u", {"u0"}}, {"uei", {"w", "ei̯0"}},
50 |         {"ua", {"w", "a0"}}, {"uai", {"w", "ai̯0"}}, {"uan", {"w", "a0", "n"}}, {"uen", {"w", "ə0", "n"}},
51 |         {"uang", {"w", "a0", "ŋ"}}, {"ueng", {"w", "ə0", "ŋ"}}, {"uo", {"w", "o0"}}, {"o", {"w", "o0"}},
52 |         {"ü", {"y0"}}, {"üe", {"ɥ", "e0"}}, {"üan", {"ɥ", "ɛ0", "n"}}, {"ün", {"y0", "n"}}
53 |     };
54 | 
55 |     static inline std::map<std::string, std::vector<std::string>> INTERJECTION_MAPPINGS = {
56 |         {"io", {"j", "ɔ0"}},
57 |         {"ê", {"ɛ0"}},
58 |         {"er", {"ɚ0", "aɚ0"}},
59 |         {"o", {"ɔ0"}},
60 |     };
61 | 
62 |     static  inline std::map<std::string, std::vector<std::string>> SYLLABIC_CONSONANT_MAPPINGS = {
63 |         {"hm", {"h", "ɔ0"}},
64 |         {"hng", {"h", "ŋ0"}},
65 |         {"m", {"m0"}},
66 |         {"n", {"n0"}},
67 |         {"ng", {"ŋ0"}},
68 |     };
69 | 
70 |     static  inline std::vector<std::string> ZH_CH_SH_R = {"zh", "ch", "sh", "r"};
71 |     static  inline std::vector<std::string> Z_C_S = {"z", "c", "s"};
72 |     static  inline std::vector<std::string> FINAL_MAPPING_AFTER_ZH_CH_SH_R = {"ɻ̩0", "ʐ̩0"};
73 |     static  inline std::vector<std::string> FINAL_MAPPING_AFTER_Z_C_S = {"ɹ̩0", "z̩0"};
74 | };
75 | 
76 | } // namespace Transformer
77 | } // namespace MNN


--------------------------------------------------------------------------------