// Repository listing, de-flattened into one translation unit.
//
// ├── README.md
// ├── sample.cc
// └── ChisTrie.h
//
// NOTE(review): the flattened listing had lost every template parameter /
// argument list and every #include target. They are reconstructed below from
// how the names are used; places where the original could not be determined
// are marked "assumption". ChisTrie.h is placed before sample.cc so that the
// listing compiles from top to bottom.
//
// /README.md:
// --------------------------------------------------------------------------------
// # TrieNAhoCorasick
// 带AC自动机的C++Trie树实现,支持UTF8和GBK编码
// (A C++ trie with an Aho-Corasick automaton; supports UTF-8 and GBK encodings.)
//
// --------------------------------------------------------------------------------
// /ChisTrie.h:
// --------------------------------------------------------------------------------
// #pragma once   // restore when this section is split back into its own header
#include <cstddef>
#include <forward_list>
#include <iostream>
#include <locale>
#include <map>
#include <queue>
#include <string>
#include <utility>
#include <vector>

namespace chis {

enum STATUS { LEAF, INTER, ROOT };

// Decoders split a key into trie "words". next(begin, end) returns the
// position just past the word starting at `begin`, plus that word's nominal
// length. At end of input it returns { begin, 0 }.

// GBK / GB18030 decoder: 1-, 2- or 4-byte characters.
struct Trie_decoder_GBK {
    static std::pair<const char*, std::size_t> next(const char* begin, const char* end) {
        if (!(begin < end)) {
            return { begin, 0 };
        }
        const unsigned char lead = static_cast<unsigned char>(*begin);
        std::size_t offset = 1;
        // A lead byte in 0x81..0xFE starts a double- or four-byte character.
        if (lead >= 0x81 && lead <= 0xFE && begin + 1 < end) {
            const unsigned char second = static_cast<unsigned char>(*(begin + 1));
            if (second >= 0x40 && second <= 0xFE && second != 0x7F) {
                offset = 2;  // double-byte character
            } else if (second >= 0x30 && second <= 0x39 && begin + 3 < end) {
                const unsigned char third = static_cast<unsigned char>(*(begin + 2));
                const unsigned char fourth = static_cast<unsigned char>(*(begin + 3));
                if (third >= 0x81 && third <= 0xFE && fourth >= 0x30 && fourth <= 0x39) {
                    offset = 4;  // four-byte character
                }
            }
        }
        if (begin + offset <= end) {
            return { begin + offset, offset };
        }
        return { end, offset };
    }
};

// UTF-8 decoder: the length of a character is taken from its lead byte.
struct Trie_decoder_UTF8 {
    static std::pair<const char*, std::size_t> next(const char* begin, const char* end) {
        if (!(begin < end)) {
            return { begin, 0 };
        }
        const unsigned char lead = static_cast<unsigned char>(*begin);
        // Each class is tested with its exact mask. The previous code tested
        // (lead & 0xC0) == 0xC0 first, which is true for every multi-byte lead
        // byte, so 3- and 4-byte characters were always cut after two bytes.
        std::size_t offset = 1;  // ASCII, stray continuation byte, or invalid lead
        if ((lead & 0xE0) == 0xC0) {
            offset = 2;  // 110xxxxx
        } else if ((lead & 0xF0) == 0xE0) {
            offset = 3;  // 1110xxxx
        } else if ((lead & 0xF8) == 0xF0) {
            offset = 4;  // 11110xxx
        }
        // Continuation bytes are not validated; a sequence truncated by the
        // end of input is clamped to `end` while the nominal length is
        // still reported, as before.
        if (begin + offset <= end) {
            return { begin + offset, offset };
        }
        return { end, offset };
    }
};

// Encoding-agnostic decoder: every element of the key is one word.
struct Trie_decoder_ONESTEP {
    template <class _keyItor>
    static std::pair<_keyItor, std::size_t> next(_keyItor begin, _keyItor end) {
        if (begin < end) {
            return { begin + 1, 1 };
        }
        return { begin, 0 };
    }
};

// One trie node. Nodes are owned by lctrie; the pointers here are non-owning.
template <class _wordTy, class _valTy>
struct _trie_node {
    _trie_node() = default;
    _trie_node(STATUS status) : m_status(status) {}
    std::map<_wordTy, _trie_node*> m_succTo;  // goto edges taken on a match
    STATUS m_status = INTER;                  // was left uninitialised by the default constructor
    _valTy m_val;
    _trie_node* m_failTo = nullptr;           // failure link taken on a mismatch
    std::size_t depth = 0;                    // key elements consumed from the root to this node
};

// One match: the stored value and the half-open range [begin, end) inside the
// searched key. The pointers refer to the key passed to match(), so that key
// must outlive the result.
template <class _valTy, class _keyItor>
struct _match_res {
    _valTy val;
    _keyItor begin;
    _keyItor end;
};

// Trie with optional Aho-Corasick failure links.
//   _keyTy       contiguous key container (std::string, std::wstring, std::vector<T>, ...)
//   _valTy       value stored for each inserted key
//   Trie_decoder policy that splits a key into words
// Assumption: the default decoder was lost from the listing; ONESTEP is used
// because it is the only decoder that works for every key type.
template <class _keyTy, class _valTy, class Trie_decoder = Trie_decoder_ONESTEP>
class lctrie {
    using _keyItor = typename _keyTy::const_pointer;  // const char* || const wchar_t*
    using _wordTy = _keyTy;
    using _nodeTy = _trie_node<_wordTy, _valTy>;

public:
    lctrie() {
        m_allNode.emplace_front(ROOT);
        m_root = &m_allNode.front();
    }
    // Nodes point at each other and m_root points into m_allNode, so a
    // member-wise copy would alias the source's nodes. Moving is safe because
    // std::forward_list keeps node addresses stable when it is moved.
    lctrie(const lctrie&) = delete;
    lctrie& operator=(const lctrie&) = delete;
    lctrie(lctrie&&) = default;
    lctrie& operator=(lctrie&&) = default;

    // Scans `str` and returns the keys found in it, in the order they are met.
    //   fullMatch == true : every leaf reached is reported (nested and
    //                       overlapping results included).
    //   fullMatch == false: results are filtered against the last reported one.
    //                       With `longest`, a match starting at the same
    //                       position replaces it (keeps the longer key);
    //                       otherwise a match is kept only if it starts at or
    //                       after the end of the last reported one.
    // Without build_aho_corasick() every mismatch falls back to the root.
    std::vector<_match_res<_valTy, _keyItor>> match(const _keyTy& str, bool fullMatch = false,
                                                    bool longest = true) const {
        std::vector<_match_res<_valTy, _keyItor>> ret;
        const _nodeTy* node = m_root;
        _keyItor begin = str.data();
        const _keyItor end = str.data() + str.size();
        auto next = Trie_decoder::next(begin, end);
        auto nextchr = _wordTy(begin, next.first);
        while (begin != end || node != m_root) {
            do {
                const auto succ = node->m_succTo.find(nextchr);
                if (succ != node->m_succTo.end()) {
                    node = succ->second;  // goto edge exists
                } else if (node != m_root) {
                    // Mismatch: follow the failure link and retry the same
                    // word. `continue` jumps to the loop condition, so a leaf
                    // reached through a failure link is reported as well.
                    node = node->m_failTo ? node->m_failTo : m_root;
                    continue;
                }
                begin = next.first;
                next = Trie_decoder::next(begin, end);
                nextchr = _wordTy(begin, next.first);
            } while (node->m_status != LEAF && (begin != end || node != m_root));
            if (node->m_status != LEAF) {
                continue;
            }
            const _keyItor mbegin = begin - node->depth;
            const _keyItor mend = begin;
            if (fullMatch || ret.empty()) {
                ret.push_back({ node->m_val, mbegin, mend });
            } else if (longest && ret.back().begin == mbegin) {
                ret.back().end = mend;  // same start: keep the longer key
                ret.back().val = node->m_val;
            } else if (ret.back().end <= mbegin) {
                ret.push_back({ node->m_val, mbegin, mend });  // does not overlap the last result
            }
        }
        return ret;
    }

    // Adds `str` -> `val`, replacing the value if the key already exists.
    // Invalidates the automaton; call build_aho_corasick() again afterwards.
    // An empty key is ignored: it used to overwrite the root's status.
    lctrie& insert(const _keyTy& str, const _valTy& val) {
        if (str.empty()) {
            return *this;
        }
        acIsBuild = false;
        _nodeTy* node = m_root;
        _keyItor begin = str.data();
        const _keyItor end = str.data() + str.size();
        while (begin != end) {
            const auto next = Trie_decoder::next(begin, end);
            auto nextchr = _wordTy(begin, next.first);
            begin = next.first;
            auto succ = node->m_succTo.find(nextchr);
            if (succ == node->m_succTo.end()) {  // no such branch yet: add a node
                m_allNode.emplace_front(INTER);
                _nodeTy& added = m_allNode.front();
                added.depth = static_cast<std::size_t>(begin - str.data());
                succ = node->m_succTo.emplace(std::move(nextchr), &added).first;
            }
            node = succ->second;
        }
        node->m_val = val;
        node->m_status = LEAF;
        return *this;
    }

    // Computes the failure link of every node breadth-first. Does nothing if
    // the links are already up to date.
    lctrie& build_aho_corasick() {
        if (acIsBuild) {
            return *this;
        }
        acIsBuild = true;
        std::queue<_nodeTy*> pending;
        for (const auto& edge : m_root->m_succTo) {  // depth-1 nodes fail to the root
            edge.second->m_failTo = m_root;
            pending.push(edge.second);
        }
        while (!pending.empty()) {
            _nodeTy* parent = pending.front();
            pending.pop();
            for (const auto& edge : parent->m_succTo) {
                _nodeTy* child = edge.second;
                child->m_failTo = m_root;  // used when no fallback accepts the word
                // Walk the parent's failure chain until a node accepts the word.
                for (_nodeTy* fallback = parent->m_failTo; fallback; fallback = fallback->m_failTo) {
                    const auto hit = fallback->m_succTo.find(edge.first);
                    if (hit != fallback->m_succTo.end()) {
                        child->m_failTo = hit->second;
                        break;
                    }
                }
                pending.push(child);
            }
        }
        return *this;
    }

private:
    bool acIsBuild = false;
    std::forward_list<_nodeTy> m_allNode;  // owns every node; addresses stay stable
    _nodeTy* m_root = nullptr;
};

}  // namespace chis

// --------------------------------------------------------------------------------
// /sample.cc:
// --------------------------------------------------------------------------------
// #include "ChisTrie.h"   // inlined above in this listing
using namespace std;

void wchar_test() {
    // Assumption: template arguments reconstructed from the wide literals below.
    chis::lctrie<wstring, wstring, chis::Trie_decoder_ONESTEP> trie;
    // std::locale expects a locale::category bitmask here; the C macro LC_CTYPE
    // is a different constant and only matches it on some standard libraries.
    wcout.imbue(locale(locale(), "", locale::ctype));
    {  // fill the trie
        vector<pair<wstring, wstring>> words = {
            {L"李", L"姓"},
            {L"李刚", L"爸爸"},
            {L"李天一", L"儿子"},
            {L"刚", L"爸爸的名字"},
            {L"天一", L"儿子的名字"},
            {L"刚仔", L"爸爸的昵称1"},
            {L"刚刚", L"爸爸的昵称2"},
            {L"仔", L"儿子的昵称1"},
            {L"仔仔", L"儿子的昵称2"},
            {L"天一的爸", L"李刚"},
            {L"我", L"就是我,是颜色不一样的烟火"},
            {L"的爸", L"的哥的爸爸"},
        };
        for (const auto& w : words) {
            trie.insert(w.first, w.second);
        }
        trie.build_aho_corasick();  // build the AC automaton
    }
    wstring input = L"我李天一的爸是李刚刚仔仔";
    auto res = trie.match(input);  // default: leftmost, longest match
    for (const auto& r : res) {
        wcout << input.substr(0, r.begin - input.c_str())
              << L"[" << input.substr(r.begin - input.c_str(), r.end - r.begin) << L"(" << r.val << L")] "
              << input.substr(r.end - input.c_str()) << endl;
    }
    wcout << "//////////////+Full Match//////////////" << endl;
    res = trie.match(input, true);  // every result
    for (const auto& r : res) {
        wcout << input.substr(0, r.begin - input.c_str())
              << L"[" << input.substr(r.begin - input.c_str(), r.end - r.begin) << L"(" << r.val << L")]"
              << input.substr(r.end - input.c_str()) << endl;
    }
    wcout << "//////////////-Longest Match//////////////" << endl;
    res = trie.match(input, false, false);  // non-greedy match
    for (const auto& r : res) {
        wcout << input.substr(0, r.begin - input.c_str())
              << L"[" << input.substr(r.begin - input.c_str(), r.end - r.begin) << L"(" << r.val << L")]"
              << input.substr(r.end - input.c_str()) << endl;
    }
}

void vector_test() {
    chis::lctrie<vector<string>, string, chis::Trie_decoder_ONESTEP> trie;
    {  // fill the trie
        vector<pair<vector<string>, string>> words = {
            {{"李"}, "姓"},
            {{"李", "刚"}, "爸爸"},
            {{"李", "天", "一"}, "儿子"},
            {{"刚"}, "爸爸的名字"},
            {{"天", "一"}, "儿子的名字"},
            {{"刚", "仔"}, "爸爸的昵称1"},
            {{"刚", "刚"}, "爸爸的昵称2"},
            {{"仔"}, "儿子的昵称1"},
            {{"仔", "仔"}, "儿子的昵称2"},
            {{"天", "一", "的", "爸"}, "李刚"},
            {{"我"}, "就是我,是颜色不一样的烟火"},
            {{"的", "爸"}, "的哥的爸爸"},
        };
        for (const auto& w : words) {
            trie.insert(w.first, w.second);
        }
        trie.build_aho_corasick();  // build the AC automaton
    }
    vector<string> input = {"我", "李", "天", "一", "的", "爸", "是", "李", "刚", "刚", "仔", "仔"};
    auto res = trie.match(input);  // default: leftmost, longest match
    auto tostring = [](const vector<string>& v) {
        string ret;
        for (const auto& s : v) {
            ret += s;
        }
        return ret;
    };
    // Match ranges are const string* into `input`; the casts give both ends of
    // each vector range the same pointer type.
    const string* const first = static_cast<const string*>(input.data());
    const string* const last = first + input.size();
    for (const auto& r : res) {
        cout << tostring(vector<string>(first, r.begin))
             << "[" << tostring(vector<string>(r.begin, r.end)) << "(" << r.val << ")] "
             << tostring(vector<string>(r.end, last)) << endl;
    }
    cout << "//////////////+Full Match//////////////" << endl;
    res = trie.match(input, true);  // every result
    for (const auto& r : res) {
        cout << tostring(vector<string>(first, r.begin))
             << "[" << tostring(vector<string>(r.begin, r.end)) << "(" << r.val << ")] "
             << tostring(vector<string>(r.end, last)) << endl;
    }
    cout << "//////////////-Longest Match//////////////" << endl;
    res = trie.match(input, false, false);  // non-greedy match
    for (const auto& r : res) {
        cout << tostring(vector<string>(first, r.begin))
             << "[" << tostring(vector<string>(r.begin, r.end)) << "(" << r.val << ")] "
             << tostring(vector<string>(r.end, last)) << endl;
    }
}

void norm_test() {
    // Assumption: the decoder argument was lost from the listing. UTF-8 is
    // used here; switch to chis::Trie_decoder_GBK if this file is compiled
    // with GBK-encoded literals.
    chis::lctrie<string, string, chis::Trie_decoder_UTF8> trie;
    {  // fill the trie
        vector<pair<string, string>> words = {
            {"李", "姓"},
            {"李刚", "爸爸"},
            {"李天一", "儿子"},
            {"刚", "爸爸的名字"},
            {"天一", "儿子的名字"},
            {"刚仔", "爸爸的昵称1"},
            {"刚刚", "爸爸的昵称2"},
            {"仔", "儿子的昵称1"},
            {"仔仔", "儿子的昵称2"},
            {"天一的爸", "李刚"},
            {"我", "就是我,是颜色不一样的烟火"},
            {"的爸", "的哥的爸爸"},
        };
        for (const auto& w : words) {
            trie.insert(w.first, w.second);
        }
        trie.build_aho_corasick();  // build the AC automaton
    }
    string input = "我李天一的爸是李刚刚仔仔";
    auto res = trie.match(input);  // default: leftmost, longest match
    for (const auto& r : res) {
        cout << input.substr(0, r.begin - input.c_str())
             << "[" << input.substr(r.begin - input.c_str(), r.end - r.begin) << "(" << r.val << ")] "
             << input.substr(r.end - input.c_str()) << endl;
    }
    cout << "//////////////+Full Match//////////////" << endl;
    res = trie.match(input, true);  // every result
    for (const auto& r : res) {
        cout << input.substr(0, r.begin - input.c_str())
             << "[" << input.substr(r.begin - input.c_str(), r.end - r.begin) << "(" << r.val << ")]"
             << input.substr(r.end - input.c_str()) << endl;
    }
    cout << "//////////////-Longest Match//////////////" << endl;
    res = trie.match(input, false, false);  // non-greedy match
    for (const auto& r : res) {
        cout << input.substr(0, r.begin - input.c_str())
             << "[" << input.substr(r.begin - input.c_str(), r.end - r.begin) << "(" << r.val << ")]"
             << input.substr(r.end - input.c_str()) << endl;
    }
}

// NOTE(review): this writes to both cout and wcout; whether narrow and wide
// output can be mixed on the same stream is platform-dependent — confirm on
// the target platform.
int main() {
    norm_test();
    cout << "///////////////// wstring ////////////////" << endl;
    wchar_test();
    cout << "///////////////// vector ////////////////" << endl;
    vector_test();
    return 0;
}