├── rankselect ├── rank_select.numbers └── rankselect.md ├── samples └── src │ ├── text_to_bin.cpp │ ├── dawg_map_to_text.cpp │ ├── simple_unzip.cpp │ ├── dawg_mmap_to_text.cpp │ ├── for_each_suffix.cpp │ ├── dawg_mmap_build.cpp │ ├── word2idx.cpp │ ├── dawg_map_build.cpp │ ├── idx2word.cpp │ ├── text_match_key.cpp │ ├── match_pinyin.cpp │ ├── match_dawg.cpp │ ├── regex_allmatch.cpp │ ├── step_key.cpp │ ├── regex_submatch.cpp │ ├── fsa_extract_by_iter.cpp │ ├── ac_scan.cpp │ ├── match_key.cpp │ └── regex_maxmatch.cpp ├── tools ├── bin │ ├── fplcat.exe.md │ ├── zbs_unzip.exe.md │ ├── ac_build.exe.md │ ├── pinyin_build.exe.md │ ├── zbs_build.exe.md │ ├── adfa_build.exe.md │ ├── nlt_build.exe.md │ └── regex_build.exe.md └── tools.md ├── README.md └── LICENSE /rankselect/rank_select.numbers: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/krareT/terark-wiki-zh_cn/HEAD/rankselect/rank_select.numbers -------------------------------------------------------------------------------- /samples/src/text_to_bin.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | int main() { 5 | terark::LineBuf line; 6 | while (line.getline(stdin) > 0) { 7 | line.chomp(); 8 | int32_t len = line.n; 9 | fwrite(&len, 4, 1, stdout); 10 | fwrite(line.p, 1, line.n, stdout); 11 | } 12 | return 0; 13 | } 14 | -------------------------------------------------------------------------------- /samples/src/dawg_map_to_text.cpp: -------------------------------------------------------------------------------- 1 | #define _SCL_SECURE_NO_WARNINGS 2 | #include 3 | #include 4 | 5 | int main(int argc, char* argv[]) { 6 | terark::DAWG_Map dm; 7 | if (argc < 2) { 8 | fprintf(stderr, "usage: %s dawg_map_file\n", argv[0]); 9 | return 3; 10 | } 11 | dm.load_map(argv[1]); 12 | for (size_t i = 0; i < dm.size(); ++i) { 13 | printf("%s\t%d\n", dm.key(i).c_str(), dm.val(i)); 14 | } 15 | 
return 0; 16 | } 17 | 18 | -------------------------------------------------------------------------------- /samples/src/simple_unzip.cpp: -------------------------------------------------------------------------------- 1 | #define _SCL_SECURE_NO_WARNINGS 2 | #include 3 | #include 4 | 5 | using namespace terark; 6 | 7 | int main(int argc, char* argv[]) { 8 | if (argc < 2) { 9 | fprintf(stderr, "Usage: %s DFA-File\n", argv[0]); 10 | return 1; 11 | } 12 | std::unique_ptr dfa(BaseDFA::load_from(argv[1])); 13 | if (AcyclicPathDFA* adfa = dynamic_cast(dfa.get())) { 14 | adfa->print_output(stdout); 15 | } else { 16 | fprintf(stderr, "file: %s is not an AcyclicPathDFA\n", argv[1]); 17 | } 18 | return 0; 19 | } 20 | 21 | -------------------------------------------------------------------------------- /samples/src/dawg_mmap_to_text.cpp: -------------------------------------------------------------------------------- 1 | #define _SCL_SECURE_NO_WARNINGS 2 | #include 3 | #include 4 | 5 | using namespace terark; 6 | 7 | int main(int argc, char* argv[]) { 8 | if (argc < 3) { 9 | fprintf(stderr, "usage %s key_file val_file\n", argv[0]); 10 | return 3; 11 | } 12 | const char* key_file = argv[1]; 13 | const char* val_file = argv[2]; 14 | DAWG_Map dm(BaseDFA::load_mmap(key_file)); 15 | dm.load_mmap_values(val_file); 16 | for (size_t i = 0; i < dm.size(); ++i) { 17 | printf("%s\t%d\n", dm.key(i).c_str(), dm.val(i)); 18 | } 19 | return 0; 20 | } 21 | 22 | -------------------------------------------------------------------------------- /tools/bin/fplcat.exe.md: -------------------------------------------------------------------------------- 1 | # fplcat.exe 2 | 3 | 该程序位于 [下载 package](http://terark.com/zh/download/tools/latest) 中的 `bin` 目录,要运行该程序,您需要将 [下载 package](http://terark.com/zh/download/tools/latest) 中的 `lib` 目录加入环境变量 `LD_LIBRARY_PATH` 。 4 | 5 | ## Usage Help 6 | ```text 7 | Usage: fplcat.exe Options [Input-File-List] 8 | Options: 9 | -h Show this help information 10 | -o 
Output-File 11 | -v Show verbose info 12 | -k Save file name as key, default DO NOT save file name 13 | If Input-File-List is empty, read file names from stdin 14 | ``` 15 | 16 | ## Package many many files and compress by zbs_build.exe 17 | ```bash 18 | find some/dir | fplcat.exe -o package.pkg 19 | zbs_build.exe -B -S 0.03 -o package.zbs package.pkg 20 | ``` 21 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Terark Wiki 中文文档 2 | 3 | [English](https://github.com/Terark/terark-wiki-en) 4 | 5 | ## 概述 6 | 7 | 本文档主要对 Terark 公司的相关产品、技术进行综合介绍,包括一些工具集,关键技术点等。 8 | 9 | 关于每个产品的详细说明和测试报告,请参考每个产品本身的 Github Wiki 10 | 11 | ## 目录 12 | - 产品列表 13 | - 最核心的 [TerarkDB](https://github.com/Terark/terarkdb/wiki):基于 RocksDB,使用 Terark 核心技术的存储引擎 14 | - [Mongo on TerarkDB](https://github.com/Terark/mongo-on-terarkdb/wiki):基于 TerarkDB 存储引擎的 MongoDB 版本,通过 MongoRocks 实现 15 | - [MySQL on TerarkDB](https://github.com/Terark/mysql-on-terarkdb/wiki):基于 TerarkDB 存储引擎的 MySQL 版本,通过 MyRocks 实现 16 | - [Terark 核心工具集](tools/tools.md) 17 | - 使用 Terark 的核心算法生成的各类工具 18 | - [Rank-Select](rankselect/rankselect.md) 19 | - Rank-Select 是 Succinct Data Structures 的算法基础,我们自己实现了一套高性能的 Rank-Select 20 | -------------------------------------------------------------------------------- /tools/bin/zbs_unzip.exe.md: -------------------------------------------------------------------------------- 1 | # zbs\_unzip.exe 2 | 3 | 该程序位于 [下载 package](http://terark.com/zh/download/tools/latest) 中的 `bin` 目录,要运行该程序,您需要将 [下载 package](http://terark.com/zh/download/tools/latest) 中的 `lib` 目录加入环境变量 `LD_LIBRARY_PATH` 。 4 | 5 | ## Links 6 | 7 | TODO: 8 | 9 | ## Usage Help 10 | ```text 11 | Usage: zbs_unzip.exe Options Input-BlobStore-File [recId1 recId2 ...] 12 | Synopsis: 13 | If recId1 recId2 ... 
are provided, just unzip/extract the specified records 14 | Record id is 0-base, so the min record id is 0, not 1 15 | Options: 16 | -h Show this help information 17 | -t Show timing and through put 18 | -p MMAP_POPULATE(linux) or FileMapping prefetch(windows) 19 | -r Unzip in random order 20 | -b Bench mark, this will not output unzipped data 21 | -B Output as binary, do not append newline for each record 22 | -T thread num, when benchmark, use multi thread 23 | ``` 24 | -------------------------------------------------------------------------------- /samples/src/for_each_suffix.cpp: -------------------------------------------------------------------------------- 1 | #define _SCL_SECURE_NO_WARNINGS 2 | #include 3 | #include 4 | #include 5 | 6 | using namespace terark; 7 | 8 | static void on_suffix(size_t nth, fstring suffix) { 9 | printf("%06zd\t%.*s\n", nth, suffix.ilen(), suffix.data()); 10 | } 11 | 12 | int main(int argc, char* argv[]) { 13 | const char* ifile = NULL; // input dfa file name 14 | for (int opt=0; (opt = getopt(argc, argv, "i:")) != -1; ) { 15 | switch (opt) { 16 | case '?': return 3; 17 | case 'i': ifile = optarg; break; 18 | } 19 | } 20 | std::unique_ptr dfa; 21 | if (ifile) dfa.reset(MatchingDFA::load_from(ifile)); // by filename 22 | else dfa.reset(MatchingDFA::load_from(stdin)); // by FILE* 23 | 24 | for(int i = optind; i < argc; ++i) { 25 | const char* exact_prefix = argv[i]; 26 | printf("%s ----------\n", exact_prefix); 27 | dfa->for_each_suffix(exact_prefix, on_suffix); 28 | } 29 | return 0; 30 | } 31 | 32 | -------------------------------------------------------------------------------- /samples/src/dawg_mmap_build.cpp: -------------------------------------------------------------------------------- 1 | #define _SCL_SECURE_NO_WARNINGS 2 | #include 3 | #include 4 | #include 5 | 6 | using namespace terark; 7 | 8 | struct ParseKeyInt { 9 | void operator()(fstring line, fstring* key, int* val) { 10 | line.split('\t', &F); 11 | *key = F[0]; 12 
| *val = terark::lcast(F[1]); 13 | } 14 | valvec F; 15 | }; 16 | 17 | int main(int argc, char* argv[]) { 18 | if (argc < 4) { 19 | fprintf(stderr, "usage: %s text_file dawg_dfa_file mmap_value_file\n", argv[0]); 20 | return 3; 21 | } 22 | if (0) { 23 | terark::FileStream text_file(argv[1], "r"); 24 | DAWG_Map dm(BaseDFA::load_mmap(argv[2])); 25 | dm.patch_values(text_file, ParseKeyInt()); 26 | dm.save_mmap_values(argv[3]); 27 | } else { 28 | // 通常情况下,使用这种方式创建 DAWG_Map 更简洁 29 | // this is the short cut 30 | DAWG_Map dm; 31 | dm.complete_dawg_map_mmap(argv[1], argv[2], argv[3], ParseKeyInt()); 32 | } 33 | return 0; 34 | } 35 | 36 | -------------------------------------------------------------------------------- /samples/src/word2idx.cpp: -------------------------------------------------------------------------------- 1 | #define _SCL_SECURE_NO_WARNINGS 2 | #include 3 | #include 4 | 5 | using namespace terark; 6 | 7 | int main(int argc, char* argv[]) { 8 | const char* ifile = NULL; 9 | for (int opt=0; (opt = getopt(argc, argv, "i:")) != -1; ) { 10 | switch (opt) { 11 | case '?': return 3; 12 | case 'i': ifile = optarg; break; 13 | } 14 | } 15 | std::unique_ptr dfa; 16 | if (ifile) dfa.reset(BaseDFA::load_from(ifile)); // by filename 17 | else dfa.reset(BaseDFA::load_from(stdin)); // by FILE* 18 | const BaseDAWG* dawg = dfa->get_dawg(); 19 | if (NULL == dawg) { 20 | fprintf(stderr, "file \"%s\" is not a dawg\n", ifile?ifile:"stdin"); 21 | return 1; 22 | } 23 | for (int i = optind; i < argc; ++i) { 24 | const char* word = argv[i]; 25 | size_t idx = dawg->index(word); 26 | if (size_t(-1) == idx) 27 | printf("NOT FOUND %s\n", word); 28 | else 29 | printf("%08zd %s\n", idx, word); 30 | } 31 | return 0; 32 | } 33 | 34 | -------------------------------------------------------------------------------- /rankselect/rankselect.md: -------------------------------------------------------------------------------- 1 | # Rank Select 2 | 3 | [返回首页](../README.md) 4 | 5 | ## Rank Select 
性能比较 6 | 7 | 我们的核心产品使用了 Succinct 数据结构,Succinct 数据结构的核心是 Rank Select,所以 Rank Select 的性能至关重要。 8 | 为了保证性能,我们自己实现了 Rank Select,比起最流行的开源实现([sdsl-lite](https://github.com/simongog/sdsl-lite)),我们的性能有巨大优势,以下是性能测试结果: 9 | 10 | ### 测试机器配置 11 | 12 | |CPU | | 13 | |------|------| 14 | |CPU 数量 | 2 15 | |CPU 型号 | Xeon E5-2630 v3 16 | |CPU 标称频率|2.4 GHz 17 | |CPU 实际运行频率|2.6 GHz 18 | |CPU 核心数/线程数(单颗)|8核16线程 19 | |CPU 核心数/线程数(总数)|16核32线程 20 | |CPU 缓存尺寸|20M 21 | |CPU bogomips
(反应cpu真实速度)|4793 22 | 23 | |内存 | | 24 | |------|------| 25 | |内存容量|64GB 26 | |内存频率|DDR4 1866 MHz 27 | 28 | ### 测试结果 29 | 30 | 纵坐标是单个操作的耗时,单位是**纳秒**,横坐标是各种 rank select 实现,`sdsl_v_mcl` 是 [sdsl-lite](https://github.com/simongog/sdsl-lite) 中最快的 rank select 实现。其余的是我们自己的不同实现,包含 `_fast` 后缀的是加了一点额外优化的 inline 函数。 31 | 32 | 从图中可以看到,内存墙(4G vs 32K bits)是最大的性能瓶颈。 33 | 34 | 图中文字显示有点问题,比较拥挤,最好点击图片,在浏览器新窗口中打开,会正常显示。 35 | 36 | ![rank_select](https://cdn.rawgit.com/terark/terark-wiki-zh_cn/master/rankselect/rank_select.svg) 37 | -------------------------------------------------------------------------------- /samples/src/dawg_map_build.cpp: -------------------------------------------------------------------------------- 1 | #define _SCL_SECURE_NO_WARNINGS 2 | #include 3 | #include 4 | #include 5 | 6 | using namespace terark; 7 | 8 | struct ParseKeyInt { 9 | void operator()(fstring line, fstring* key, int* val) { 10 | line.split('\t', &F); 11 | *key = F[0]; 12 | *val = terark::lcast(F[1]); 13 | } 14 | valvec F; 15 | }; 16 | 17 | int main(int argc, char* argv[]) { 18 | if (argc < 3) { 19 | fprintf(stderr, "usage: %s text_file dawg_dfa_file [binary_out_file]\n", argv[0]); 20 | return 3; 21 | } 22 | if (4 == argc) { 23 | terark::FileStream text_file(argv[1], "r"); 24 | DAWG_Map dm(BaseDFA::load_from(argv[2])); 25 | dm.patch_values(text_file, ParseKeyInt()); 26 | dm.save_map(argv[3]); 27 | } else { 28 | // 通常情况下,使用这种方式创建 DAWG_Map 更简洁 29 | // this is the short cut, append values to dawg binary file 30 | // 31 | // if dawg file already has values, do overwrite 32 | // bool overwriteValues = true; // this param is defaulted as true 33 | DAWG_Map dm; 34 | dm.complete_dawg_map(argv[1], argv[2], ParseKeyInt() /*, overwriteValues*/); 35 | } 36 | return 0; 37 | } 38 | 39 | -------------------------------------------------------------------------------- /samples/src/idx2word.cpp: -------------------------------------------------------------------------------- 1 | #define _SCL_SECURE_NO_WARNINGS 2 
| #include 3 | #include 4 | 5 | using namespace terark; 6 | 7 | int main(int argc, char* argv[]) { 8 | const char* ifile = NULL; 9 | for (int opt=0; (opt = getopt(argc, argv, "i:")) != -1; ) { 10 | switch (opt) { 11 | case '?': return 3; 12 | case 'i': ifile = optarg; break; 13 | } 14 | } 15 | std::unique_ptr dfa; 16 | if (ifile) dfa.reset(BaseDFA::load_from(ifile)); // by filename 17 | else dfa.reset(BaseDFA::load_from(stdin)); // by FILE* 18 | const BaseDAWG* dawg = dfa->get_dawg(); 19 | if (NULL == dawg) { 20 | fprintf(stderr, "file \"%s\" is not a dawg\n", ifile?ifile:"stdin"); 21 | return 1; 22 | } 23 | size_t num_words = dawg->num_words(); 24 | for (int i = optind; i < argc; ++i) { 25 | const char* szidx = argv[i]; 26 | size_t idx = strtoul(szidx, NULL, 10); 27 | typedef long long ll; 28 | if (idx >= dawg->num_words()) { 29 | fprintf(stderr, "idx=%lld >= num_words=%lld\n", (ll)idx, (ll)num_words); 30 | } else { 31 | // dawg->nth_word(idx) will throw std::out_of_range for larger idx 32 | std::string word = dawg->nth_word(idx); 33 | printf("%08lld %s\n", (ll)idx, word.c_str()); 34 | } 35 | } 36 | return 0; 37 | } 38 | 39 | -------------------------------------------------------------------------------- /samples/src/text_match_key.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | // A simple match_key/match_key_l example 4 | int main(int argc, char* argv[]) { 5 | const char* ifile = NULL; // input dfa file name 6 | bool longest_match = false; 7 | for (int opt=0; (opt = getopt(argc, argv, "i:l")) != -1; ) { 8 | switch (opt) { 9 | case '?': return 3; 10 | case 'i': ifile = optarg; break; 11 | case 'l': longest_match = true; break; 12 | } 13 | } 14 | using namespace terark; 15 | std::unique_ptr dfa; 16 | if (ifile) dfa.reset(MatchingDFA::load_from(ifile)); // by filename 17 | else dfa.reset(MatchingDFA::load_from(stdin)); // by FILE* 18 | for(int i = optind; i < argc; ++i) { 19 | const char* text = 
argv[i]; 20 | int keylen = 0; 21 | int len; ///< max_partial_match_len, could be ignored 22 | auto on_match = [&](int klen, int idx, fstring value) { 23 | printf("%-20.*s idx=%08d val=%.*s\n" 24 | , klen, text, idx, value.ilen(), value.data()); 25 | keylen = klen; 26 | }; 27 | printf("----text=%s\n", text); 28 | if (longest_match) 29 | len = dfa->match_key_l(text, on_match); 30 | else 31 | len = dfa->match_key(text, on_match); 32 | if (keylen != len) 33 | printf("max_partial_match_len=%d: %.*s\n", len, len, text); 34 | } 35 | return 0; 36 | } 37 | 38 | -------------------------------------------------------------------------------- /samples/src/match_pinyin.cpp: -------------------------------------------------------------------------------- 1 | #define _SCL_SECURE_NO_WARNINGS 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | using namespace terark; 9 | 10 | bool be_quiet = false; 11 | void OnMatch(int klen, int, fstring value) { 12 | if (!be_quiet) 13 | printf("%d\t%.*s\n", klen, value.ilen(), value.data()); 14 | } 15 | int main(int argc, char* argv[]) { 16 | const char* base_pinyin_txt_file = NULL; 17 | const char* word_pinyin_dfa_file = NULL; 18 | for (int opt=0; (opt = getopt(argc, argv, "p:qw:")) != -1; ) { 19 | switch (opt) { 20 | case 'q': be_quiet = true; break; 21 | case 'p': base_pinyin_txt_file = optarg; break; 22 | case 'w': word_pinyin_dfa_file = optarg; break; 23 | } 24 | } 25 | if (!base_pinyin_txt_file || !word_pinyin_dfa_file) { 26 | fprintf(stderr, 27 | "usage: %s -w word_pinyin_dfa_file -p base_pinyin_text_file < input\n", 28 | argv[0]); 29 | return 1; 30 | } 31 | MatchingDFAPtr dfa(MatchingDFA_load(word_pinyin_dfa_file)); 32 | fprintf(stderr, "loaded dfa: %s\n", word_pinyin_dfa_file); 33 | PinYinDFA_Builder pinyin_dfa_builder(base_pinyin_txt_file); 34 | fprintf(stderr, "built pinyin_dfa_builder: %s\n", base_pinyin_txt_file); 35 | terark::LineBuf line; 36 | terark::profiling pf; 37 | while (line.getline(stdin) > 0) { 38 | 
line.chomp(); 39 | printf("input: %s\n", line.p); 40 | long long t2 = pf.now(); 41 | MatchingDFAPtr pinyin_dfa(pinyin_dfa_builder.make_pinyin_dfa(line, NULL)); 42 | long long t3 = pf.now(); 43 | if (pinyin_dfa.get() != NULL) { 44 | PinYinDFA_Builder::match_pinyin(dfa.get(), pinyin_dfa.get(), &OnMatch); 45 | long long t4 = pf.now(); 46 | printf("time: make_dfa=%f'us match=%f'us\n", pf.uf(t2,t3), pf.uf(t3,t4)); 47 | } else { 48 | fprintf(stderr, "make_pinyin_dfa failed: %s\n", line.p); 49 | } 50 | } 51 | return 0; 52 | } 53 | 54 | -------------------------------------------------------------------------------- /samples/src/match_dawg.cpp: -------------------------------------------------------------------------------- 1 | #define _SCL_SECURE_NO_WARNINGS 2 | #include 3 | #include 4 | 5 | using namespace terark; 6 | 7 | const char* func = "match_dawg"; 8 | struct OnMatch { 9 | void operator()(int len, int idx) { 10 | printf("%s: len=%d idx=%d word=%.*s\n", func, len, idx, len, word); 11 | max_word_len = len; 12 | } 13 | const char* word; 14 | int max_word_len; 15 | }; 16 | 17 | int main(int argc, char* argv[]) { 18 | const char* ifile = NULL; 19 | int longest_match = 0; 20 | for (int opt=0; (opt = getopt(argc, argv, "i:lL")) != -1; ) { 21 | switch (opt) { 22 | case '?': return 3; 23 | case 'i': ifile = optarg; break; 24 | case 'l': longest_match = 1; break; // same proto as match_dawg 25 | case 'L': longest_match = 2; break; // easy proto of match_dawg_l 26 | } 27 | } 28 | std::unique_ptr dfa; 29 | if (ifile) dfa.reset(BaseDFA::load_from(ifile)); // by filename 30 | else dfa.reset(BaseDFA::load_from(stdin)); // by FILE* 31 | const BaseDAWG* dawg = dfa->get_dawg(); 32 | if (NULL == dawg) { 33 | fprintf(stderr, "file \"%s\" is not a dawg\n", ifile?ifile:"stdin"); 34 | return 1; 35 | } 36 | if (longest_match) func = "match_dawg_l"; 37 | for(int i = 1; i < argc; ++i) { 38 | if (longest_match == 2) { // use easy match_dawg_l 39 | fstring word = argv[i]; 40 | size_t len, 
idx; 41 | if (dawg->match_dawg_l(word, &len, &idx)) { 42 | printf("match_dawg_l: len=%zd idx=%zd word=%.*s\n", len, idx, int(len), word.p); 43 | } 44 | } else { 45 | OnMatch on_match; 46 | on_match.word = argv[i]; 47 | int len; ///< max_partial_match_len, could be ignored 48 | if (longest_match) // same proto as match_dawg 49 | len = dawg->match_dawg_l(on_match.word, ref(on_match)); 50 | else 51 | len = dawg->match_dawg(on_match.word, ref(on_match)); 52 | if (len != on_match.max_word_len) { 53 | printf("%s: max_partial_match_len=%d: %.*s\n", func, len, len, on_match.word); 54 | } 55 | } 56 | } 57 | return 0; 58 | } 59 | 60 | -------------------------------------------------------------------------------- /samples/src/regex_allmatch.cpp: -------------------------------------------------------------------------------- 1 | #define _CRT_SECURE_NO_WARNINGS 2 | #define _SCL_SECURE_NO_WARNINGS 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #if defined(__DARWIN_C_ANSI) 10 | #define malloc_stats() (void)(0) 11 | #else 12 | #include 13 | #endif 14 | 15 | #ifdef _MSC_VER 16 | #define malloc_stats() (void)(0) 17 | #endif 18 | 19 | using namespace terark; 20 | 21 | int main(int argc, char* argv[]) { 22 | bool verbose = false; 23 | MultiRegexMatchOptions mrOpt; 24 | for (int opt=0; (opt = getopt(argc, argv, "D:i:v")) != -1; ) { 25 | switch (opt) { 26 | case '?': return 1; 27 | case 'D': mrOpt.enableDynamicDFA = atoi(optarg) != 0; break; 28 | case 'i': mrOpt.dfaFilePath = optarg; break; 29 | case 'v': verbose = true; break; 30 | } 31 | } 32 | if (mrOpt.dfaFilePath.empty()) { 33 | fprintf(stderr, "usage: -i dfa_file must be provided!\n"); 34 | return 1; 35 | } 36 | mrOpt.load_dfa(); 37 | std::unique_ptr 38 | all(MultiRegexFullMatch::create(mrOpt)); 39 | terark::profiling pf; 40 | long long ts = pf.now(); 41 | all->warm_up(); 42 | long long t0 = pf.now(); 43 | long lineno = 0; 44 | long matched = 0; 45 | long sumlen = 0; 46 | terark::LineBuf line; 47 | 
while (line.getline(stdin) > 0) { 48 | lineno++; 49 | line.chomp(); 50 | all->match_all(fstring(line), ::tolower); 51 | if (all->size()) { 52 | if (verbose) { 53 | printf("line:%ld:", lineno); 54 | for(int regexId : *all) { 55 | printf(" %d", regexId); 56 | } 57 | printf("\n"); 58 | } 59 | matched++; 60 | sumlen += line.size(); 61 | } 62 | } 63 | long long t1 = pf.now(); 64 | printf("time(warm_up)=%f's\n", pf.sf(ts, t0)); 65 | printf("time=%f's lines=%ld matched=%ld QPS=%f Throughput=%f'MiB Latency=%f'us\n" 66 | , pf.sf(t0,t1) 67 | , lineno 68 | , matched 69 | , lineno/pf.sf(t0,t1) 70 | , sumlen/pf.uf(t0,t1) 71 | , pf.uf(t0,t1)/lineno 72 | ); 73 | malloc_stats(); 74 | return 0; 75 | } 76 | 77 | -------------------------------------------------------------------------------- /samples/src/step_key.cpp: -------------------------------------------------------------------------------- 1 | #define _SCL_SECURE_NO_WARNINGS 2 | #include 3 | #include 4 | 5 | using namespace terark; 6 | 7 | int delim = '\t'; 8 | 9 | struct OnWord { 10 | void operator()(int idx, fstring value) { 11 | if (strnlen(value.p, value.n) < value.size() || 256 == delim) { 12 | // value is binary 13 | printf("%-20.*s idx=%08d bin=", pos, text, idx); 14 | for (int i = 0; i < value.n; ++i) 15 | printf("%02X", (byte_t)value.p[i]); 16 | printf("\n"); 17 | } 18 | else { // value is text 19 | printf("%-20.*s idx=%08d val=%.*s\n" 20 | , pos, text, idx, value.ilen(), value.data()); 21 | } 22 | } 23 | int pos; 24 | const char* text; 25 | }; 26 | 27 | int main(int argc, char* argv[]) { 28 | const char* ifile = NULL; // input dfa file name 29 | bool longest_match = false; 30 | for (int opt=0; (opt = getopt(argc, argv, "d::i:l")) != -1; ) { 31 | switch (opt) { 32 | case '?': return 3; 33 | case 'd': 34 | if (optarg) 35 | delim = optarg[0]; 36 | else // no arg for -d, set delim for binary key-val match 37 | delim = 256; // dfa built by kvbin_build use 256 as delim 38 | break; 39 | case 'i': ifile = optarg; break; 40 
| case 'l': longest_match = true; break; 41 | } 42 | } 43 | std::unique_ptr dfa; 44 | if (ifile) dfa.reset(MatchingDFA::load_from(ifile)); // by filename 45 | else dfa.reset(MatchingDFA::load_from(stdin)); // by FILE* 46 | 47 | printf("delim=%c(0x%02X)\n", delim, delim); 48 | for(int i = optind; i < argc; ++i) { 49 | const char* text = argv[i]; 50 | printf("%s ----------\n", text); 51 | bool ok; 52 | MatchContext ctx; 53 | fstring ftext(text); 54 | if (longest_match) { 55 | ok = dfa->step_key_l(ctx, delim, ftext); 56 | } else { 57 | ok = dfa->step_key(ctx, delim, ftext); 58 | } 59 | printf("%s.ret=%d strlen=%zd ctx[pos=%zd zidx=%zd]: %.*s\n", 60 | longest_match ? "step_key_l" : "step_key", 61 | ok, ftext.size(), ctx.pos, ctx.zidx, (int)ctx.pos, text); 62 | if (ok) { 63 | OnWord on; 64 | on.text = text; 65 | on.pos = int(ctx.pos); 66 | dfa->for_each_word(ctx.root, ctx.zidx, ref(on)); 67 | } 68 | } 69 | return 0; 70 | } 71 | 72 | -------------------------------------------------------------------------------- /samples/src/regex_submatch.cpp: -------------------------------------------------------------------------------- 1 | #define _SCL_SECURE_NO_WARNINGS 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | using namespace terark; 9 | 10 | void match_and_print(MultiRegexSubmatch& sub, fstring text) { 11 | printf("%s ----------\n", text.p); 12 | int max_match_len = sub.match_utf8(text, ::tolower); 13 | for(int j = 0; j < (int)sub.fullmatch_regex().size(); ++j) { 14 | int regex_id = sub.fullmatch_regex()[j]; 15 | int nsub = sub.num_submatch(regex_id); 16 | for(int k = 0; k < nsub; ++k) { 17 | fstring str = sub(text.p, regex_id, k); 18 | printf("j=%d regex_id=%d sub(%d): %.*s\n", j, regex_id, k, str.ilen(), str.data()); 19 | } 20 | } 21 | printf("max_match_len=%d: %.*s\n", max_match_len, max_match_len, text.p); 22 | } 23 | 24 | int main(int argc, char* argv[]) { 25 | MultiRegexMatchOptions mrOpt; 26 | bool verbose = false; 27 | for (int opt=0; (opt 
= getopt(argc, argv, "m:D:i:v")) != -1; ) { 28 | switch (opt) { 29 | case '?': return 1; 30 | case 'D': mrOpt.enableDynamicDFA = atoi(optarg) != 0; break; 31 | case 'm': mrOpt.regexMetaFilePath = optarg; break; 32 | case 'i': mrOpt.dfaFilePath = optarg; break; 33 | case 'v': verbose = true; break; 34 | } 35 | } 36 | if (mrOpt.regexMetaFilePath.empty()) { 37 | fprintf(stderr, "-m bin_meta_file is required\n"); 38 | return 1; 39 | } 40 | if (mrOpt.dfaFilePath.empty()) { 41 | fprintf(stderr, "-i dfa_file is required\n"); 42 | return 1; 43 | } 44 | mrOpt.load_dfa(); 45 | std::unique_ptr 46 | sub(MultiRegexSubmatch::create(mrOpt)); 47 | terark::LineBuf line; 48 | terark::profiling pf; 49 | long t0 = pf.now(); 50 | long lineno = 0; 51 | while (line.getline(stdin) > 0) { 52 | lineno++; 53 | line.chomp(); 54 | if (verbose) 55 | match_and_print(*sub, fstring(line.p, line.n)); 56 | else 57 | sub->match_utf8(fstring(line.p, line.n), ::tolower); 58 | } 59 | long t1 = pf.now(); 60 | printf("time=%f's QPS=%f Latency=%f'us\n" 61 | , pf.sf(t0,t1) 62 | , lineno/pf.sf(t0,t1) 63 | , pf.uf(t0,t1)/lineno 64 | ); 65 | return 0; 66 | } 67 | 68 | -------------------------------------------------------------------------------- /tools/tools.md: -------------------------------------------------------------------------------- 1 | # Terark核心工具集 2 | 3 | [返回首页](../README.md) 4 | 5 | ## terark 核心工具集 6 | 7 | [terark 核心工具集下载](http://www.terark.com/download/tools/latest),在该下载列表中,核心工具集的名字是 8 | - terark-fsa_all-Linux-x86_64-g++-**VERSION**-bmi2-0.tgz : 能在较旧的机器上使用 9 | - terark-fsa_all-Linux-x86_64-g++-**VERSION**-bmi2-1.tgz : 只能在 intel-haswell 或更新的机器上使用 10 | 11 | 请根据自己的机器,和gcc版本,下载相应的版本。 12 | 13 | Terark 核心工具集package解压后的目录结构:
14 | root = pkg/terark-fsa_all-Linux-x86_64-g++-**VERSION**-bmi2-**X**
15 | 16 | |目录|说明| 17 | --------|---------| 18 | root/bin| **核心命令行工具** | 19 | root/lib| 动态库 | 20 | root/include| 暂时用不上 | 21 | root/samples| | 22 | root/samples/bin| 有一些示例和性能测试工具| 23 | root/samples/src| 示例代码 | 24 | 25 | ### 核心命令行工具 26 | 为了便于编译和各平台统一,即使在 Linux/Mac 中,这些命令行工具也包含 .exe 后缀名。 27 | 28 | |名称|功能描述 | 29 | -----|-----| 30 | [nlt_build.exe](bin/nlt_build.exe.md)|Terark 嵌套Trie树创建(针对 Key),压缩后的文件可以通过 Terark 的专用 API 加载,并进行搜索
[terark-zip-rocksdb](https://github.com/Terark/terark-zip-rocksdb) 的索引(key)使用了该算法
该算法实践了 Terark 提出的 **CO-Index** (**C**ompressed **O**rdered **Index**) 概念| 31 | [zbs_build.exe](bin/zbs_build.exe.md)|Terark 数据库全局压缩(针对 Value ),压缩后的文件可以通过 Terark 的专用 API 加载,并按记录ID随机提取每条记录。[terark-zip-rocksdb](https://github.com/Terark/terark-zip-rocksdb) 的 value 压缩使用了该算法
该算法实践了 Terark 提出的 **PA-Zip**(**P**oint **A**ccessible **Zip**) 概念| 32 | [zbs_unzip.exe](bin/zbs_unzip.exe.md)|全部解压(或定点提取)由 [zbs_build.exe](bin/zbs_build.exe.md) 压缩的数据,也可用作 benchmark| 33 | [fplcat.exe](bin/fplcat.exe.md)|将多个文件打包在一起,以便使用 [zbs_build.exe](bin/zbs_build.exe.md) 进行压缩,打包出来的文件传递给 [zbs_build.exe](bin/zbs_build.exe.md) 时,需要指定 -B 参数| 34 | |[adfa_build.exe](bin/adfa_build.exe.md)|从输入的 Key 集合创建 ADFA(Acyclic DFA: 无环DFA),输入文本文件,每行一个 Key,
生成的 DFA 可以进行 Key 匹配(全匹配、前缀匹配),也可以通过[特殊的方式实现 Map 功能](http://nark.cc/p/?p=172)| 35 | [ac_build.exe](bin/ac_build.exe.md)|从文本形式的 Pattern 集合创建 AC 自动机,创建出来的 AC 自动机文件可以通过过 Terark 的专用 API 加载,并调用各种匹配函数
输入的 Pattern 文件中,每行一个 Pattern,每个 Pattern 作为精确字符串匹配,单线程匹配性能可达每秒数百MB甚至上GB| 36 | [regex_build.exe](bin/regex_build.exe.md)|从正则表达式集合创建多正则自动机,创建出来的多正则自动机文件可以通过过 Terark 的专用 API 加载,并调用各种匹配函数
因为是多正则匹配,功能比 AC 自动机要强得多,但匹配性能比 AC 自动机要低一些| 37 | [pinyin_build.exe](bin/pinyin_build.exe.md)|创建“根据拼音对汉字短语纠错”的自动机,demo 见 [nark.cc](http://nark.cc)
生成的自动机可以通过 Terark 的专用 API 加载,并执行搜索和纠错| 38 | 39 | 40 | 其他命令行工具使用说明,可参见各命令的 usage help,详细说明我们会逐渐在该文档中完善。 41 | -------------------------------------------------------------------------------- /tools/bin/ac_build.exe.md: -------------------------------------------------------------------------------- 1 | # ac\_build.exe 2 | 3 | 该程序位于 [下载 package](http://terark.com/zh/download/tools/latest) 中的 `bin` 目录,要运行该程序,您需要将 [下载 package](http://terark.com/zh/download/tools/latest) 中的 `lib` 目录加入环境变量 `LD_LIBRARY_PATH` 。 4 | 5 | ## Links 6 | * [AC 自动机的实现](http://nark.cc/p/?p=1453) 7 | * [发掘双数组 Trie (Double Array Trie) 的能力](http://nark.cc/p/?p=1480) 8 | 9 | ## Usage Help 10 | 11 | ```text 12 | Usage: 13 | rls/ac_build.exe Options [Input-Pattern-File] 14 | 15 | Description: 16 | Build AC Automaton from Input-Pattern-File, if Input-Pattern-File is 17 | omitted, use stdin 18 | 19 | Options: 20 | -O AC-Automata-File : Using general dfa trie as base trie 21 | -d AC-Automata-File : Using DoubleArray trie as base trie 22 | BaseAC::restore_word(state, word_id) can be used for 23 | DoubleArray trie based AC-Automata when "-e 1" was used. 24 | -F Full-AC-DFA-File 25 | Patch all fail-and-retry-success link physically, this will produce 26 | a physical DFA for regex ".*(P1|P2|...|Pn)", a simple stupid state 27 | transition table for this DFA will be very large. To reduce memory usage, 28 | a state compression schema is used. Since it is not needed to track the 29 | fail link to reach a target state, it should be much faster, but in real 30 | world, this is much slower, maybe caused by poor CPU cache hit rate and 31 | an additional indirection incured by state compression. 
32 | -e 0, 1 or 2: default is 0 33 | 0: Do not save any pattern data, just save AC automata self 34 | 1: Save word length into AC automata, methods will be enabled: 35 | * BaseAC::wlen(word_id) 36 | * BaseAC::restore_word(state, word_id) 37 | 2: Save word length and content into AC automata, methods will be enabled: 38 | * BaseAC::word(word_id) will be valid 39 | 40 | Notes: 41 | word_id will be set as bytes lexicalgraphical ordinal number. 42 | word_id maybe named as pattern_id somewhere else. 43 | ``` 44 | 45 | ## 示例程序 46 | 47 | Pattern 库的 AC 自动机创建好以后,可以使用下面的测试程序进行性能测试,测试工具的具体用法,可以参考下表中的**代码**链接。 48 | 49 | 编译好的测试程序包含在 package 的 `sample/bin` 目录,下面的代码同时也包含在 package 中的 `sample/src` 目录。 50 | 51 | |代码|[package](http://terark.com/zh/download/tools/latest) 中可执行文件的路径| 52 | |----|----| 53 | |[ac_scan.cpp](../../samples/src/ac_scan.cpp)|`sample/bin/ac_scan.exe`| 54 | 55 | -------------------------------------------------------------------------------- /tools/bin/pinyin_build.exe.md: -------------------------------------------------------------------------------- 1 | # pinyin_build.exe 2 | 3 | 该程序位于 [下载 package](http://terark.com/zh/download/tools/latest) 中的 `bin` 目录,要运行该程序,您需要将 [下载 package](http://terark.com/zh/download/tools/latest) 中的 `lib` 目录加入环境变量 `LD_LIBRARY_PATH` 。 4 | 5 | ## Links 6 | * [nark.cc 首页即拼音纠错 demo](http://nark.cc) 7 | 8 | ## Usage Help 9 | ```text 10 | Usage: 11 | pinyin_build.exe Options [Input-TXT-File] 12 | 13 | Description: 14 | Build pinyin-to-HanZiWord DFA from Input-HanZiWord-File, 15 | If Input-HanZiWord-File is omitted, use stdin. 
16 | 17 | Options: 18 | -h : Show this help infomation 19 | -q : Be quiet, don't print progress info 20 | -O Large-DFA-File : large, but pretty fast 21 | -o Small-DFA-File : small, but pretty slow, now deprecated 22 | -S Super-DFA-File : small, a little (smaller and slower) than Quick-DFA 23 | -U Louds-DFA-File : very small, very slow(~5x slower than Large-DFA) 24 | -u Louds-DFA-File : same as -U, but use RankSelect_IL 25 | -m [Save-As-MemMap]: Argment is optional, default is 1 26 | * If not specified this option or omitted the argument, use default(1) 27 | * If the argument is 0, it means "DO NOT Save-As-MemMap" 28 | -z MaxZpathTrieNum:MinZpathLen : default is 5:2 29 | -E u1:e1,u2:e2,u3:e3,... 30 | Build edit-distance keys into the result DFA 31 | u1:e1 indicate edit-distance 'e1' is tolerated for unicode char num 'u1' 32 | Root of edit-distance is dfa.state_move(initial_state, '\1') 33 | -H : With HanZiWord, the result DFA is not only searched by PinYin, but aslo 34 | the HanZiWord self, a HanZiWord may have satellite data(such as word-freq), 35 | this will enable searching the satellite data by HanZiWord 36 | -j MinJianPinLen : default is 7 37 | Build JianPin(just ShengMu) for HanZiWords which length is at least 38 | MinJianPinLen 39 | -p BasePinYin-File 40 | * BasePinYin should at least include all single HanZi's PinYin. 41 | * BasePinYin could include extra HanZiWord to PinYin pairs, this feature 42 | is for prevent auto-spell Multiple (Pinyin to HanZiWord) pairs. 43 | -2 : Allow for Double-ShengMu JianPin 44 | A JianPin of a HanZiWord should be All-Double-ShengMu or All-Single-ShengMu 45 | -w WarningType 46 | Only one WarningType(nohz) is supportted now. More warning types maybe 47 | added later. 
48 | WarningTypes: 49 | * nohz : Warning when unicode chars of word are not all HanZi 50 | ``` 51 | -------------------------------------------------------------------------------- /samples/src/fsa_extract_by_iter.cpp: -------------------------------------------------------------------------------- 1 | #define _SCL_SECURE_NO_WARNINGS 2 | #include 3 | #include 4 | #include 5 | 6 | using namespace terark; 7 | 8 | void usage(const char* prog) { 9 | fprintf(stderr, R"EOS(usage: %s Options [startKey]" 10 | Options: 11 | -i DFA-File 12 | specify input DFA-File 13 | -b 14 | run as benchmark, do not print output for each key, 15 | but print time and speed on finish 16 | 17 | startKey is optional, 18 | if specified, start at startKey, 19 | if not specified, start at first key 20 | 21 | )EOS", prog); 22 | } 23 | 24 | int main(int argc, char* argv[]) { 25 | const char* ifile = NULL; 26 | bool benchmark = false; 27 | profiling pf; 28 | for (int opt=0; (opt = getopt(argc, argv, "i:b")) != -1; ) { 29 | switch (opt) { 30 | case '?': usage(argv[0]); return 3; 31 | case 'b': benchmark = true; break; 32 | case 'i': ifile = optarg; break; 33 | } 34 | } 35 | const char* startKey = NULL; 36 | if (optind < argc) { 37 | startKey = argv[optind]; 38 | } 39 | size_t limit = size_t(-1); 40 | if (optind + 1 < argc) { 41 | limit = (size_t)strtoul(argv[optind+1], NULL, 10); 42 | } 43 | std::unique_ptr dfa; 44 | if (ifile) dfa.reset(BaseDFA::load_from(ifile)); // by filename 45 | else dfa.reset(BaseDFA::load_from(stdin)); // by FILE* 46 | auto adfa = dynamic_cast(dfa.get()); 47 | if (NULL == adfa) { 48 | fprintf(stderr, "file \"%s\" is not a dawg\n", ifile?ifile:"stdin"); 49 | return 1; 50 | } 51 | std::unique_ptr iter(adfa->adfa_make_iter()); 52 | long long t0 = pf.now(); 53 | size_t sumlen = 0; 54 | size_t count = 0; 55 | if (startKey ? 
iter->seek_lower_bound(startKey) 56 | : iter->seek_begin()) { 57 | do { 58 | fstring w = iter->word(); 59 | sumlen += w.size(); 60 | if (!benchmark) { 61 | printf("%.*s\n", w.ilen(), w.data()); 62 | } 63 | } while (++count < limit && iter->incr()); 64 | } 65 | long long t1 = pf.now(); 66 | if (benchmark) { 67 | if (count) { 68 | printf("Bytes : %9.3f MB\n", sumlen/1e6); 69 | printf("AvgLen : %9.3f Bytes\n", sumlen/1.0/count); 70 | printf("Time : %9.3f seconds\n", pf.sf(t0,t1)); 71 | printf("Throughput : %9.3f MB/s\n", sumlen/pf.uf(t0,t1)); 72 | printf("QPS : %9.3f K op/s\n", count/pf.mf(t0,t1)); 73 | } 74 | else { 75 | printf("Not Found, no result\n"); 76 | } 77 | } 78 | return 0; 79 | } 80 | 81 | -------------------------------------------------------------------------------- /tools/bin/zbs_build.exe.md: -------------------------------------------------------------------------------- 1 | # zbs\_build.exe 2 | 3 | 该程序位于 [下载 package](http://terark.com/zh/download/tools/latest) 中的 `bin` 目录,要运行该程序,您需要将 [下载 package](http://terark.com/zh/download/tools/latest) 中的 `lib` 目录加入环境变量 `LD_LIBRARY_PATH` 。 4 | 5 | 直接执行 zbs\_build.exe,输出如下帮助信息: 6 | ``` 7 | Usage: zbs_build.exe Options Input-TXT-File 8 | Options: 9 | -h Show this help information 10 | -c checksumLevel: default 1, can be 0, 1, 2, 3 11 | 0: checksum disabled 12 | 1: checksum file header 13 | 2: checksum each record(needs extra 4 bytes per record) 14 | 3: checksum zip data area, and do not checksum each record 15 | -C Check for correctness 16 | -d Use Dawg String Pool 17 | -e EntropyAlgo: Use EntropyAlgo for entropy zip, default none 18 | h: huffman 19 | f: FSE (Finite State Entropy) 20 | -n Nest Level 21 | -r Random get benchmark 22 | -o Output-Trie-File 23 | -g Output-Graphviz-Dot-File 24 | -b BenchmarkLoop : Run benchmark 25 | -B Input is binary(bson) data 26 | -S FloatNumber : Sample ratio of dictionary compression, usually < 0.1, or -- 27 | FloatNumber@ : sample from , with sample ratio 28 | dict: : is the 
predefined sample dictionary 29 | -L local_match_opt when using dictionary compression 30 | h: Local Match by hashing, this is the default 31 | s: Local Match by suffix array 32 | -U [optional(0 or 1)] use new Ultra ref encoding, default 1 33 | -Z compress global dictionary 34 | 35 | If Input-TXT-File is omitted, use stdin 36 | Note: 37 | If -S SampleRatio is specified, Input-TXT-File must be a regular file, 38 | because in this case DictZipBlobStore need to read input file two passes, 39 | stdin can be redirected from a regular file, but CAN NOT pass through pipes, 40 | such as `cat Input-TXT-File | ...` 41 | ``` 42 | 43 | ## 简要说明 44 | 45 | zbs_build.exe 将每输入文本文件的每一行看做一条记录,行号就是记录ID(从 0 开始)。产生的输出包含两个文件,一个是 -o 中指定的文件,另一个是在 -o 的文件后加了 `-dict` 后缀的字典文件。通过 API 加载时这两个文件都需要。 46 | 47 | | 选项 | 说明 | 48 | -----|-----| 49 | -c | 见上文 usage| 50 | -C | 检查正确性,需要巨大的额外内存,谨慎使用| 51 | -d | 老选项,不要使用 | 52 | -e | 额外使用熵编码,对压缩率提升很小,一般能提升 3% 左右,但对解压速度影响较大,大约慢 30% | 53 | -n | 老选项,不要使用 | 54 | -b | 压完了顺便跑一下 benchmark,默认按记录 id **顺序**读取| 55 | -r | 跑 benchmark 时,按记录 id **随机**读取| 56 | -B | 输入是 BSON 数据,而不是文本,BSON 数据一条接一条紧挨着存储
只利用 BSON 前 4 个字节是长度的特性,而不管数据是否真的是 BSON
**注意**: BSON 前 4 个字节是 LittleEndian int32 表示的长度,该长度包括这 4 个字节本身,真实长度要减去 4| 57 | -S | 见上文 usage | 58 | -L | 在局部压缩中使用后缀数组,会提高一点压缩率,但会大幅降低压缩速度,对解压速度无任何影响,
单条数据较大时压缩率提高比较明显| 59 | -U | zbs 老版本中使用了简单的编码(因为实现简单),
新版中使用了复杂的编码,会提高压缩率,基本上对压缩速度和解压都没有影响| 60 | -Z | 把全局字典也压缩一下| 61 | -------------------------------------------------------------------------------- /samples/src/ac_scan.cpp: -------------------------------------------------------------------------------- 1 | #define _CRT_SECURE_NO_WARNINGS 2 | #define _SCL_SECURE_NO_WARNINGS 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | using namespace terark; 10 | 11 | void usage(const char* prog) { 12 | fprintf(stderr, R"EOS(usage: %s Options 13 | Options: 14 | 15 | -i AC-Automata-File 16 | AC-Automata-File is a file built by `ac_build.exe` from a set of patterns 17 | 18 | -f TXT-File 19 | a text file to be scaned, if omitted, read from stdin 20 | 21 | )EOS", prog); 22 | } 23 | 24 | struct OnHit { 25 | void operator()(size_t endpos, const uint32_t* words, size_t cnt, size_t state) const { 26 | for (size_t i = 0; i < cnt; ++i) { 27 | if (ac->has_word_length()) { 28 | int wlen = ac->wlen(words[i]); 29 | size_t pos = endpos - wlen; 30 | #ifndef NDEBUG 31 | try { 32 | std::string acWord = ac->restore_word(state, words[i]); 33 | assert(fstring(text + pos, wlen) == acWord); 34 | } 35 | catch (const std::invalid_argument&) { 36 | // not a DoubleArray AC automaton 37 | } 38 | #endif 39 | printf("hit_endpos=%04d : word_id=%06d : %.*s\n", int(endpos), words[i], wlen, text + pos); 40 | } 41 | else { 42 | printf("hit_endpos=%04d : word_id=%06d\n", int(endpos), words[i]); 43 | (void)(state); // remove compiler warning 44 | } 45 | } 46 | } 47 | const BaseAC* ac; 48 | const char* text; 49 | }; 50 | int main(int argc, char* argv[]) { 51 | const char* dfa_file = NULL; 52 | const char* txt_file = NULL; 53 | for (int opt=0; (opt = getopt(argc, argv, "i:f:")) != -1; ) { 54 | switch (opt) { 55 | case '?': usage(argv[0]); return 3; 56 | case 'i': dfa_file = optarg; break; 57 | case 'f': txt_file = optarg; break; 58 | } 59 | } 60 | if (NULL == dfa_file) { 61 | fprintf(stderr, "usage: %s -i input_ac_dfa_file [-f 
text_file_to_be_matched]\n", argv[0]); 62 | return 1; 63 | } 64 | std::unique_ptr dfa(BaseDFA::load_from(dfa_file)); 65 | if (dfa->get_ac() == NULL) { 66 | fprintf(stderr, "Fail: file: %s is not a AC DFA\n", dfa_file); 67 | return 1; 68 | } 69 | terark::Auto_fclose fp; 70 | if (txt_file) { 71 | fp = fopen(txt_file, "r"); 72 | if (NULL == fp) { 73 | fprintf(stderr, "Fail: fopen(%s, r) = %s\n", txt_file, strerror(errno)); 74 | return 1; 75 | } 76 | } 77 | OnHit on_hit = { dfa->get_ac(), NULL }; 78 | if (!on_hit.ac->has_word_length()) { 79 | fprintf(stderr, "Pattern length was not saved in AC Automata,\n" 80 | " -- Only match endpos and pattern_id will be reported!\n"); 81 | } 82 | terark::LineBuf line; 83 | while (line.getline(fp.self_or(stdin)) > 0) { 84 | line.chomp(); 85 | on_hit.text = line.p; 86 | on_hit.ac->ac_scan(line, ref(on_hit)); 87 | } 88 | return 0; 89 | } 90 | 91 | -------------------------------------------------------------------------------- /tools/bin/adfa_build.exe.md: -------------------------------------------------------------------------------- 1 | # adfa_build.exe 2 | 3 | 该程序位于 [下载 package](http://terark.com/zh/download/tools/latest) 中的 `bin` 目录,要运行该程序,您需要将 [下载 package](http://terark.com/zh/download/tools/latest) 中的 `lib` 目录加入环境变量 `LD_LIBRARY_PATH` 。 4 | 5 | ## Links 6 | * [把自动机用作 Key-Value 存储](http://nark.cc/p/?p=172) 7 | * [用自动机表达嵌套的数据](http://nark.cc/p/?p=1085) 8 | * [使用 MapReduce 创建超大巨型自动机](http://nark.cc/p/?p=960) 9 | * [DFA的实现](http://nark.cc/p/?p=163) 10 | * [自动机的一些算法和应用](http://nark.cc/p/?p=161) 11 | 12 | ## Usage Help 13 | ```text 14 | At least one of 15 | -O xfast_dfa_file or -o small_linear_dfa_file or 16 | -U LoudsDFA_SE or -u LoudsDFA_IL or 17 | -Q quick_dfa_file or -S small_quick__dfa_file is required 18 | 19 | Usage: 20 | adfa_build.exe Options [Input-TXT-File] 21 | 22 | Description: 23 | Build DFA from Input-TXT-File, If Input-TXT-File is omitted, use stdin. 
24 | This program is named adfa_build because the DFA graph is Acyclic. 25 | 26 | Options: 27 | -h : Show this help information 28 | -q : Be quiet, don't print progress info 29 | -O Large-DFA-File : large, but pretty fast 30 | -o Small-DFA-File : small, but pretty slow, now deprecated 31 | -Q Quick-DFA-File : small, and pretty fast 32 | -S Super-DFA-File : small, a little (smaller and slower) than Quick-DFA 33 | -U Louds-DFA-File : very small, very slow(~5x slower than Large-DFA) 34 | -u Louds-DFA-File : same as -U, but use RankSelect_IL 35 | -s : Notify the program that Input-TXT-File is sorted 36 | The program will use more efficient(less memory and faster) algorithm 37 | -d : Key-Value Delimiter 38 | -D : Decompose tail-cross-edge, only for Louds-DFA-File 39 | -w dup 40 | Issue warnings when found duplicate lines, argument must be "dup" 41 | -m [Save-As-MemMap]: Argument is optional, default is 1 42 | * If not specified this option or omitted the argument, use default(1) 43 | * If the argument is 0, it means "DO NOT Save-As-MemMap" 44 | -z MaxZpathTrieNum:MinZpathLen : default is 5:2 45 | -c : Check Non-Large-DFA-File for correctness 46 | Note: Enabled when compiling with CXXFLAGS += "-D WITH_CHECK_LINEAR" 47 | Enabled = NO, WITH_CHECK_LINEAR is NOT defined 48 | 49 | -l [Small-DFA-State-Bytes]:WalkMethod 50 | Small-DFA-State-Bytes is deprecated and will be ignored 51 | When specifying WalkMethod, the preceding colon(:) is required 52 | WalkMethod is the DFA-Graph WalkMethod when building Non-LargeDFA-File 53 | WalkMethod should be BFS/DFS/PFS, default is PFS 54 | WalkMethod has no effect when building Louds-DFA-File, which is always BFS 55 | ``` 56 | 57 | ## Description 58 | Option `-U` and `-u` use LOUDS succinct encoding, this will generate very small dfa file, 59 | for `url`, or file path test(`find / | adfa_build.exe ...`), the compression rate may be up to 100x. 
60 | -------------------------------------------------------------------------------- /samples/src/match_key.cpp: -------------------------------------------------------------------------------- 1 | #define _SCL_SECURE_NO_WARNINGS 2 | #include 3 | #include 4 | 5 | using namespace terark; 6 | 7 | void usage(const char* prog) { 8 | fprintf(stderr, R"EOS(usage: %s Options string1 string2 ... 9 | 10 | Options: 11 | 12 | -d [delim] 13 | Argument `delim` is optional, if specified delim is set as the argument, 14 | if argument `delim` is omitted, delim is set as integer 256, this requires 15 | sigma of the DFA to be at least 257. 16 | If option '-d' is omitted, use '\t' as delim. 17 | 18 | -i DFA-File 19 | 20 | -l 21 | use longest match(by `match_key_l`), if {"ab", "abc", "abcd"} are in DFA, 22 | and string to match is "abcde", `match_key_l` will only match "abcd", 23 | `match_key` will match all of { "ab", "abc", "abcd" } 24 | 25 | -r [root_ch] 26 | set root state as state_move(initial_state, root_ch) 27 | currently used for test pinyin_build with edit-distance 28 | 29 | string1 string2 ... 
30 | multiple strings, used as parameters of match_key/match_key_l 31 | 32 | )EOS", prog); 33 | } 34 | 35 | int delim = '\t'; 36 | 37 | struct OnMatch { 38 | void operator()(int keylen, int idx, fstring value) { 39 | if (strnlen(value.p, value.n) < value.size() || 256 == delim) { 40 | // value is binary 41 | printf("%-20.*s idx=%08d bin=", keylen, text, idx); 42 | for (int i = 0; i < value.n; ++i) 43 | printf("%02X", (byte_t)value.p[i]); 44 | printf("\n"); 45 | } 46 | else { // value is text 47 | printf("%-20.*s idx=%08d val=%.*s\n" 48 | , keylen, text, idx, value.ilen(), value.data()); 49 | } 50 | this->keylen = keylen; 51 | } 52 | const char* text; 53 | int keylen; 54 | }; 55 | 56 | int main(int argc, char* argv[]) { 57 | const char* ifile = NULL; // input dfa file name 58 | bool longest_match = false; 59 | auchar_t root_ch = 257; 60 | for (int opt=0; (opt = getopt(argc, argv, "d::i:lr::")) != -1; ) { 61 | switch (opt) { 62 | case '?': usage(argv[0]); return 3; 63 | case 'd': 64 | if (optarg) 65 | delim = optarg[0]; 66 | else // no arg for -d, set delim for binary key-val match 67 | delim = 256; // dfa built by kvbin_build use 256 as delim 68 | break; 69 | case 'i': ifile = optarg; break; 70 | case 'l': longest_match = true; break; 71 | case 'r': 72 | // set root state as state_move(initial_state, root_ch) 73 | // currently used for test pinyin_build with edit-distance 74 | if (optarg) 75 | root_ch = optarg[0]; 76 | else 77 | root_ch = 256; 78 | break; 79 | } 80 | } 81 | std::unique_ptr dfa; 82 | if (ifile) dfa.reset(MatchingDFA::load_from(ifile)); // by filename 83 | else dfa.reset(MatchingDFA::load_from(stdin)); // by FILE* 84 | MatchContext ctx; 85 | if (root_ch < 257) { 86 | ctx.root = dfa->v_state_move(initial_state, root_ch); 87 | } 88 | OnMatch on_match; 89 | for(int i = optind; i < argc; ++i) { 90 | const char* text = argv[i]; 91 | on_match.text = text; 92 | on_match.keylen = 0; 93 | printf("----delim=%c[%02X] text=%s\n", delim, delim, text); 94 | int 
len; ///< max_partial_match_len, could be ignored 95 | if (longest_match) 96 | len = dfa->match_key_l(ctx, delim, text, ref(on_match)); 97 | else 98 | len = dfa->match_key(ctx, delim, text, ref(on_match)); 99 | if (on_match.keylen != len) 100 | printf("max_partial_match_len=%d: %.*s\n", len, len, text); 101 | } 102 | return 0; 103 | } 104 | 105 | -------------------------------------------------------------------------------- /tools/bin/nlt_build.exe.md: -------------------------------------------------------------------------------- 1 | # nlt\_build.exe 2 | 3 | 该程序位于 [下载 package](http://terark.com/zh/download/tools/latest) 中的 `bin` 目录,要运行该程序,您需要将 [下载 package](http://terark.com/zh/download/tools/latest) 中的 `lib` 目录加入环境变量 `LD_LIBRARY_PATH` 。 4 | 5 | 该程序生成一种压缩的字典树,生成的字典树可以通过 API 加载,并执行丰富的搜索功能。压缩,指的是的字典树占用的内存,比起构成字典树的那些 Key 的总字节数,压缩了多少倍。 6 | 7 | * 对于自然语言的词表,压缩率一般可以达到 3 到 5 倍 8 | * 对于 url 集合,压缩率一般可以超过 10 倍,如果 url 平均长度 60 字节,压缩后,平均每个 url 占用不超过 6 个字节 9 | 10 | 直接执行 nlt\_build.exe,输出如下帮助信息: 11 | ``` 12 | Usage: rls/nlt_build.exe Options Input-TXT-File 13 | Options: 14 | -h Show this help information 15 | -M maxFragLen 16 | -n Nest Level 17 | -o Output-Trie-File 18 | -g Output-Graphviz-Dot-File 19 | -b BenchmarkLoop : Run benchmark 20 | -w BenchmarkLoop : Run benchmark with dict_index 21 | -s indicate that input is sorted, the top level sort will be omitted 22 | -B Input is binary(bson) data 23 | -6 Input is base64 encoded data 24 | -U StrVecType, can be one of: 25 | x: SortableStrVec, this is the default 26 | s: SortedStrVec, -s must also be specified, for double check 27 | z: ZoSortedStrVec, -s must also be specified, for double check 28 | f: FixedLenStrVec 29 | +--------------------------------------------------------+ 30 | | |Memory Usage|Var Key Len|Can Be UnSorted?| 31 | |SortableStrVec| High | Yes | Yes | 32 | | SortedStrVec| Medium | Yes |!Must be Sorted!| 33 | |ZoSortedStrVec| Low | Yes |!Must be Sorted!| 34 | |FixedLenStrVec| Lowest | No | Yes | 35 | 
+--------------------------------------------------------+ 36 | ZoSortedStrVec is slower than SortedStrVec(20% ~ 40% slower). 37 | When using ZoSortedStrVec, you should also use -T 4@/path/to/tmpdir, 38 | otherwise warning will be issued. 39 | -T TmpDir, if specified, will use less memory 40 | TmpLevel@TmpDir, TmpLevel is 0-9 41 | -R RankSelect implementation, can be: 42 | se-256 43 | se-512 44 | il-256 45 | m-se-512 46 | m-il-256 47 | m-xl-256, this is the default 48 | If Input-TXT-File is omitted, use stdin 49 | ``` 50 | 51 | ## 简要说明 52 | 53 | 输入文件是文本,每行包含一个 key,重复 key 在输出的 trie 树中会被消重(但build过程中仍会占据重复的内存)。 54 | 55 | 如果总共有 n 个不同的 key,生成的 trie 树中,每个 key 对应一个 ID,ID 的范围是 `0` 到 `n-1` ,加载 trie 时使用 mmap,API 支持以下操作: 56 | 57 | |操作|时间复杂度| 说明 | 58 | ------|-------|------| 59 | 反向搜索|`O(keylen)`| 通过 ID 得到相应的 key| 60 | 正向搜索|`O(keylen)`| 通过 key **精确搜索**(得到相应的 ID)| 61 | 范围搜索|`O(keylen)`| 通过 iterator 支持,相当于 `std::map::lower_bound(key)`
可正向/反向遍历,相当于 `bidirectional_iterator`| 62 | 前缀搜索|搜索过程 `O(prefix_len)`
输出过程 `O(sum(result_keylen))`|按 key **前缀**搜索,搜索匹配的前缀长度,
搜索到以后可输出匹配该前缀的候选 (key,ID) 集合)| 63 | 正则表达式
搜索|最快 `O(regex_len)`
最慢 `O(all_key_len*regex_len)`|不同的正则表达式,时间复杂度差异较大
正则表达式头部不确定性越大(例如`.*abc`),耗时越大| 64 | 65 | ## 命令行详解 66 | | 选项 | 说明 | 67 | -----|-----| 68 | -M | 最长片段尺寸,一般不需要指定| 69 | -n | 嵌套层数,最小 2,一般不超过 5;一般情况下,嵌套越深,压缩率越高,速度越慢| 70 | -o | 输出的嵌套 trie 树文件 | 71 | -b | 如果想运行 benchmark 看一下性能,可以增加该参数,数据尺寸很大时,指定 1 即可| 72 | -g | 输出 graph-viz 的 dot 文件,用于查看树的图形表示| 73 | -s | 如果输入数据已经排序(按 ByteArray 字典序,`env LC_ALL=C sort` 命令行的默认排序),
指定该参数可以省略嵌套树最外层的排序,提高创建速度,对生成的 trie 树无任何影响| 74 | -B | 输入的每个 Key 是 BSON 数据,而不是文本,可用于二进制 Key,不验证 BSON 数据的合法性,
只要求按照 BSON的格式:每条数据前 4 个字节是 LittleEndian 的 int32 ,表示数据的长度。
**特别注意**:该长度包含 4 字节的 int32 本身| 75 | -T | 指定一个临时目录,如果输入数据很大(几十 GB),创建过程中可能会消耗较多内存,
指定该参数可以把一部分并非必须时刻驻留内存的数据保存在临时文件中 | 76 | -U | StrVecType, 参考 usage 77 | -------------------------------------------------------------------------------- /tools/bin/regex_build.exe.md: -------------------------------------------------------------------------------- 1 | # regex\_build.exe 2 | 3 | 该程序位于 [下载 package](http://terark.com/zh/download/tools/latest) 中的 `bin` 目录,要运行该程序,您需要将 [下载 package](http://terark.com/zh/download/tools/latest) 中的 `lib` 目录加入环境变量 `LD_LIBRARY_PATH` 。 4 | 5 | ## Links 6 | * [支持 并、交、差 的正则表达式引擎](http://nark.cc/p/?p=1280) 7 | * [规则引擎建库工具](http://nark.cc/p/?p=177) 8 | * [多正则表达式匹配(Multiple Regular Expression Matching)](http://nark.cc/p/?p=174) 9 | * [多正则表达式匹配(Multiple Regular Expression Matching)中的动态 DFA 算法](http://nark.cc/p/?p=178) 10 | * [有多个初始状态的 DFA](http://nark.cc/p/?p=176) 11 | * [多正则表达式匹配的应用](http://nark.cc/p/?p=1428) 12 | * [一个很难的字符串问题](http://nark.cc/p/?p=153) 13 | 14 | ## Usage Help 15 | 16 | ```text 17 | Usage: 18 | regex_build.exe Options [Input-Regex-File] 19 | 20 | Description: 21 | Compile multiple regular expressions into one DFA 22 | 23 | Options: 24 | -h : Show this help infomation 25 | -q : Be quiet, don't print progress info 26 | -I : Ignore case for regex 27 | -L : Use Latin1 charset, not utf8(the default charset) 28 | -a Add-DotStar(Optional): can be 0, 1, 2 29 | 0: Do not add DotStar, treat all regular expression as head-anchored 30 | The result DFA will be a "RegexMatch" DFA, not a "RegexSearch" DFA. 31 | 1: Respect the regex's head anchor 32 | 2: Prepend DotStar on the unioned DFA, this is just for **DEBUG TEST** 33 | Note: 34 | * Head anchored regex example: "^Hello\s+World!" 35 | * If this option is omitted, Add-Dot-Star is 0 36 | * If the option argument is omitted, Add-Dot-Star is 1 37 | -O Large-DFA-File: Large, but pretty fast 38 | -o Small-DFA-File: Small(maybe ~10x smaller than Large-DFA) 39 | Small-DFA is essentially a kind of Virtual Machine DFA. 
40 | Small-DFA is not only small, but also fast, in most cases, it is as 41 | faster as Large-DFA. But it may be much faster than Large-DFA in some 42 | special/ideal cases. 43 | -b Bin-Meta-File: The meta file used for capturing submatch 44 | -s Optional-Arg 45 | Build dfa with submatch capture, but the dfa algorithm can only capture 46 | one-pass DFA. 47 | If Optional-Arg is 's', the algorithm will ignore the regex's one-pass 48 | property, and try to use one-pass capturing algorithm to capture all 49 | submatches, this may produce bad result. But when using utf8 encoding, 50 | some unicode-one-pass regex is NOT byte-one-pass, for example: 51 | "从([^到]+)到([^怎]+)怎么走" is unicode-one-pass but not byte-one-pass. 52 | By using option "-ss", MultiRegexSubmatch::match_utf8() will successfully 53 | capture the submatches. 54 | Prepend a '*' at the start of a line has the same effect of '-ss' for the 55 | current regex. 56 | -E C-Program-HeaderFile 57 | Generate enum constant definitions in C-Program-HeaderFile 58 | constant name is specified by second column of Input-Regex-File 59 | -D : Build a dynamic matching DFA 60 | In many cases, the full unioned dfa can not be built, in this situation, 61 | dynamic matching DFA is a compromised solution between full-nfa matching 62 | and full-dfa matching. 63 | For speed up online dynamic matching, this program use a heuristic algorithm 64 | to clustering(partial union) SubDFAs offline. 
65 | -z MinZpathLen, default is 2, only effective for full-unioned-dfa 66 | -g : Write DFA and NFA dot graph file for every regex 67 | -G : Write dot graph file for unioned DFA 68 | -c Conflict-Report-File 69 | -t DFA-Type: can be 'd', '1', '2', default is 'd' 70 | a: use DFA class which optimized for adfa_build 71 | d: use DenseDFA 72 | 2: use DenseDFA_V2 73 | -T Timeout1[:Timeout2]: Timout2 is optional 74 | Timeout1: Timeout for compiling one regex 75 | Timeout2: Timeout for union all regex 76 | -P : DO NOT Limit cluster_union_power_size, default is true 77 | 78 | Input-Regex-File format 79 | This is a tab separated text file, column 1 is the regex, other columns are 80 | optinal user defined contents. 81 | The regex is advanced regex: http://terark.cc/p/?p=1280 82 | Lines can be continued by backslash(\), same as which in C/C++ language. 83 | 84 | Bin-Meta-File format 85 | This is a tab separated text file, column description: 86 | Column 1: Integer regex-id, start with 0 87 | Column 2: Integer number-of-submaches(including the full match) 88 | Column 3: Boolean do-capturing, 1 or 0 89 | Column 4: Boolean is-one-pass, 1 or 0 90 | Column 5: String copy of full Input-Regex-File line 91 | ``` 92 | 93 | ## 运行匹配测试程序 94 | 95 | 规则库的 DFA 创建好以后,可以使用下面的测试程序进行性能测试,测试工具的具体用法,可以参考下表中的**代码**链接。 96 | 97 | 编译好的测试程序包含在 package 的 `sample/bin` 目录,下面的代码同时也包含在 package 中的 `sample/src` 目录。 98 | 99 | |代码|[package](http://terark.com/zh/download/tools/latest) 中可执行文件的路径| 100 | |----|----| 101 | |[regex_allmatch.cpp](../../samples/src/regex_allmatch.cpp)|`sample/bin/regex_allmatch.exe`| 102 | |[regex_maxmatch.cpp](../../samples/src/regex_maxmatch.cpp)|`sample/bin/regex_maxmatch.exe`| 103 | |[regex_submatch.cpp](../../samples/src/regex_submatch.cpp)|`sample/bin/regex_submatch.exe`| 104 | 105 | 106 | 107 | -------------------------------------------------------------------------------- /samples/src/regex_maxmatch.cpp: 
-------------------------------------------------------------------------------- 1 | #define _CRT_SECURE_NO_WARNINGS 2 | #define _SCL_SECURE_NO_WARNINGS 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #if defined(__DARWIN_C_ANSI) 12 | #define malloc_stats() (void)(0) 13 | #else 14 | #include 15 | #endif 16 | 17 | #ifdef _MSC_VER 18 | #define strcasecmp _stricmp 19 | #define malloc_stats() (void)(0) 20 | typedef intptr_t ssize_t; 21 | #endif 22 | 23 | using namespace terark; 24 | 25 | bool read_one_record(FILE* fp, terark::LineBuf* line, bool binary) { 26 | if (binary) { 27 | int32_t offsets[2]; 28 | return line->read_binary_tuple(offsets, 1, fp); 29 | } 30 | else { 31 | bool ret = line->getline(fp) > 0; 32 | line->chomp(); 33 | return ret; 34 | } 35 | } 36 | 37 | int match_text(MultiRegexFullMatch& fm, fstring text 38 | , bool shortest_match, bool ignore_case) 39 | { 40 | if (shortest_match) { 41 | if (ignore_case) 42 | // return fm.shortest_match(text, ::tolower); 43 | return fm.shortest_match(text, terark::gtab_ascii_tolower); 44 | else return fm.shortest_match(text); 45 | } 46 | else { 47 | if (ignore_case) 48 | // return fm.match(text, ::tolower); 49 | return fm.match(text, terark::gtab_ascii_tolower); 50 | else return fm.match(text); 51 | } 52 | } 53 | 54 | MultiRegexFullMatch::PosLen 55 | find_first(MultiRegexFullMatch& fm, fstring text 56 | , bool latin1, bool shortest_match, bool ignore_case) 57 | { 58 | if (latin1) { 59 | if (shortest_match) { 60 | if (ignore_case) 61 | return fm.shortest_byte_find_first(text, terark::gtab_ascii_tolower); 62 | else return fm.shortest_byte_find_first(text); 63 | } 64 | else { 65 | if (ignore_case) 66 | return fm.byte_find_first(text, terark::gtab_ascii_tolower); 67 | else return fm.byte_find_first(text); 68 | } 69 | } 70 | else { 71 | if (shortest_match) { 72 | if (ignore_case) 73 | return fm.shortest_utf8_find_first(text, terark::gtab_ascii_tolower); 74 | else return 
fm.shortest_utf8_find_first(text); 75 | } 76 | else { 77 | if (ignore_case) 78 | return fm.utf8_find_first(text, terark::gtab_ascii_tolower); 79 | else return fm.utf8_find_first(text); 80 | } 81 | } 82 | } 83 | 84 | size_t 85 | find_all(MultiRegexFullMatch& fm, fstring text 86 | , bool latin1, bool shortest_match, bool ignore_case) 87 | { 88 | if (latin1) { 89 | if (shortest_match) { 90 | if (ignore_case) 91 | return fm.shortest_byte_find_all(text, terark::gtab_ascii_tolower); 92 | else return fm.shortest_byte_find_all(text); 93 | } 94 | else { 95 | if (ignore_case) 96 | return fm.byte_find_all(text, terark::gtab_ascii_tolower); 97 | else return fm.byte_find_all(text); 98 | } 99 | } 100 | else { 101 | if (shortest_match) { 102 | if (ignore_case) 103 | return fm.shortest_utf8_find_all(text, terark::gtab_ascii_tolower); 104 | else return fm.shortest_utf8_find_all(text); 105 | } 106 | else { 107 | if (ignore_case) 108 | return fm.utf8_find_all(text, terark::gtab_ascii_tolower); 109 | else return fm.utf8_find_all(text); 110 | } 111 | } 112 | } 113 | 114 | int main(int argc, char* argv[]) { 115 | MultiRegexMatchOptions mrOpt; 116 | const char* txt_file = NULL; 117 | bool match_all_text = false; 118 | bool shortest_match = false; 119 | bool first_only = false; 120 | bool latin1 = false; 121 | bool verbose = false; 122 | bool binary = false; 123 | bool ignore_case = false; 124 | for (int opt=0; (opt = getopt(argc, argv, "asFLD:f:i:vBI")) != -1; ) { 125 | switch (opt) { 126 | case '?': return 1; 127 | case 'a': match_all_text = true; break; 128 | case 's': shortest_match = true; break; 129 | case 'F': first_only = true; break; 130 | case 'L': latin1 = true; break; 131 | case 'D': mrOpt.enableDynamicDFA = atoi(optarg) != 0; break; 132 | case 'i': mrOpt.dfaFilePath = optarg; break; 133 | case 'f': txt_file = optarg; break; 134 | case 'v': verbose = true; break; 135 | case 'B': binary = true; break; 136 | case 'I': ignore_case = true;break; 137 | } 138 | } 139 | if 
(mrOpt.dfaFilePath.empty()) { 140 | fprintf(stderr, "usage: %s -i dfa_file [-f match_file] [-v]\n", argv[0]); 141 | return 1; 142 | } 143 | terark::Auto_fclose fp; 144 | if (txt_file) { 145 | fp = fopen(txt_file, "r"); 146 | if (NULL == fp) { 147 | fprintf(stderr, "FATAL: fopen(%s, r) = %s\n", txt_file, strerror(errno)); 148 | return 1; 149 | } 150 | } 151 | mrOpt.load_dfa(); 152 | std::unique_ptr 153 | fmPtr(MultiRegexFullMatch::create(mrOpt)); 154 | MultiRegexFullMatch& fm = *fmPtr; 155 | terark::profiling pf; 156 | long long ts = pf.now(); 157 | fm.warm_up(); 158 | terark::LineBuf line; 159 | long long t0 = pf.now(); 160 | long lineno = 0; 161 | long sumlen = 0; 162 | long matched = 0; 163 | while (read_one_record(fp.self_or(stdin), &line, binary)) { 164 | lineno++; 165 | if (match_all_text && first_only) { 166 | MultiRegexFullMatch::PosLen res = 167 | find_first(fm, line, latin1, shortest_match, ignore_case); 168 | if (verbose && res.len) { 169 | printf("line:%ld: (%d %d :", lineno, res.pos, res.len); 170 | for (size_t i = 0; i < fm.size(); ++i) { 171 | printf(" %d", fm[i]); 172 | } 173 | printf(")\n"); 174 | } 175 | if (fm.size()) 176 | matched++; 177 | } 178 | else if (match_all_text && !first_only) { 179 | size_t all_match_size = 180 | find_all(fm, line, latin1, shortest_match, ignore_case); 181 | if (verbose && all_match_size) { 182 | printf("line:%ld:", lineno); 183 | for(size_t i = 0; i < all_match_size; ) { 184 | size_t j = i; 185 | int pos = fm.all_match(i).pos; 186 | int len = fm.all_match(i).len; 187 | printf(" (%d %d :", pos, len); 188 | do { 189 | printf(" %d", fm.all_match(j).regex_id); 190 | ++j; 191 | } while (j < all_match_size && pos == fm.all_match(j).pos); 192 | printf(")"); 193 | i = j; 194 | } 195 | printf("\n"); 196 | } 197 | if (all_match_size) 198 | matched++; 199 | } 200 | else { 201 | int len = match_text(fm, line, shortest_match, ignore_case); 202 | if (verbose && fm.size()) { 203 | printf("line:%ld:len=%d:", lineno, len); 204 | for 
(size_t i = 0; i < fm.size(); ++i) { 205 | printf(" %d", fm[i]); 206 | } 207 | printf("\n"); 208 | } 209 | if (fm.size()) 210 | matched++; 211 | } 212 | sumlen += line.n; 213 | } 214 | long long t1 = pf.now(); 215 | printf("time(warm_up)=%f's\n", pf.sf(ts, t0)); 216 | printf("time=%f's lines=%ld matched=%ld QPS=%f Throughput=%f'MiB Latency=%f'us\n" 217 | , pf.sf(t0,t1) 218 | , lineno 219 | , matched 220 | , lineno/pf.sf(t0,t1) 221 | , sumlen/pf.uf(t0,t1) 222 | , pf.uf(t0,t1)/lineno 223 | ); 224 | malloc_stats(); 225 | return 0; 226 | } 227 | 228 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 
29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 
61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | --------------------------------------------------------------------------------