├── CREDITS ├── EXPERIMENTAL ├── config.w32 ├── .gitignore ├── akm.php ├── tests └── 001.phpt ├── ahocorasick ├── mpool.h ├── node.h ├── ahocorasick.h ├── replace.h ├── mpool.c ├── actypes.h ├── ahocorasick.c ├── node.c └── replace.c ├── config.m4 ├── php_akm.h ├── README.md └── akm.c /CREDITS: -------------------------------------------------------------------------------- 1 | akm -------------------------------------------------------------------------------- /EXPERIMENTAL: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /config.w32: -------------------------------------------------------------------------------- 1 | // $Id$ 2 | // vim:ft=javascript 3 | 4 | // If your extension references something external, use ARG_WITH 5 | // ARG_WITH("akm", "for akm support", "no"); 6 | 7 | // Otherwise, use ARG_ENABLE 8 | // ARG_ENABLE("akm", "enable akm support", "no"); 9 | 10 | if (PHP_AKM != "no") { 11 | EXTENSION("akm", "akm.c", PHP_EXTNAME_SHARED, "/DZEND_ENABLE_STATIC_TSRMLS_CACHE=1"); 12 | } 13 | 14 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Object files 2 | *.o 3 | *.ko 4 | *.obj 5 | *.elf 6 | 7 | # Precompiled Headers 8 | *.gch 9 | *.pch 10 | 11 | # Libraries 12 | *.lib 13 | *.a 14 | *.la 15 | *.lo 16 | 17 | # Shared objects (inc. Windows DLLs) 18 | *.dll 19 | *.so 20 | *.so.* 21 | *.dylib 22 | 23 | # Executables 24 | *.exe 25 | *.out 26 | *.app 27 | *.i*86 28 | *.x86_64 29 | *.hex 30 | 31 | # Debug files 32 | *.dSYM/ 33 | *.su 34 | -------------------------------------------------------------------------------- /akm.php: -------------------------------------------------------------------------------- 1 | "; 3 | 4 | if(!extension_loaded('akm')) { 5 | dl('akm.' . 
PHP_SHLIB_SUFFIX); 6 | } 7 | $module = 'akm'; 8 | $functions = get_extension_funcs($module); 9 | echo "Functions available in the test extension:$br\n"; 10 | foreach($functions as $func) { 11 | echo $func."$br\n"; 12 | } 13 | echo "$br\n"; 14 | $function = 'confirm_' . $module . '_compiled'; 15 | if (extension_loaded($module)) { 16 | $str = $function($module); 17 | } else { 18 | $str = "Module $module is not compiled into PHP"; 19 | } 20 | echo "$str\n"; 21 | ?> 22 | -------------------------------------------------------------------------------- /tests/001.phpt: -------------------------------------------------------------------------------- 1 | --TEST-- 2 | Check for akm presence 3 | --SKIPIF-- 4 | 5 | --FILE-- 6 | 20 | --EXPECT-- 21 | akm extension is available 22 | -------------------------------------------------------------------------------- /ahocorasick/mpool.h: -------------------------------------------------------------------------------- 1 | /* 2 | * mpool.c memory pool management 3 | * This file is part of multifast. 4 | * 5 | Copyright 2010-2015 Kamiar Kanani 6 | 7 | multifast is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU Lesser General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version. 11 | 12 | multifast is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU Lesser General Public License for more details. 16 | 17 | You should have received a copy of the GNU Lesser General Public License 18 | along with multifast. If not, see . 
19 | */ 20 | 21 | #ifndef _MPOOL_H_ 22 | #define _MPOOL_H_ 23 | 24 | #ifdef __cplusplus 25 | extern "C" { 26 | #endif 27 | 28 | /* Forward declaration */ 29 | struct mpool; 30 | 31 | 32 | struct mpool *mpool_create (size_t size); 33 | void mpool_free (struct mpool *pool); 34 | 35 | void *mpool_malloc (struct mpool *pool, size_t size); 36 | void *mpool_strdup (struct mpool *pool, const char *str); 37 | void *mpool_strndup (struct mpool *pool, const char *str, size_t n); 38 | 39 | 40 | #ifdef __cplusplus 41 | } 42 | #endif 43 | 44 | #endif /* _MPOOL_H_ */ 45 | -------------------------------------------------------------------------------- /config.m4: -------------------------------------------------------------------------------- 1 | dnl $Id$ 2 | dnl config.m4 for extension akm 3 | 4 | dnl Comments in this file start with the string 'dnl'. 5 | dnl Remove where necessary. This file will not work 6 | dnl without editing. 7 | 8 | dnl If your extension references something external, use with: 9 | 10 | PHP_ARG_WITH(akm, for akm support, 11 | Make sure that the comment is aligned: 12 | [ --with-akm Include akm support]) 13 | 14 | dnl Otherwise use enable: 15 | 16 | PHP_ARG_ENABLE(akm, whether to enable akm support, 17 | Make sure that the comment is aligned: 18 | [ --enable-akm Enable akm support]) 19 | 20 | if test "$PHP_AKM" != "no"; then 21 | dnl Write more examples of tests here... 
22 | 23 | dnl # --with-akm -> check with-path 24 | dnl SEARCH_PATH="/usr/local /usr" # you might want to change this 25 | dnl SEARCH_FOR="/include/akm.h" # you most likely want to change this 26 | dnl if test -r $PHP_AKM/$SEARCH_FOR; then # path given as parameter 27 | dnl AKM_DIR=$PHP_AKM 28 | dnl else # search default path list 29 | dnl AC_MSG_CHECKING([for akm files in default path]) 30 | dnl for i in $SEARCH_PATH ; do 31 | dnl if test -r $i/$SEARCH_FOR; then 32 | dnl AKM_DIR=$i 33 | dnl AC_MSG_RESULT(found in $i) 34 | dnl fi 35 | dnl done 36 | dnl fi 37 | dnl 38 | dnl if test -z "$AKM_DIR"; then 39 | dnl AC_MSG_RESULT([not found]) 40 | dnl AC_MSG_ERROR([Please reinstall the akm distribution]) 41 | dnl fi 42 | 43 | dnl # --with-akm -> add include path 44 | dnl PHP_ADD_INCLUDE($AKM_DIR/ahocorasick) 45 | 46 | dnl # --with-akm -> check for lib and symbol presence 47 | dnl LIBNAME=akm # you may want to change this 48 | dnl LIBSYMBOL=akm # you most likely want to change this 49 | 50 | dnl PHP_CHECK_LIBRARY($LIBNAME,$LIBSYMBOL, 51 | dnl [ 52 | dnl PHP_ADD_LIBRARY_WITH_PATH($LIBNAME, $AKM_DIR/$PHP_LIBDIR, AKM_SHARED_LIBADD) 53 | dnl AC_DEFINE(HAVE_AKMLIB,1,[ ]) 54 | dnl ],[ 55 | dnl AC_MSG_ERROR([wrong akm lib version or lib not found]) 56 | dnl ],[ 57 | dnl -L$AKM_DIR/$PHP_LIBDIR -lm 58 | dnl ]) 59 | dnl 60 | dnl PHP_SUBST(AKM_SHARED_LIBADD) 61 | 62 | PHP_NEW_EXTENSION(akm, akm.c ahocorasick/ahocorasick.c ahocorasick/mpool.c ahocorasick/node.c ahocorasick/replace.c, $ext_shared,, -DZEND_ENABLE_STATIC_TSRMLS_CACHE=1) 63 | fi 64 | -------------------------------------------------------------------------------- /php_akm.h: -------------------------------------------------------------------------------- 1 | /* 2 | +----------------------------------------------------------------------+ 3 | | PHP Version 7 | 4 | +----------------------------------------------------------------------+ 5 | | Copyright (c) 1997-2016 The PHP Group | 6 | 
+----------------------------------------------------------------------+ 7 | | This source file is subject to version 3.01 of the PHP license, | 8 | | that is bundled with this package in the file LICENSE, and is | 9 | | available through the world-wide-web at the following url: | 10 | | http://www.php.net/license/3_01.txt | 11 | | If you did not receive a copy of the PHP license and are unable to | 12 | | obtain it through the world-wide-web, please send a note to | 13 | | license@php.net so we can mail you a copy immediately. | 14 | +----------------------------------------------------------------------+ 15 | | Author: maben | 16 | +----------------------------------------------------------------------+ 17 | */ 18 | 19 | /* $Id$ */ 20 | 21 | #ifndef PHP_AKM_H 22 | #define PHP_AKM_H 23 | 24 | extern zend_module_entry akm_module_entry; 25 | #define phpext_akm_ptr &akm_module_entry 26 | 27 | #define PHP_AKM_VERSION "0.1.0" /* Replace with version number for your extension */ 28 | 29 | #ifdef PHP_WIN32 30 | # define PHP_AKM_API __declspec(dllexport) 31 | #elif defined(__GNUC__) && __GNUC__ >= 4 32 | # define PHP_AKM_API __attribute__ ((visibility("default"))) 33 | #else 34 | # define PHP_AKM_API 35 | #endif 36 | 37 | #ifdef ZTS 38 | #include "TSRM.h" 39 | #endif 40 | 41 | #include "ahocorasick/ahocorasick.h" 42 | 43 | /* 44 | Declare any global variables you may need between the BEGIN 45 | and END macros here: 46 | 47 | ZEND_BEGIN_MODULE_GLOBALS(akm) 48 | zend_long global_value; 49 | char *global_string; 50 | ZEND_END_MODULE_GLOBALS(akm) 51 | */ 52 | 53 | typedef AC_TRIE_t akm_trie_t; 54 | typedef AC_PATTERN_t akm_pattern_t; 55 | typedef AC_MATCH_t akm_match_t; 56 | typedef AC_TEXT_t akm_text_t; 57 | 58 | #define AKM_PATTID_TYPE_STRING AC_PATTID_TYPE_STRING 59 | 60 | #define akm_trie_add ac_trie_add 61 | #define akm_trie_create ac_trie_create 62 | #define akm_trie_finalize ac_trie_finalize 63 | #define akm_trie_release ac_trie_release 64 | #define akm_trie_settext 
ac_trie_settext 65 | #define akm_trie_findnext ac_trie_findnext 66 | 67 | PHP_FUNCTION(akm_match); 68 | PHP_FUNCTION(akm_replace); 69 | 70 | /* Always refer to the globals in your function as AKM_G(variable). 71 | You are encouraged to rename these macros something shorter, see 72 | examples in any other php module directory. 73 | */ 74 | #define AKM_G(v) ZEND_MODULE_GLOBALS_ACCESSOR(akm, v) 75 | 76 | #if defined(ZTS) && defined(COMPILE_DL_AKM) 77 | ZEND_TSRMLS_CACHE_EXTERN(); 78 | #endif 79 | 80 | #endif /* PHP_AKM_H */ 81 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Ahocorasick keyword match 2 | 3 | ``` 4 | ____ __ ______ ___ __ __ __ ___ 5 | / __ \/ / / / __ \/ | / //_// |/ / 6 | / /_/ / /_/ / /_/ / /| | / ,< / /|_/ / 7 | / ____/ __ / ____/ ___ |/ /| |/ / / / 8 | /_/ /_/ /_/_/ /_/ |_/_/ |_/_/ /_/ 9 | 10 | ``` 11 | 12 | **关键字快速查找匹配** 13 | 14 | ## 编译安装 15 | 16 | ``` 17 | $ git clone https://github.com/imaben/php-akm.git 18 | $ cd php-akm 19 | $ phpize 20 | $ ./configure 21 | $ make 22 | $ sudo make install 23 | ``` 24 | 25 | ## php.ini配置 26 | ```ini 27 | [akm] 28 | extension=akm.so 29 | akm.enable=On|Off 30 | akm.dict_dir=/home/dict 31 | ``` 32 | 33 | 说明: 34 | 35 | - `akm.enable`表示扩展启用或关闭 36 | - `akm.dict_dir`用来指定关键词词典所在的文件夹 37 | 38 | 39 | ## 函数说明 40 | 41 | ### akm_match 42 | 43 | **关键词匹配** 44 | 45 | ```php 46 | array akm_match(string $dict_name, string $text) 47 | ``` 48 | 49 | **参数说明** 50 | 51 | - `dict_name`:字典名称,即`akm.dict_dir`配置所在文件夹下的字典库名称(文件名) 52 | - `text`:待匹配的文本 53 | 54 | **返回值** 55 | 56 | 返回匹配含有`keyword`、`offset`、`extension`字段数组列表的二维数组,如: 57 | 58 | ```php 59 | [ 60 | { 61 | "keyword" : "敏感词", 62 | "offset": 123, 63 | "extension": "扩展文本" 64 | }, 65 | { 66 | "keyword" : "敏感词2", 67 | "offset": 1231, 68 | "extension": "扩展文本" 69 | } 70 | ] 71 | ``` 72 | 说明: 73 | 74 | - keyword:敏感词 75 | - offset:敏感词所在文本中的位置 76 | - 
extension:扩展文本 77 | 78 | ### akm_replace 79 | 80 | **关键词替换** 81 | 82 | ```php 83 | int akm_replace(string $dict_name, string &$text, callable $callback) 84 | ``` 85 | 86 | **参数说明** 87 | 88 | - `dict_name`: 字典名称,即`akm.dict_dir`配置所在文件夹下的字典库名称(文件名) 89 | - `text`:待替换文本 90 | - `callback`:处理匹配字符串的回调,接受三个参数 91 | - string `keyword`:匹配出的关键词 92 | - int `index`:关键词在文本中的位置 93 | - string `extension`:扩展文本 94 | 95 | 如回调中返回一个字符串,则把匹配到的关键词替换成返回值。如无返回值,则不做任何处理 96 | 97 | **返回值** 98 | 99 | 返回成功匹配的关键词个数 100 | 101 | ### akm_get_dict_list 102 | 103 | **获取词典列表** 104 | 105 | ```php 106 | array akm_get_dict_list() 107 | ``` 108 | 109 | **返回值** 110 | 111 | 返回已索引的词典名称列表 112 | 113 | ## 字典数据结构 114 | 115 | ``` 116 | 关键词|扩展文本 117 | keyword1|extension_text1 118 | keyword2|extension_text2 119 | keyword3|extension_text3 120 | ``` 121 | 122 | 说明: 123 | 124 | - “|”为关键词和扩展文本之间的分割符 125 | - “|”只对首行第一个有效,例“发票|政治|敏感”,则认定`发票`为关键词,`政治|敏感`为扩展文本 126 | - 如无“|”符,则整行被认为一个关键词,返回时无扩展文本 127 | - 每行定义一个关键词,空行自动跳过 128 | 129 | ## 性能测试 130 | 131 | PC配置: 132 | ``` 133 | CPU:Intel(R) Core(TM) i5-4590 CPU @ 3.30GHz 134 | 内存:4GB*3 1600MHz 135 | 硬盘:东芝Q300 136 | 操作系统:Linux 4.6.4-1-ARCH 137 | ``` 138 | 139 | 测试代码: 140 | 141 | ```php 142 | 6 | 7 | multifast is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU Lesser General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version. 11 | 12 | multifast is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU Lesser General Public License for more details. 16 | 17 | You should have received a copy of the GNU Lesser General Public License 18 | along with multifast. If not, see . 
19 | */ 20 | 21 | #ifndef _NODE_H_ 22 | #define _NODE_H_ 23 | 24 | #include "actypes.h" 25 | 26 | #ifdef __cplusplus 27 | extern "C" { 28 | #endif 29 | 30 | /* Forward Declaration */ 31 | struct act_edge; 32 | struct ac_trie; 33 | 34 | /** 35 | * Aho-Corasick Trie node 36 | */ 37 | typedef struct act_node 38 | { 39 | int id; /**< Node identifier: used for debugging purpose */ 40 | 41 | int final; /**< A final node accepts pattern; 0: not, 1: is final */ 42 | size_t depth; /**< Distance between this node and the root */ 43 | struct act_node *failure_node; /**< The failure transition node */ 44 | 45 | struct act_edge *outgoing; /**< Outgoing edges array */ 46 | size_t outgoing_capacity; /**< Max capacity of outgoing edges */ 47 | size_t outgoing_size; /**< Number of outgoing edges */ 48 | 49 | AC_PATTERN_t *matched; /**< Matched patterns array */ 50 | size_t matched_capacity; /**< Max capacity of the matched patterns */ 51 | size_t matched_size; /**< Number of matched patterns in this node */ 52 | 53 | AC_PATTERN_t *to_be_replaced; /**< Pointer to the pattern that must be 54 | * replaced */ 55 | 56 | struct ac_trie *trie; /**< The trie that this node belongs to */ 57 | 58 | } ACT_NODE_t; 59 | 60 | /** 61 | * Edge of the node 62 | */ 63 | struct act_edge 64 | { 65 | AC_ALPHABET_t alpha; /**< Transition alpha */ 66 | ACT_NODE_t *next; /**< Target of the edge */ 67 | }; 68 | 69 | /* 70 | * Node interface functions 71 | */ 72 | 73 | ACT_NODE_t *node_create (struct ac_trie *trie); 74 | ACT_NODE_t *node_create_next (ACT_NODE_t *nod, AC_ALPHABET_t alpha); 75 | ACT_NODE_t *node_find_next (ACT_NODE_t *nod, AC_ALPHABET_t alpha); 76 | ACT_NODE_t *node_find_next_bs (ACT_NODE_t *nod, AC_ALPHABET_t alpha); 77 | 78 | void node_assign_id (ACT_NODE_t *nod); 79 | void node_add_edge (ACT_NODE_t *nod, ACT_NODE_t *next, AC_ALPHABET_t alpha); 80 | void node_sort_edges (ACT_NODE_t *nod); 81 | void node_accept_pattern (ACT_NODE_t *nod, AC_PATTERN_t *new_patt, int copy); 82 | void 
node_collect_matches (ACT_NODE_t *nod); 83 | void node_release_vectors (ACT_NODE_t *nod); 84 | int node_book_replacement (ACT_NODE_t *nod); 85 | void node_display (ACT_NODE_t *nod); 86 | 87 | #ifdef __cplusplus 88 | } 89 | #endif 90 | 91 | #endif 92 | -------------------------------------------------------------------------------- /ahocorasick/ahocorasick.h: -------------------------------------------------------------------------------- 1 | /* 2 | * ahocorasick.h: The main ahocorasick header file. 3 | * This file is part of multifast. 4 | * 5 | Copyright 2010-2015 Kamiar Kanani 6 | 7 | multifast is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU Lesser General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version. 11 | 12 | multifast is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU Lesser General Public License for more details. 16 | 17 | You should have received a copy of the GNU Lesser General Public License 18 | along with multifast. If not, see . 19 | */ 20 | 21 | #ifndef _AHOCORASICK_H_ 22 | #define _AHOCORASICK_H_ 23 | 24 | #include "replace.h" 25 | 26 | #ifdef __cplusplus 27 | extern "C" { 28 | #endif 29 | 30 | /* Forward declaration */ 31 | struct act_node; 32 | struct mpool; 33 | 34 | /* 35 | * The A.C. Trie data structure 36 | */ 37 | typedef struct ac_trie 38 | { 39 | struct act_node *root; /**< The root node of the trie */ 40 | 41 | size_t patterns_count; /**< Total patterns in the trie */ 42 | 43 | short trie_open; /**< This flag indicates that if trie is finalized 44 | * or not. After finalizing the trie you can not 45 | * add pattern to trie anymore. 
*/ 46 | 47 | struct mpool *mp; /**< Memory pool */ 48 | 49 | /* ******************* Thread specific part ******************** */ 50 | 51 | /* It is possible to search a long input chunk by chunk. In order to 52 | * connect these chunks and make a continuous view of the input, we need 53 | * the following variables. 54 | */ 55 | struct act_node *last_node; /**< Last node we stopped at */ 56 | size_t base_position; /**< Represents the position of the current chunk, 57 | * related to whole input text */ 58 | 59 | AC_TEXT_t *text; /**< A helper variable to hold the input chunk */ 60 | size_t position; /**< A helper variable to hold the relative current 61 | * position in the given text */ 62 | 63 | MF_REPLACEMENT_DATA_t repdata; /**< Replacement data structure */ 64 | 65 | ACT_WORKING_MODE_t wm; /**< Working mode */ 66 | 67 | } AC_TRIE_t; 68 | 69 | /* 70 | * The API functions 71 | */ 72 | 73 | AC_TRIE_t *ac_trie_create (void); 74 | AC_STATUS_t ac_trie_add (AC_TRIE_t *thiz, AC_PATTERN_t *patt, int copy); 75 | void ac_trie_finalize (AC_TRIE_t *thiz); 76 | void ac_trie_release (AC_TRIE_t *thiz); 77 | void ac_trie_display (AC_TRIE_t *thiz); 78 | 79 | int ac_trie_search (AC_TRIE_t *thiz, AC_TEXT_t *text, int keep, 80 | AC_MATCH_CALBACK_f callback, void *param); 81 | 82 | void ac_trie_settext (AC_TRIE_t *thiz, AC_TEXT_t *text, int keep); 83 | AC_MATCH_t ac_trie_findnext (AC_TRIE_t *thiz); 84 | 85 | int multifast_replace (AC_TRIE_t *thiz, AC_TEXT_t *text, 86 | MF_REPLACE_MODE_t mode, MF_REPLACE_CALBACK_f callback, void *param); 87 | void multifast_rep_flush (AC_TRIE_t *thiz, int keep); 88 | 89 | 90 | #ifdef __cplusplus 91 | } 92 | #endif 93 | 94 | #endif 95 | -------------------------------------------------------------------------------- /ahocorasick/replace.h: -------------------------------------------------------------------------------- 1 | /* 2 | * replace.h: Defines replacement related data structures 3 | * 4 | * This file is part of multifast. 
5 | * 6 | Copyright 2010-2015 Kamiar Kanani 7 | 8 | multifast is free software: you can redistribute it and/or modify 9 | it under the terms of the GNU Lesser General Public License as published by 10 | the Free Software Foundation, either version 3 of the License, or 11 | (at your option) any later version. 12 | 13 | multifast is distributed in the hope that it will be useful, 14 | but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | GNU Lesser General Public License for more details. 17 | 18 | You should have received a copy of the GNU Lesser General Public License 19 | along with multifast. If not, see . 20 | */ 21 | 22 | #ifndef _MF_REPLACE_H_ 23 | #define _MF_REPLACE_H_ 24 | 25 | #include "actypes.h" 26 | 27 | #ifdef __cplusplus 28 | extern "C" { 29 | #endif 30 | 31 | /** 32 | * Different replace modes 33 | */ 34 | typedef enum mf_replace_mode 35 | { 36 | MF_REPLACE_MODE_DEFAULT = 0, 37 | MF_REPLACE_MODE_NORMAL, /**< Normal replace mode: Short factors are swollen 38 | * by the big one; All other patterns are replced 39 | * even if they have overlap. 40 | */ 41 | MF_REPLACE_MODE_LAZY /**< Lazy replace mode: every pattern which comes 42 | * first is replced; the overlapping pattrns are 43 | * nullified by the previous patterns; consequently, 44 | * factor patterns nullify the big patterns. 45 | */ 46 | } MF_REPLACE_MODE_t; 47 | 48 | 49 | /** 50 | * Before we replace any pattern we encounter, we should be patient 51 | * because it may be a factor of another longer pattern. So we maintain a record 52 | * of each recognized pattern until we make sure that it is not a sub-pattern 53 | * and can be replaced by its substitute. To keep a record of packets we use 54 | * the following structure. 
55 | */ 56 | struct mf_replacement_nominee 57 | { 58 | AC_PATTERN_t *pattern; 59 | size_t position; 60 | }; 61 | 62 | 63 | /** 64 | * Contains replacement related data 65 | */ 66 | typedef struct mf_replacement_date 67 | { 68 | AC_TEXT_t buffer; /**< replacement buffer: maintains the result 69 | * of replacement */ 70 | 71 | AC_TEXT_t backlog; /**< replacement backlog: if a pattern is divided 72 | * between two or more different chunks, then at the 73 | * end of the first chunk we need to keep it here until 74 | * the next chunk comes and we decide if it is a 75 | * pattern or just a pattern prefix. */ 76 | 77 | unsigned int has_replacement; /**< total number of to-be-replaced patterns 78 | */ 79 | 80 | struct mf_replacement_nominee *noms; /**< Replacement nominee array */ 81 | size_t noms_capacity; /**< Max capacity of the array */ 82 | size_t noms_size; /**< Number of nominees in the array */ 83 | 84 | size_t curser; /**< the position in the input text before which all 85 | * patterns are replaced and the result is saved to the 86 | * buffer. */ 87 | 88 | MF_REPLACE_MODE_t replace_mode; /**< Replace mode */ 89 | 90 | MF_REPLACE_CALBACK_f cbf; /**< Callback function */ 91 | void *user; /**< User parameters sent to the callback function */ 92 | 93 | struct ac_trie *trie; /**< Pointer to the trie */ 94 | 95 | } MF_REPLACEMENT_DATA_t; 96 | 97 | 98 | #ifdef __cplusplus 99 | } 100 | #endif 101 | 102 | #endif /* REPLACE_H */ 103 | -------------------------------------------------------------------------------- /ahocorasick/mpool.c: -------------------------------------------------------------------------------- 1 | /* 2 | * mpool.c memory pool management 3 | * This file is part of multifast. 
/*
 * (license header of mpool.c, continued)
 *
    Copyright 2010-2015 Kamiar Kanani

    multifast is free software: you can redistribute it and/or modify
    it under the terms of the GNU Lesser General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    multifast is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU Lesser General Public License for more details.

    You should have received a copy of the GNU Lesser General Public License
    along with multifast.  If not, see <http://www.gnu.org/licenses/>.
*/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* This translation unit defines everything it uses, so the header is only
 * needed to cross-check the prototypes. Guarding it lets the file also be
 * compiled stand-alone (e.g. by a unit-test harness). */
#if defined(__has_include)
#  if __has_include("mpool.h")
#    include "mpool.h"
#  endif
#else
#  include "mpool.h"
#endif


#define MPOOL_BLOCK_SIZE (24*1024)

/* Every allocation is rounded up to a multiple of this many bytes. */
#define MPOOL_ALIGNMENT 16

/* Largest value representable in size_t (C89-friendly spelling). */
#define MPOOL_SIZE_MAX ((size_t)-1)

#if (MPOOL_BLOCK_SIZE % 16 > 0)
#error "MPOOL_BLOCK_SIZE must be multiple 16"
#endif

/* NOTE(review): AC_PATTRN_MAX_LENGTH is defined in actypes.h, which this file
 * does not appear to include; if so the macro evaluates to 0 here and this
 * check can never fire. Confirm and include actypes.h if the check is wanted.
 */
#if (MPOOL_BLOCK_SIZE <= AC_PATTRN_MAX_LENGTH)
#error "MPOOL_BLOCK_SIZE must be bigger than AC_PATTRN_MAX_LENGTH"
#endif

/* One contiguous chunk of memory owned by the pool. Blocks form a singly
 * linked list whose head is the only block allocations are carved from. */
struct mpool_block
{
    size_t size;            /* Capacity of the block in bytes */
    unsigned char *bp;      /* Block pointer */
    unsigned char *free;    /* Free area; End of allocated section */

    struct mpool_block *next; /* Next block */
};

struct mpool
{
    struct mpool_block *block; /* Head of the block list (current block) */
};


/**
 * @brief Allocate a new block to the pool
 *
 * @param size capacity of the block in bytes; 0 selects MPOOL_BLOCK_SIZE
 * @return the new, empty block, or NULL if memory could not be obtained
 ******************************************************************************/
static struct mpool_block *mpool_new_block (size_t size)
{
    struct mpool_block *block;

    if (!size)
        size = MPOOL_BLOCK_SIZE;

    block = malloc (sizeof *block);
    if (!block)
        return NULL;

    block->bp = malloc (size);
    if (!block->bp)
    {
        /* Do not leak the descriptor when the payload allocation fails */
        free (block);
        return NULL;
    }

    block->free = block->bp;
    block->size = size;
    block->next = NULL;

    return block;
}

/**
 * @brief Creates a new pool
 *
 * @param size capacity of the first block in bytes; 0 selects the default
 * @return the new pool, or NULL if memory could not be obtained
 ******************************************************************************/
struct mpool *mpool_create (size_t size)
{
    struct mpool *ret;

    ret = malloc (sizeof *ret);
    if (!ret)
        return NULL;

    ret->block = mpool_new_block (size);
    if (!ret->block)
    {
        free (ret);
        return NULL;
    }

    return ret;
}

/**
 * @brief Free a pool
 *
 * Releases every block of the pool and the pool itself. All pointers handed
 * out by mpool_malloc()/mpool_strdup()/mpool_strndup() become invalid.
 *
 * @param pool the pool to release; NULL is accepted and ignored
 ******************************************************************************/
void mpool_free (struct mpool *pool)
{
    struct mpool_block *p, *p_next;

    if (!pool)
        return;

    for (p = pool->block; p; p = p_next)
    {
        p_next = p->next;
        free (p->bp);
        free (p);
    }

    free (pool);
}

/**
 * @brief Allocate from a pool
 *
 * The requested size is rounded up to a multiple of MPOOL_ALIGNMENT. If the
 * current block cannot hold the request, a new block (at least as large as
 * the current one) becomes the current block.
 *
 * @param pool the pool to allocate from
 * @param size number of bytes requested
 * @return pointer to the allocated area, or NULL when pool is NULL/invalid,
 *         size is 0, size is too large to be rounded up, or memory could not
 *         be obtained
 ******************************************************************************/
void *mpool_malloc (struct mpool *pool, size_t size)
{
    void *ret = NULL;
    struct mpool_block *block, *new_block;
    size_t remain, block_size;

    if (!pool || !pool->block || !size)
        return NULL;

    /* Rounding up must not wrap around: a wrapped size would look like a
     * tiny (even zero byte) request and hand out an undersized area. */
    if (size > MPOOL_SIZE_MAX - (MPOOL_ALIGNMENT - 1))
        return NULL;

    /* This is to align memory allocation on multiple 16 boundary */
    size = (size + (MPOOL_ALIGNMENT - 1)) & ~(size_t)(MPOOL_ALIGNMENT - 1);

    block = pool->block;
    remain = block->size - (size_t)(block->free - block->bp);

    if (remain < size)
    {
        /* Allocate a new block */
        block_size = ((size > block->size) ? size : block->size);
        new_block = mpool_new_block (block_size);
        if (!new_block)
            return NULL; /* The pool is left untouched and still usable */

        new_block->next = block;
        block = pool->block = new_block;
    }

    ret = block->free;
    block->free += size;

    return ret;
}

/**
 * @brief Makes a copy of a string with known size
 *
 * Copies at most n characters of str (stopping at its terminator, the rest
 * of the n bytes is zero filled) and always appends a terminating zero.
 *
 * @param pool the pool that owns the copy
 * @param str the string to copy
 * @param n maximum number of characters to copy
 * @return the pool-owned copy, or NULL if str is NULL or allocation fails
 *****************************************************************************/
void *mpool_strndup (struct mpool *pool, const char *str, size_t n)
{
    char *ret;

    if (!str)
        return NULL;

    if (n == MPOOL_SIZE_MAX)
        return NULL; /* n + 1 would wrap around to 0 */

    if ((ret = mpool_malloc (pool, n + 1)))
    {
        /* strncpy is intentional: it stops at the terminator of str and
         * zero fills the remainder; the final terminator is set below. */
        strncpy (ret, str, n);
        ret[n] = '\0';
    }

    return ret;
}

/**
 * @brief Makes a copy of zero terminated string
 *
 * @param pool the pool that owns the copy
 * @param str the zero terminated string to copy
 * @return the pool-owned copy, or NULL if str is NULL or allocation fails
 ******************************************************************************/
void *mpool_strdup (struct mpool *pool, const char *str)
{
    if (!str)
        return NULL;

    return mpool_strndup (pool, str, strlen (str));
}

/*
 * --------------------------------------------------------------------------
 * /ahocorasick/actypes.h:
 * --------------------------------------------------------------------------
 * actypes.h: Defines basic data types of the trie
 * This file is part of multifast.
 *
 * Copyright 2010-2015 Kamiar Kanani
 *
 * multifast is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 */
11 | 12 | multifast is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU Lesser General Public License for more details. 16 | 17 | You should have received a copy of the GNU Lesser General Public License 18 | along with multifast. If not, see . 19 | */ 20 | 21 | #ifndef _AC_TYPES_H_ 22 | #define _AC_TYPES_H_ 23 | 24 | #include 25 | 26 | #ifdef __cplusplus 27 | extern "C" { 28 | #endif 29 | 30 | /** 31 | * @brief The alphabet type 32 | * 33 | * Actually defining AC_ALPHABET_t as a char works for many usage case, but 34 | * sometimes we deal with streams of other basic types e.g. integers or 35 | * enumerators. Although they consists of string of bytes (chars), but using 36 | * their specific types as AC_ALPHABET_t will lead to a better performance. 37 | * So instead of working with strings of chars, we assume that we are working 38 | * with strings of AC_ALPHABET_t and leave it optional for users to define 39 | * their own alphabets. 40 | */ 41 | typedef char AC_ALPHABET_t; 42 | 43 | /** 44 | * The text (strings of alphabets) type that is used for input/output when 45 | * dealing with the A.C. Trie. The text can contain zero value alphabets. 46 | */ 47 | typedef struct ac_text 48 | { 49 | const AC_ALPHABET_t *astring; /**< String of alphabets */ 50 | size_t length; /**< String length */ 51 | } AC_TEXT_t; 52 | 53 | /** 54 | * Pattern ID type 55 | * @see struct ac_pattid 56 | */ 57 | enum ac_pattid_type 58 | { 59 | AC_PATTID_TYPE_DEFAULT = 0, 60 | AC_PATTID_TYPE_NUMBER, 61 | AC_PATTID_TYPE_STRING 62 | }; 63 | 64 | /** 65 | * Provides a more readable representative for the pattern. Because patterns 66 | * themselves are not always suitable for displaying (e.g. patterns containing 67 | * special characters), we offer this type to improve intelligibility of the 68 | * output. 
Sometimes it can be also useful, when you are retrieving patterns 69 | * from a database, to maintain their identifiers in the trie for further 70 | * reference. We provisioned two possible types as a union. you can add your 71 | * type here. 72 | */ 73 | typedef struct ac_pattid 74 | { 75 | union 76 | { 77 | const char *stringy; /**< Null-terminated string */ 78 | long number; /**< Item indicator */ 79 | } u; 80 | 81 | enum ac_pattid_type type; /**< Shows the type of id */ 82 | 83 | } AC_PATTID_t; 84 | 85 | /** 86 | * This is the pattern type that the trie must be fed by. 87 | */ 88 | typedef struct ac_pattern 89 | { 90 | AC_TEXT_t ptext; /**< The search string */ 91 | AC_TEXT_t rtext; /**< The replace string */ 92 | AC_PATTID_t id; /**< Pattern identifier */ 93 | } AC_PATTERN_t; 94 | 95 | /** 96 | * @brief Provides the structure for reporting a match in the text. 97 | * 98 | * A match occurs when the trie reaches a final node. Any final 99 | * node can match one or more patterns at a position in the input text. 100 | * the 'patterns' field holds these matched patterns. Obviously these 101 | * matched patterns have same end-position in the text. There is a relationship 102 | * between matched patterns: the shorter one is a factor (tail) of the longer 103 | * one. The 'position' maintains the end position of matched patterns. 104 | */ 105 | typedef struct ac_match 106 | { 107 | AC_PATTERN_t *patterns; /**< Array of matched pattern(s) */ 108 | size_t size; /**< Number of matched pattern(s) */ 109 | 110 | size_t position; /**< The end position of the matching pattern(s) in 111 | * the input text */ 112 | } AC_MATCH_t; 113 | 114 | /** 115 | * The return status of various A.C. 
Trie functions 116 | */ 117 | typedef enum ac_status 118 | { 119 | ACERR_SUCCESS = 0, /**< No error occurred */ 120 | ACERR_DUPLICATE_PATTERN, /**< Duplicate patterns */ 121 | ACERR_LONG_PATTERN, /**< Pattern length is too long */ 122 | ACERR_ZERO_PATTERN, /**< Empty pattern (zero length) */ 123 | ACERR_TRIE_CLOSED /**< Trie is closed. */ 124 | } AC_STATUS_t; 125 | 126 | /** 127 | * @ brief The call-back function to report the matched patterns back to the 128 | * caller. 129 | * 130 | * When a match is found, the trie will reach the caller using this 131 | * function. You can send parameters to the call-back function when you call 132 | * _search() or _replace() functions. The call-back function receives those 133 | * parameters as the second parameter determined by void * in bellow. If you 134 | * return 0 from call-back function, it will tell trie to continue 135 | * searching, otherwise it will return from the trie function. 136 | */ 137 | typedef int (*AC_MATCH_CALBACK_f)(AC_MATCH_t *, void *); 138 | 139 | /** 140 | * @brief Call-back function to receive the replacement text (chunk by chunk). 
141 | */ 142 | typedef void (*MF_REPLACE_CALBACK_f)(AC_TEXT_t *, void *); 143 | 144 | /** 145 | * Maximum accepted length of search/replace pattern 146 | */ 147 | #define AC_PATTRN_MAX_LENGTH 1024 148 | 149 | /** 150 | * Replacement buffer size 151 | */ 152 | #define MF_REPLACEMENT_BUFFER_SIZE 2048 153 | 154 | #if (MF_REPLACEMENT_BUFFER_SIZE <= AC_PATTRN_MAX_LENGTH) 155 | #error "REPLACEMENT_BUFFER_SIZE must be bigger than AC_PATTRN_MAX_LENGTH" 156 | #endif 157 | 158 | typedef enum act_working_mode 159 | { 160 | AC_WORKING_MODE_SEARCH = 0, /* Default */ 161 | AC_WORKING_MODE_FINDNEXT, 162 | AC_WORKING_MODE_REPLACE /* Not used */ 163 | } ACT_WORKING_MODE_t; 164 | 165 | 166 | #ifdef __cplusplus 167 | } 168 | #endif 169 | 170 | #endif 171 | -------------------------------------------------------------------------------- /ahocorasick/ahocorasick.c: -------------------------------------------------------------------------------- 1 | /* 2 | * ahocorasick.c: Implements the A. C. Trie functionalities 3 | * This file is part of multifast. 4 | * 5 | Copyright 2010-2015 Kamiar Kanani 6 | 7 | multifast is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU Lesser General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version. 11 | 12 | multifast is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU Lesser General Public License for more details. 16 | 17 | You should have received a copy of the GNU Lesser General Public License 18 | along with multifast. If not, see . 
19 | */ 20 | 21 | #include 22 | #include 23 | #include 24 | 25 | #include "node.h" 26 | #include "ahocorasick.h" 27 | #include "mpool.h" 28 | 29 | /* Privates */ 30 | 31 | static void ac_trie_set_failure 32 | (ACT_NODE_t *node, AC_ALPHABET_t *alphas); 33 | 34 | static void ac_trie_traverse_setfailure 35 | (ACT_NODE_t *node, AC_ALPHABET_t *prefix); 36 | 37 | static void ac_trie_traverse_action 38 | (ACT_NODE_t *node, void(*func)(ACT_NODE_t *), int top_down); 39 | 40 | static void ac_trie_reset 41 | (AC_TRIE_t *thiz); 42 | 43 | static int ac_trie_match_handler 44 | (AC_MATCH_t * matchp, void * param); 45 | 46 | /* Friends */ 47 | 48 | extern void mf_repdata_init (AC_TRIE_t *thiz); 49 | extern void mf_repdata_reset (MF_REPLACEMENT_DATA_t *rd); 50 | extern void mf_repdata_release (MF_REPLACEMENT_DATA_t *rd); 51 | extern void mf_repdata_allocbuf (MF_REPLACEMENT_DATA_t *rd); 52 | 53 | 54 | /** 55 | * @brief Initializes the trie; allocates memories and sets initial values 56 | * 57 | * @return 58 | *****************************************************************************/ 59 | AC_TRIE_t *ac_trie_create (void) 60 | { 61 | AC_TRIE_t *thiz = (AC_TRIE_t *) malloc (sizeof(AC_TRIE_t)); 62 | thiz->mp = mpool_create(0); 63 | 64 | thiz->root = node_create (thiz); 65 | 66 | thiz->patterns_count = 0; 67 | 68 | mf_repdata_init (thiz); 69 | ac_trie_reset (thiz); 70 | thiz->text = NULL; 71 | thiz->position = 0; 72 | 73 | thiz->wm = AC_WORKING_MODE_SEARCH; 74 | thiz->trie_open = 1; 75 | 76 | return thiz; 77 | } 78 | 79 | /** 80 | * @brief Adds pattern to the trie. 81 | * 82 | * @param Thiz pointer to the trie 83 | * @param Patt pointer to the pattern 84 | * @param copy should trie make a copy of patten strings or not, if not, 85 | * then user must keep the strings valid for the life-time of the trie. If 86 | * the pattern are available in the user program then call the function with 87 | * copy = 0 and do not waste memory. 
88 | * 89 | * @return The return value indicates the success or failure of adding action 90 | *****************************************************************************/ 91 | AC_STATUS_t ac_trie_add (AC_TRIE_t *thiz, AC_PATTERN_t *patt, int copy) 92 | { 93 | size_t i; 94 | ACT_NODE_t *n = thiz->root; 95 | ACT_NODE_t *next; 96 | AC_ALPHABET_t alpha; 97 | 98 | if(!thiz->trie_open) 99 | return ACERR_TRIE_CLOSED; 100 | 101 | if (!patt->ptext.length) 102 | return ACERR_ZERO_PATTERN; 103 | 104 | if (patt->ptext.length > AC_PATTRN_MAX_LENGTH) 105 | return ACERR_LONG_PATTERN; 106 | 107 | for (i = 0; i < patt->ptext.length; i++) 108 | { 109 | alpha = patt->ptext.astring[i]; 110 | if ((next = node_find_next (n, alpha))) 111 | { 112 | n = next; 113 | continue; 114 | } 115 | else 116 | { 117 | next = node_create_next (n, alpha); 118 | next->depth = n->depth + 1; 119 | n = next; 120 | } 121 | } 122 | 123 | if(n->final) 124 | return ACERR_DUPLICATE_PATTERN; 125 | 126 | n->final = 1; 127 | node_accept_pattern (n, patt, copy); 128 | thiz->patterns_count++; 129 | 130 | return ACERR_SUCCESS; 131 | } 132 | 133 | /** 134 | * @brief Finalizes the preprocessing stage and gets the trie ready 135 | * 136 | * Locates the failure node for all nodes and collects all matched 137 | * pattern for each node. It also sorts outgoing edges of node, so binary 138 | * search could be performed on them. After calling this function the automate 139 | * will be finalized and you can not add new patterns to the automate. 
140 | * 141 | * @param thiz pointer to the trie 142 | *****************************************************************************/ 143 | void ac_trie_finalize (AC_TRIE_t *thiz) 144 | { 145 | AC_ALPHABET_t prefix[AC_PATTRN_MAX_LENGTH]; 146 | 147 | /* 'prefix' defined here, because ac_trie_traverse_setfailure() calls 148 | * itself recursively */ 149 | ac_trie_traverse_setfailure (thiz->root, prefix); 150 | 151 | ac_trie_traverse_action (thiz->root, node_collect_matches, 1); 152 | mf_repdata_allocbuf (&thiz->repdata); 153 | 154 | thiz->trie_open = 0; /* Do not accept patterns any more */ 155 | } 156 | 157 | /** 158 | * @brief Search in the input text using the given trie. 159 | * 160 | * @param thiz pointer to the trie 161 | * @param text input text to be searched 162 | * @param keep indicated that if the input text the successive chunk of the 163 | * previous given text or not 164 | * @param callback when a match occurs this function will be called. The 165 | * call-back function in turn after doing its job, will return an integer 166 | * value, 0 means continue search, and non-0 value means stop search and return 167 | * to the caller. 168 | * @param user this parameter will be send to the call-back function 169 | * 170 | * @return 171 | * -1: failed; trie is not finalized 172 | * 0: success; input text was searched to the end 173 | * 1: success; input text was searched partially. (callback broke the loop) 174 | *****************************************************************************/ 175 | int ac_trie_search (AC_TRIE_t *thiz, AC_TEXT_t *text, int keep, 176 | AC_MATCH_CALBACK_f callback, void *user) 177 | { 178 | size_t position; 179 | ACT_NODE_t *current; 180 | ACT_NODE_t *next; 181 | AC_MATCH_t match; 182 | 183 | if (thiz->trie_open) 184 | return -1; /* Trie must be finalized first. 
*/ 185 | 186 | if (thiz->wm == AC_WORKING_MODE_FINDNEXT) 187 | position = thiz->position; 188 | else 189 | position = 0; 190 | 191 | current = thiz->last_node; 192 | 193 | if (!keep) 194 | ac_trie_reset (thiz); 195 | 196 | /* This is the main search loop. 197 | * It must be kept as lightweight as possible. 198 | */ 199 | while (position < text->length) 200 | { 201 | if (!(next = node_find_next_bs (current, text->astring[position]))) 202 | { 203 | if(current->failure_node /* We are not in the root node */) 204 | current = current->failure_node; 205 | else 206 | position++; 207 | } 208 | else 209 | { 210 | current = next; 211 | position++; 212 | } 213 | 214 | if (current->final && next) 215 | /* We check 'next' to find out if we have come here after a alphabet 216 | * transition or due to a fail transition. in second case we should not 217 | * report match, because it has already been reported */ 218 | { 219 | /* Found a match! */ 220 | match.position = position + thiz->base_position; 221 | match.size = current->matched_size; 222 | match.patterns = current->matched; 223 | 224 | /* Do call-back */ 225 | if (callback(&match, user)) 226 | { 227 | if (thiz->wm == AC_WORKING_MODE_FINDNEXT) { 228 | thiz->position = position; 229 | thiz->last_node = current; 230 | } 231 | return 1; 232 | } 233 | } 234 | } 235 | 236 | /* Save status variables */ 237 | thiz->last_node = current; 238 | thiz->base_position += position; 239 | 240 | return 0; 241 | } 242 | 243 | /** 244 | * @brief sets the input text to be searched by a function call to _findnext() 245 | * 246 | * @param thiz The pointer to the trie 247 | * @param text The text to be searched. The owner of the text is the 248 | * calling program and no local copy is made, so it must be valid until you 249 | * have done with it. 
250 | * @param keep Indicates that if the given text is the sequel of the previous 251 | * one or not; 1: it is, 0: it is not 252 | *****************************************************************************/ 253 | void ac_trie_settext (AC_TRIE_t *thiz, AC_TEXT_t *text, int keep) 254 | { 255 | if (!keep) 256 | ac_trie_reset (thiz); 257 | 258 | thiz->text = text; 259 | thiz->position = 0; 260 | } 261 | 262 | /** 263 | * @brief finds the next match in the input text which is set by _settext() 264 | * 265 | * @param thiz The pointer to the trie 266 | * @return A pointer to the matched structure 267 | *****************************************************************************/ 268 | AC_MATCH_t ac_trie_findnext (AC_TRIE_t *thiz) 269 | { 270 | AC_MATCH_t match; 271 | 272 | thiz->wm = AC_WORKING_MODE_FINDNEXT; 273 | match.size = 0; 274 | 275 | ac_trie_search (thiz, thiz->text, 1, 276 | ac_trie_match_handler, (void *)&match); 277 | 278 | thiz->wm = AC_WORKING_MODE_SEARCH; 279 | 280 | return match; 281 | } 282 | 283 | /** 284 | * @brief Release all allocated memories to the trie 285 | * 286 | * @param thiz pointer to the trie 287 | *****************************************************************************/ 288 | void ac_trie_release (AC_TRIE_t *thiz) 289 | { 290 | /* It must be called with a 0 top-down parameter */ 291 | ac_trie_traverse_action (thiz->root, node_release_vectors, 0); 292 | 293 | mf_repdata_release (&thiz->repdata); 294 | mpool_free(thiz->mp); 295 | free(thiz); 296 | } 297 | 298 | /** 299 | * @brief Prints the trie to output in human readable form. It is useful 300 | * for debugging purpose. 
301 | * 302 | * @param thiz pointer to the trie 303 | *****************************************************************************/ 304 | void ac_trie_display (AC_TRIE_t *thiz) 305 | { 306 | ac_trie_traverse_action (thiz->root, node_display, 1); 307 | } 308 | 309 | /** 310 | * @brief the match handler function used in _findnext function 311 | * 312 | * @param matchp 313 | * @param param 314 | * @return 315 | *****************************************************************************/ 316 | static int ac_trie_match_handler (AC_MATCH_t * matchp, void * param) 317 | { 318 | AC_MATCH_t * mp = (AC_MATCH_t *)param; 319 | mp->position = matchp->position; 320 | mp->patterns = matchp->patterns; 321 | mp->size = matchp->size; 322 | return 1; 323 | } 324 | 325 | /** 326 | * @brief reset the trie and make it ready for doing new search 327 | * 328 | * @param thiz pointer to the trie 329 | *****************************************************************************/ 330 | static void ac_trie_reset (AC_TRIE_t *thiz) 331 | { 332 | thiz->last_node = thiz->root; 333 | thiz->base_position = 0; 334 | mf_repdata_reset (&thiz->repdata); 335 | } 336 | 337 | /** 338 | * @brief Finds and bookmarks the failure transition for the given node. 339 | * 340 | * @param node the node pointer 341 | * @param prefix The array that contain the prefix that leads the path from 342 | * root the the node. 
343 | *****************************************************************************/ 344 | static void ac_trie_set_failure 345 | (ACT_NODE_t *node, AC_ALPHABET_t *prefix) 346 | { 347 | size_t i, j; 348 | ACT_NODE_t *n; 349 | ACT_NODE_t *root = node->trie->root; 350 | 351 | if (node == root) 352 | return; /* Failure transition is not defined for the root */ 353 | 354 | for (i = 1; i < node->depth; i++) 355 | { 356 | n = root; 357 | for (j = i; j < node->depth && n; j++) 358 | n = node_find_next (n, prefix[j]); 359 | if (n) 360 | { 361 | node->failure_node = n; 362 | break; 363 | } 364 | } 365 | 366 | if (!node->failure_node) 367 | node->failure_node = root; 368 | } 369 | 370 | /** 371 | * @brief Sets the failure transition node for all nodes 372 | * 373 | * Traverse all trie nodes using DFS (Depth First Search), meanwhile it set 374 | * the failure node for every node it passes through. this function is called 375 | * after adding last pattern to trie. 376 | * 377 | * @param node The pointer to the root node 378 | * @param prefix The array that contain the prefix that leads the path from 379 | * root the the node 380 | *****************************************************************************/ 381 | static void ac_trie_traverse_setfailure 382 | (ACT_NODE_t *node, AC_ALPHABET_t *prefix) 383 | { 384 | size_t i; 385 | 386 | /* In each node, look for its failure node */ 387 | ac_trie_set_failure (node, prefix); 388 | 389 | for (i = 0; i < node->outgoing_size; i++) 390 | { 391 | prefix[node->depth] = node->outgoing[i].alpha; /* Make the prefix */ 392 | 393 | /* Recursively call itself to traverse all nodes */ 394 | ac_trie_traverse_setfailure (node->outgoing[i].next, prefix); 395 | } 396 | } 397 | 398 | /** 399 | * @brief Traverses the trie using DFS method and applies the 400 | * given @param func on all nodes. At top level it should be called by 401 | * sending the the root node. 
402 | * 403 | * @param node Pointer to trie root node 404 | * @param func The function that must be applied to all nodes 405 | * @param top_down Indicates that if the action should be applied to the note 406 | * itself and then to its children or vise versa. 407 | *****************************************************************************/ 408 | static void ac_trie_traverse_action 409 | (ACT_NODE_t *node, void(*func)(ACT_NODE_t *), int top_down) 410 | { 411 | size_t i; 412 | 413 | if (top_down) 414 | func (node); 415 | 416 | for (i = 0; i < node->outgoing_size; i++) 417 | /* Recursively call itself to traverse all nodes */ 418 | ac_trie_traverse_action (node->outgoing[i].next, func, top_down); 419 | 420 | if (!top_down) 421 | func (node); 422 | } 423 | -------------------------------------------------------------------------------- /ahocorasick/node.c: -------------------------------------------------------------------------------- 1 | /* 2 | * node.c: Implements the A.C. Trie node 3 | * This file is part of multifast. 4 | * 5 | Copyright 2010-2015 Kamiar Kanani 6 | 7 | multifast is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU Lesser General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version. 11 | 12 | multifast is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU Lesser General Public License for more details. 16 | 17 | You should have received a copy of the GNU Lesser General Public License 18 | along with multifast. If not, see . 
19 | */ 20 | 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include "node.h" 26 | #include "mpool.h" 27 | #include "ahocorasick.h" 28 | 29 | /* Privates */ 30 | static void node_init (ACT_NODE_t *thiz); 31 | static int node_edge_compare (const void *l, const void *r); 32 | static int node_has_pattern (ACT_NODE_t *thiz, AC_PATTERN_t *patt); 33 | static void node_grow_outgoing_vector (ACT_NODE_t *thiz); 34 | static void node_grow_matched_vector (ACT_NODE_t *thiz); 35 | static void node_copy_pattern (ACT_NODE_t *thiz, 36 | AC_PATTERN_t *to, AC_PATTERN_t *from); 37 | 38 | /** 39 | * @brief Creates the node 40 | * 41 | * @return 42 | ******************************************************************************/ 43 | struct act_node * node_create (struct ac_trie *trie) 44 | { 45 | ACT_NODE_t *node; 46 | 47 | node = (ACT_NODE_t *) mpool_malloc (trie->mp, sizeof(ACT_NODE_t)); 48 | node_init (node); 49 | node->trie = trie; 50 | 51 | return node; 52 | } 53 | 54 | /** 55 | * @brief Initializes the node 56 | * 57 | * @param thiz 58 | *****************************************************************************/ 59 | static void node_init (ACT_NODE_t *thiz) 60 | { 61 | node_assign_id (thiz); 62 | 63 | thiz->final = 0; 64 | thiz->failure_node = NULL; 65 | thiz->depth = 0; 66 | 67 | thiz->matched = NULL; 68 | thiz->matched_capacity = 0; 69 | thiz->matched_size = 0; 70 | 71 | thiz->outgoing = NULL; 72 | thiz->outgoing_capacity = 0; 73 | thiz->outgoing_size = 0; 74 | 75 | thiz->to_be_replaced = NULL; 76 | } 77 | 78 | /** 79 | * @brief Releases the node memories 80 | * 81 | * @param thiz 82 | *****************************************************************************/ 83 | void node_release_vectors(ACT_NODE_t *nod) 84 | { 85 | free(nod->matched); 86 | free(nod->outgoing); 87 | } 88 | 89 | /** 90 | * @brief Finds out the next node for a given alpha. this function is used in 91 | * the pre-processing stage in which edge array is not sorted. 
so it uses 92 | * linear search. 93 | * 94 | * @param thiz 95 | * @param alpha 96 | * @return 97 | *****************************************************************************/ 98 | ACT_NODE_t * node_find_next(ACT_NODE_t *nod, AC_ALPHABET_t alpha) 99 | { 100 | size_t i; 101 | 102 | for (i=0; i < nod->outgoing_size; i++) 103 | { 104 | if(nod->outgoing[i].alpha == alpha) 105 | return (nod->outgoing[i].next); 106 | } 107 | return NULL; 108 | } 109 | 110 | /** 111 | * @brief Finds out the next node for a given alpha. this function is used 112 | * after the pre-processing stage in which we sort edges. so it uses Binary 113 | * Search. 114 | * 115 | * @param thiz 116 | * @param alpha 117 | * @return 118 | *****************************************************************************/ 119 | ACT_NODE_t *node_find_next_bs (ACT_NODE_t *nod, AC_ALPHABET_t alpha) 120 | { 121 | size_t mid; 122 | int min, max; 123 | AC_ALPHABET_t amid; 124 | 125 | min = 0; 126 | max = nod->outgoing_size - 1; 127 | 128 | while (min <= max) 129 | { 130 | mid = (min + max) >> 1; 131 | amid = nod->outgoing[mid].alpha; 132 | if (alpha > amid) 133 | min = mid + 1; 134 | else if (alpha < amid) 135 | max = mid - 1; 136 | else 137 | return (nod->outgoing[mid].next); 138 | } 139 | return NULL; 140 | } 141 | 142 | /** 143 | * @brief Determines if a final node contains a pattern in its accepted pattern 144 | * list or not. 145 | * 146 | * @param thiz 147 | * @param newstr 148 | * @return 1: has the pattern, 0: doesn't have it 149 | *****************************************************************************/ 150 | static int node_has_pattern (ACT_NODE_t *thiz, AC_PATTERN_t *patt) 151 | { 152 | size_t i, j; 153 | AC_TEXT_t *txt; 154 | AC_TEXT_t *new_txt = &patt->ptext; 155 | 156 | for (i = 0; i < thiz->matched_size; i++) 157 | { 158 | txt = &thiz->matched[i].ptext; 159 | 160 | if (txt->length != new_txt->length) 161 | continue; 162 | 163 | /* The following loop is futile! 
Because the input pattern always come 164 | * from a failure node, and if they have the same length, then they are 165 | * equal. But for the sake of functional integrity we leave it here. */ 166 | 167 | for (j = 0; j < txt->length; j++) 168 | if (txt->astring[j] != new_txt->astring[j]) 169 | break; 170 | 171 | if (j == txt->length) 172 | return 1; 173 | } 174 | return 0; 175 | } 176 | 177 | /** 178 | * @brief Create the next node for the given alpha. 179 | * 180 | * @param thiz 181 | * @param alpha 182 | * @return 183 | *****************************************************************************/ 184 | ACT_NODE_t *node_create_next (ACT_NODE_t *nod, AC_ALPHABET_t alpha) 185 | { 186 | ACT_NODE_t *next; 187 | 188 | if (node_find_next (nod, alpha) != NULL) 189 | /* The edge already exists */ 190 | return NULL; 191 | 192 | next = node_create (nod->trie); 193 | node_add_edge (nod, next, alpha); 194 | 195 | return next; 196 | } 197 | 198 | /** 199 | * @brief Adds the pattern to the list of accepted pattern. 
200 | * 201 | * @param thiz 202 | * @param str 203 | * @param copy 204 | *****************************************************************************/ 205 | void node_accept_pattern (ACT_NODE_t *nod, AC_PATTERN_t *new_patt, int copy) 206 | { 207 | AC_PATTERN_t *patt; 208 | 209 | /* Check if the new pattern already exists in the node list */ 210 | if (node_has_pattern(nod, new_patt)) 211 | return; 212 | 213 | /* Manage memory */ 214 | if (nod->matched_size == nod->matched_capacity) 215 | node_grow_matched_vector (nod); 216 | 217 | patt = &nod->matched[nod->matched_size++]; 218 | 219 | if (copy) 220 | { 221 | /* Deep copy */ 222 | node_copy_pattern (nod, patt, new_patt); 223 | } 224 | else 225 | { 226 | /* Shallow copy */ 227 | *patt = *new_patt; 228 | } 229 | } 230 | 231 | /** 232 | * @brief Makes a deep copy of the pattern 233 | * 234 | * @param thiz pointer to the owner node 235 | * @param from 236 | * @param to 237 | *****************************************************************************/ 238 | static void node_copy_pattern 239 | (ACT_NODE_t *thiz, AC_PATTERN_t *to, AC_PATTERN_t *from) 240 | { 241 | struct mpool *mp = thiz->trie->mp; 242 | 243 | to->ptext.astring = (AC_ALPHABET_t *) mpool_strndup (mp, 244 | (const char *) from->ptext.astring, 245 | from->ptext.length * sizeof(AC_ALPHABET_t)); 246 | to->ptext.length = from->ptext.length; 247 | 248 | to->rtext.astring = (AC_ALPHABET_t *) mpool_strndup (mp, 249 | (const char *) from->rtext.astring, 250 | from->rtext.length * sizeof(AC_ALPHABET_t)); 251 | to->rtext.length = from->rtext.length; 252 | 253 | if (from->id.type == AC_PATTID_TYPE_STRING) 254 | to->id.u.stringy = (const char *) mpool_strdup (mp, 255 | (const char *) from->id.u.stringy); 256 | else 257 | to->id.u.number = from->id.u.number; 258 | 259 | to->id.type = from->id.type; 260 | } 261 | 262 | /** 263 | * @brief Establish an edge between two nodes 264 | * 265 | * @param thiz 266 | * @param next 267 | * @param alpha 268 | 
*****************************************************************************/ 269 | void node_add_edge (ACT_NODE_t *nod, ACT_NODE_t *next, AC_ALPHABET_t alpha) 270 | { 271 | struct act_edge *oe; /* Outgoing edge */ 272 | 273 | if(nod->outgoing_size == nod->outgoing_capacity) 274 | node_grow_outgoing_vector (nod); 275 | 276 | oe = &nod->outgoing[nod->outgoing_size]; 277 | oe->alpha = alpha; 278 | oe->next = next; 279 | nod->outgoing_size++; 280 | } 281 | 282 | /** 283 | * @brief Assigns a unique ID to the node (used for debugging purpose) 284 | * 285 | * @param thiz 286 | *****************************************************************************/ 287 | void node_assign_id (ACT_NODE_t *nod) 288 | { 289 | static int unique_id = 1; 290 | nod->id = unique_id++; 291 | } 292 | 293 | /** 294 | * @brief Comparison function for qsort. see man qsort. 295 | * 296 | * @param l left side 297 | * @param r right side 298 | * @return According to the man page: The comparison function must return an 299 | * integer less than, equal to, or greater than zero if the first argument is 300 | * considered to be respectively less than, equal to, or greater than the 301 | * second. if two members compare as equal, their order in the sorted array is 302 | * undefined. 303 | *****************************************************************************/ 304 | static int node_edge_compare (const void *l, const void *r) 305 | { 306 | /* 307 | * NOTE: Because edge alphabets are unique in every node we ignore 308 | * equivalence case. 309 | */ 310 | if (((struct act_edge *)l)->alpha >= ((struct act_edge *)r)->alpha) 311 | return 1; 312 | else 313 | return -1; 314 | } 315 | 316 | /** 317 | * @brief Sorts edges alphabets. 
318 | * 319 | * @param thiz 320 | *****************************************************************************/ 321 | void node_sort_edges (ACT_NODE_t *nod) 322 | { 323 | qsort ((void *)nod->outgoing, nod->outgoing_size, 324 | sizeof(struct act_edge), node_edge_compare); 325 | } 326 | 327 | /** 328 | * @brief Bookmarks the to-be-replaced patterns 329 | * 330 | * If there was more than one pattern accepted in a node then only one of them 331 | * must be replaced: The longest pattern that has a requested replacement. 332 | * 333 | * @param node 334 | * @return 1 if there was any replacement, 0 otherwise 335 | *****************************************************************************/ 336 | int node_book_replacement (ACT_NODE_t *nod) 337 | { 338 | size_t j; 339 | AC_PATTERN_t *pattern; 340 | AC_PATTERN_t *longest = NULL; 341 | 342 | if(!nod->final) 343 | return 0; 344 | 345 | for (j=0; j < nod->matched_size; j++) 346 | { 347 | pattern = &nod->matched[j]; 348 | 349 | if (pattern->rtext.astring != NULL) 350 | { 351 | if (!longest) 352 | longest = pattern; 353 | else if (pattern->ptext.length > longest->ptext.length) 354 | longest = pattern; 355 | } 356 | } 357 | 358 | nod->to_be_replaced = longest; 359 | 360 | return longest ? 1 : 0; 361 | } 362 | 363 | /** 364 | * @brief Grows the size of outgoing edges vector 365 | * 366 | * @param thiz 367 | *****************************************************************************/ 368 | static void node_grow_outgoing_vector (ACT_NODE_t *thiz) 369 | { 370 | const size_t grow_factor = (8 / (thiz->depth + 1)) + 1; 371 | 372 | /* The outgoing edges of nodes grow with different pace in different 373 | * depths; the shallower nodes the bigger outgoing number of nodes. 374 | * So for efficiency (speed & memory usage), we apply a measure to 375 | * manage different growth rate. 
376 | */ 377 | 378 | if (thiz->outgoing_capacity == 0) 379 | { 380 | thiz->outgoing_capacity = grow_factor; 381 | thiz->outgoing = (struct act_edge *) malloc 382 | (thiz->outgoing_capacity * sizeof(struct act_edge)); 383 | } 384 | else 385 | { 386 | thiz->outgoing_capacity += grow_factor; 387 | thiz->outgoing = (struct act_edge *) realloc ( 388 | thiz->outgoing, 389 | thiz->outgoing_capacity * sizeof(struct act_edge)); 390 | } 391 | } 392 | 393 | /** 394 | * @brief Grows the size of matched patterns vector 395 | * 396 | * @param thiz 397 | *****************************************************************************/ 398 | static void node_grow_matched_vector (ACT_NODE_t *thiz) 399 | { 400 | if (thiz->matched_capacity == 0) 401 | { 402 | thiz->matched_capacity = 1; 403 | thiz->matched = (AC_PATTERN_t *) malloc 404 | (thiz->matched_capacity * sizeof(AC_PATTERN_t)); 405 | } 406 | else 407 | { 408 | thiz->matched_capacity += 2; 409 | thiz->matched = (AC_PATTERN_t *) realloc ( 410 | thiz->matched, 411 | thiz->matched_capacity * sizeof(AC_PATTERN_t)); 412 | } 413 | } 414 | 415 | /** 416 | * @brief Collect accepted patterns of the node. 417 | * 418 | * The accepted patterns consist of the node's own accepted pattern plus 419 | * accepted patterns of its failure node. 420 | * 421 | * @param node 422 | *****************************************************************************/ 423 | void node_collect_matches (ACT_NODE_t *nod) 424 | { 425 | size_t i; 426 | ACT_NODE_t *n = nod; 427 | 428 | while ((n = n->failure_node)) 429 | { 430 | for (i = 0; i < n->matched_size; i++) 431 | /* Always call with copy parameter 0 */ 432 | node_accept_pattern (nod, &(n->matched[i]), 0); 433 | 434 | if (n->final) 435 | nod->final = 1; 436 | } 437 | 438 | node_sort_edges (nod); 439 | /* Sort matched patterns? Is that necessary? I don't think so. 
*/ 440 | } 441 | 442 | /** 443 | * @brief Displays all nodes recursively 444 | * 445 | * @param n 446 | * @param repcast 447 | *****************************************************************************/ 448 | void node_display (ACT_NODE_t *nod) 449 | { 450 | size_t j; 451 | struct act_edge *e; 452 | AC_PATTERN_t patt; 453 | 454 | printf("NODE(%3d)/....fail....> ", nod->id); 455 | if (nod->failure_node) 456 | printf("NODE(%3d)\n", nod->failure_node->id); 457 | else 458 | printf ("N.A.\n"); 459 | 460 | for (j = 0; j < nod->outgoing_size; j++) 461 | { 462 | e = &nod->outgoing[j]; 463 | printf(" |----("); 464 | if(isgraph(e->alpha)) 465 | printf("%c)---", e->alpha); 466 | else 467 | printf("0x%x)", e->alpha); 468 | printf("--> NODE(%3d)\n", e->next->id); 469 | } 470 | 471 | if (nod->matched_size) 472 | { 473 | printf("Accepts: {"); 474 | for (j = 0; j < nod->matched_size; j++) 475 | { 476 | patt = nod->matched[j]; 477 | if(j) 478 | printf(", "); 479 | switch (patt.id.type) 480 | { 481 | case AC_PATTID_TYPE_DEFAULT: 482 | case AC_PATTID_TYPE_NUMBER: 483 | printf("%ld", patt.id.u.number); 484 | break; 485 | case AC_PATTID_TYPE_STRING: 486 | printf("%s", patt.id.u.stringy); 487 | break; 488 | } 489 | printf(": %.*s", (int)patt.ptext.length, patt.ptext.astring); 490 | } 491 | printf("}\n"); 492 | } 493 | printf("\n"); 494 | } 495 | -------------------------------------------------------------------------------- /akm.c: -------------------------------------------------------------------------------- 1 | /* 2 | +----------------------------------------------------------------------+ 3 | | PHP Version 7 | 4 | +----------------------------------------------------------------------+ 5 | | Copyright (c) 1997-2016 The PHP Group | 6 | +----------------------------------------------------------------------+ 7 | | This source file is subject to version 3.01 of the PHP license, | 8 | | that is bundled with this package in the file LICENSE, and is | 9 | | available through the 
world-wide-web at the following url: | 10 | | http://www.php.net/license/3_01.txt | 11 | | If you did not receive a copy of the PHP license and are unable to | 12 | | obtain it through the world-wide-web, please send a note to | 13 | | license@php.net so we can mail you a copy immediately. | 14 | +----------------------------------------------------------------------+ 15 | | Author: maben | 16 | +----------------------------------------------------------------------+ 17 | */ 18 | 19 | /* $Id$ */ 20 | 21 | #ifdef HAVE_CONFIG_H 22 | #include "config.h" 23 | #endif 24 | 25 | #include "php.h" 26 | #include "php_ini.h" 27 | #include "ext/standard/info.h" 28 | #include "php_akm.h" 29 | #include "zend_smart_str.h" 30 | 31 | #include 32 | #include 33 | 34 | /* If you declare any globals in php_akm.h uncomment this: 35 | ZEND_DECLARE_MODULE_GLOBALS(akm) 36 | */ 37 | 38 | /* True global resources - no need for thread safety here */ 39 | static int le_akm; 40 | 41 | static int akm_enable = 0; 42 | static char *akm_dict_dir = NULL; 43 | 44 | static HashTable *akm_dict_ht = NULL; 45 | 46 | #define DELIMITER '|' 47 | #define MAX_KEYWORD_LENGTH 256 48 | 49 | /** 50 | * {{{ akm_dict_ht 51 | */ 52 | 53 | static inline void akm_build_node(akm_trie_t *trie, char *keyword, 54 | size_t keyword_len, char *extension) 55 | { 56 | akm_pattern_t patt; 57 | 58 | /* Fill the pattern data */ 59 | patt.ptext.astring = keyword; 60 | patt.ptext.length = keyword_len; 61 | 62 | /* The replacement pattern is not applicable in this program, so better 63 | * to initialize it with 0 */ 64 | patt.rtext.astring = NULL; 65 | patt.rtext.length = 0; 66 | 67 | patt.id.u.stringy = extension; 68 | patt.id.type = AKM_PATTID_TYPE_STRING; 69 | 70 | /* Add pattern to automata */ 71 | akm_trie_add (trie, &patt, 1); 72 | } 73 | 74 | static void akm_build_tree(char *filename, char *fullpath) 75 | { 76 | FILE *fp; 77 | size_t ll = 0, i; 78 | 79 | char *keyword, 80 | *extension; 81 | size_t keyword_len; 82 | struct 
stat st; 83 | char *line = emalloc(MAX_KEYWORD_LENGTH); 84 | 85 | stat(fullpath, &st); 86 | if (st.st_size == 0) { 87 | return; 88 | } 89 | 90 | /* add to HashTable */ 91 | akm_trie_t *trie = akm_trie_create (); 92 | zval ztrie; 93 | ZVAL_PTR(&ztrie, trie); 94 | 95 | zend_hash_add(akm_dict_ht, 96 | zend_string_init(filename, strlen(filename), 1), 97 | &ztrie); 98 | 99 | fp = fopen(fullpath, "r"); 100 | if (!fp) { 101 | php_error_docref(NULL, E_ERROR, "Cannot open dict file %s, errno:%d", fullpath, errno); 102 | return; 103 | } 104 | 105 | while (NULL != fgets(line, MAX_KEYWORD_LENGTH, fp)) { 106 | ll = strlen(line); 107 | /* remove \r\n */ 108 | if (ll > 0 && line[ll - 1] == '\n') { 109 | line[ll - 1] = '\0'; 110 | } 111 | 112 | if (ll > 1 && line[ll - 2] == '\r') { 113 | line[ll - 2] = '\0'; 114 | } 115 | 116 | // recheck 117 | ll = strlen(line); 118 | if (ll == 0 || line[0] == DELIMITER) { 119 | continue; 120 | } 121 | 122 | /* find delimiter */ 123 | keyword = line; 124 | keyword_len = 0; 125 | extension = NULL; 126 | for (i = 0; i < ll; i++) { 127 | if (line[i] == DELIMITER) { 128 | keyword_len = i; 129 | break; 130 | } 131 | } 132 | 133 | if (keyword_len == 0) { /* not found */ 134 | keyword_len = ll; 135 | } else { 136 | if (keyword_len + 1 == ll) { /* example: "keyword|" */ 137 | keyword_len = ll - 1; 138 | } else { 139 | extension = keyword + keyword_len + 1; 140 | } 141 | } 142 | 143 | akm_build_node(trie, keyword, keyword_len, extension); 144 | } 145 | fclose(fp); 146 | efree(line); 147 | akm_trie_finalize (trie); 148 | } 149 | 150 | static int akm_scan_directory(char *dirname, 151 | void (*callback)(char *filename, char *fullpath)) 152 | { 153 | int success = 0; 154 | char fullpath[PATH_MAX] = { 0 }; 155 | struct dirent *ent = NULL; 156 | 157 | DIR *dir = opendir(dirname); 158 | if (dir == NULL) { 159 | return -1; 160 | } 161 | 162 | while (NULL != (ent = readdir(dir))) { 163 | if (ent->d_type == DT_REG) { 164 | sprintf(fullpath, "%s%s", dirname, 
ent->d_name); 165 | callback(ent->d_name, fullpath); 166 | success++; 167 | } 168 | } 169 | closedir(dir); 170 | return success; 171 | } 172 | 173 | static int akm_dict_ht_init() 174 | { 175 | akm_dict_ht = pemalloc(sizeof(HashTable), 1); 176 | if (akm_dict_ht == NULL) { 177 | php_error_docref(NULL, E_ERROR, "Cannot alloc memory"); 178 | return -1; 179 | } 180 | 181 | zend_hash_init(akm_dict_ht, 0, NULL, ZVAL_PTR_DTOR, 1); 182 | if (access(akm_dict_dir, R_OK) < 0) { 183 | php_error_docref(NULL, E_ERROR, "Cannot access directory %s", akm_dict_dir); 184 | return -1; 185 | } 186 | if (akm_scan_directory(akm_dict_dir, akm_build_tree) < 0) { 187 | php_error_docref(NULL, E_ERROR, "Cannot open directory %s", akm_dict_dir); 188 | return -1; 189 | } 190 | return 0; 191 | } 192 | 193 | static void akm_dict_ht_free() 194 | { 195 | if (akm_dict_ht) { 196 | zend_string *key; 197 | zval *value; 198 | zend_ulong idx; 199 | akm_trie_t *trie; 200 | 201 | ZEND_HASH_FOREACH_KEY_VAL(akm_dict_ht, idx, key, value) { 202 | 203 | zend_string_free(key); 204 | trie = Z_PTR_P(value); 205 | akm_trie_release (trie); 206 | 207 | } ZEND_HASH_FOREACH_END(); 208 | 209 | pefree(akm_dict_ht, 1); 210 | } 211 | } 212 | 213 | static akm_trie_t *akm_get_trie(zend_string *key) 214 | { 215 | zval *trie = zend_hash_find(akm_dict_ht, key); 216 | if (trie) { 217 | return Z_PTR_P(trie); 218 | } 219 | return NULL; 220 | } 221 | 222 | /* }}} */ 223 | 224 | /* {{{ PHP_INI 225 | */ 226 | 227 | ZEND_INI_MH(php_akm_enable) 228 | { 229 | if (!new_value || new_value->len == 0) { 230 | return FAILURE; 231 | } 232 | 233 | if (!strcasecmp(new_value->val, "on") || !strcmp(new_value->val, "1")) { 234 | akm_enable = 1; 235 | } else { 236 | akm_enable = 0; 237 | } 238 | 239 | return SUCCESS; 240 | } 241 | 242 | ZEND_INI_MH(php_akm_dict_dir) 243 | { 244 | if (!new_value || new_value->len == 0) { 245 | return FAILURE; 246 | } 247 | if (new_value->val[new_value->len] != '/') { 248 | akm_dict_dir = pemalloc(new_value->len + 2, 
1); 249 | strcpy(akm_dict_dir, new_value->val); 250 | akm_dict_dir[new_value->len] = '/'; 251 | akm_dict_dir[new_value->len + 1] = '\0'; 252 | } else { 253 | akm_dict_dir = strdup(new_value->val); 254 | } 255 | if (akm_dict_dir == NULL) { 256 | return FAILURE; 257 | } 258 | return SUCCESS; 259 | } 260 | 261 | PHP_INI_BEGIN() 262 | PHP_INI_ENTRY("akm.enable", "0", PHP_INI_ALL, php_akm_enable) 263 | PHP_INI_ENTRY("akm.dict_dir", "", PHP_INI_ALL, php_akm_dict_dir) 264 | PHP_INI_END() 265 | /* }}} */ 266 | 267 | struct _akm_replace_params { 268 | zval *return_value; 269 | zend_fcall_info *fci; 270 | zend_fcall_info_cache *fci_cache; 271 | HashTable *ht; 272 | }; 273 | 274 | static void akm_trie_traversal(akm_trie_t *trie, 275 | void(*callback)(zend_string *, zend_ulong, zend_string *, void *), void *args) 276 | { 277 | akm_match_t m; 278 | unsigned int j; 279 | akm_pattern_t *pp; 280 | zend_string *keyword, 281 | *extension; 282 | zend_ulong offset; 283 | while ((m = akm_trie_findnext(trie)).size) { 284 | for (j = 0; j < m.size; j++) { 285 | pp = &m.patterns[j]; 286 | keyword = zend_string_init(pp->ptext.astring, pp->ptext.length, 0); 287 | extension = pp->id.u.stringy == NULL ? 
NULL : 288 | zend_string_init(pp->id.u.stringy, strlen(pp->id.u.stringy), 0); 289 | offset = m.position; 290 | callback(keyword, offset, extension, args); 291 | } 292 | } 293 | } 294 | 295 | 296 | static void akm_match_handler(zend_string *keyword, zend_ulong offset, zend_string *extension, void *args) 297 | { 298 | zval *return_value = args; 299 | zval entry, 300 | zkeyword, 301 | zoffset, 302 | zextension; 303 | 304 | zend_ulong hash_size; 305 | 306 | array_init_size(&entry, 3); 307 | 308 | ZVAL_NEW_STR(&zkeyword, keyword); 309 | ZVAL_LONG(&zoffset, offset); 310 | if (extension == NULL) { 311 | ZVAL_NULL(&zextension); 312 | } else { 313 | ZVAL_NEW_STR(&zextension, extension); 314 | } 315 | 316 | zend_hash_str_add(Z_ARRVAL_P(&entry), "keyword", sizeof("keyword") - 1, &zkeyword); 317 | zend_hash_str_add(Z_ARRVAL_P(&entry), "offset", sizeof("offset") - 1, &zoffset); 318 | zend_hash_str_add(Z_ARRVAL_P(&entry), "extension", sizeof("extension") - 1, &zextension); 319 | zend_hash_index_add(Z_ARRVAL_P(return_value), zend_array_count(Z_ARRVAL_P(return_value)), &entry); 320 | } 321 | 322 | static void akm_replace_handler(zend_string *keyword, zend_ulong offset, zend_string *extension, void *args) 323 | { 324 | struct _akm_replace_params *params = (struct _akm_replace_params *)args; 325 | zval cb_args[3]; 326 | zval retval; 327 | zval entry; 328 | 329 | ZVAL_NEW_STR(&cb_args[0], keyword); 330 | ZVAL_LONG(&cb_args[1], offset); 331 | if (extension == NULL) { 332 | ZVAL_NULL(&cb_args[2]); 333 | } else { 334 | ZVAL_NEW_STR(&cb_args[2], extension); 335 | } 336 | 337 | params->fci->params = cb_args; 338 | params->fci->retval = &retval; 339 | 340 | array_init_size(&entry, 3); 341 | 342 | if (zend_call_function(params->fci, params->fci_cache) == SUCCESS) { 343 | if (Z_TYPE(retval) == IS_STRING) { 344 | zend_hash_str_add(Z_ARRVAL_P(&entry), "keyword", sizeof("keyword") - 1, &cb_args[0]); 345 | zend_hash_str_add(Z_ARRVAL_P(&entry), "offset", sizeof("offset") - 1, &cb_args[1]); 346 | 
zend_hash_str_add(Z_ARRVAL_P(&entry), "replace", sizeof("replace") - 1, &retval); 347 | zend_hash_index_add(params->ht, zend_array_count(params->ht), &entry); 348 | } 349 | } 350 | } 351 | 352 | /* {{{ php function */ 353 | 354 | PHP_FUNCTION(akm_match) 355 | { 356 | zend_string *dict, 357 | *text; 358 | 359 | akm_text_t chunk; 360 | 361 | if (zend_parse_parameters(ZEND_NUM_ARGS(), "SS", &dict, &text) == FAILURE) { 362 | return; 363 | } 364 | 365 | akm_trie_t *trie = akm_get_trie(dict); 366 | if (trie == NULL) { 367 | php_error_docref(NULL, E_WARNING, "Dict name #%s is not found", ZSTR_VAL(dict)); 368 | RETURN_FALSE; 369 | } 370 | 371 | array_init(return_value); 372 | 373 | chunk.astring = ZSTR_VAL(text); 374 | chunk.length = ZSTR_LEN(text); 375 | 376 | akm_trie_settext (trie, &chunk, 0); 377 | akm_trie_traversal(trie, akm_match_handler, (void *)return_value); 378 | } 379 | 380 | PHP_FUNCTION(akm_replace) 381 | { 382 | zval *text; 383 | char *text_c; 384 | size_t text_l; 385 | zend_string *dict; 386 | 387 | zend_ulong replace_count = 0; 388 | 389 | HashTable *ht; 390 | 391 | ALLOC_HASHTABLE(ht); 392 | zend_hash_init(ht, 0, NULL, ZVAL_PTR_DTOR, 0); 393 | 394 | zend_fcall_info fci = empty_fcall_info; 395 | zend_fcall_info_cache fci_cache = empty_fcall_info_cache; 396 | 397 | akm_text_t chunk; 398 | akm_match_t match; 399 | 400 | zend_ulong idx = 0; 401 | 402 | if (zend_parse_parameters(ZEND_NUM_ARGS(), "Sz/f", &dict, 403 | &text, &fci, &fci_cache) == FAILURE) { 404 | return; 405 | } 406 | 407 | akm_trie_t *trie = akm_get_trie(dict); 408 | if (trie == NULL) { 409 | php_error_docref(NULL, E_WARNING, "Dict name #%s is not found", ZSTR_VAL(dict)); 410 | RETURN_FALSE; 411 | } 412 | 413 | text_c = ZSTR_VAL(Z_STR_P(text)); 414 | text_l = ZSTR_LEN(Z_STR_P(text)); 415 | 416 | fci.no_separation = 0; 417 | fci.param_count = 3; 418 | 419 | chunk.astring = text_c; 420 | chunk.length = text_l; 421 | 422 | akm_trie_settext (trie, &chunk, 0); 423 | 424 | struct _akm_replace_params 
params; 425 | params.return_value = return_value; 426 | params.fci = &fci; 427 | params.fci_cache = &fci_cache; 428 | params.ht = ht; 429 | 430 | akm_trie_traversal(trie, akm_replace_handler, (void *)¶ms); 431 | 432 | if (zend_array_count(ht) == 0) goto finally; 433 | 434 | smart_str replaced = { 0 }; 435 | zend_ulong copied_idx = 0, last_copy_len = 0; 436 | int copy_len= 0; 437 | 438 | zval *entry, 439 | *keyword, 440 | *offset, 441 | *replace; 442 | 443 | ZEND_HASH_FOREACH_NUM_KEY_VAL(ht, idx, entry) { 444 | keyword = zend_hash_str_find(Z_ARRVAL_P(entry), "keyword", sizeof("keyword") - 1); 445 | offset = zend_hash_str_find(Z_ARRVAL_P(entry), "offset", sizeof("offset") - 1); 446 | replace = zend_hash_str_find(Z_ARRVAL_P(entry), "replace", sizeof("replace") - 1); 447 | 448 | copy_len = Z_LVAL_P(offset) - copied_idx - Z_STRLEN_P(keyword); 449 | 450 | if (copy_len <= 0 && idx != 0) { /* cover previous keyword */ 451 | replaced.s->len -= last_copy_len; 452 | copied_idx -= last_copy_len; 453 | copy_len = 0; 454 | replace_count--; 455 | } 456 | 457 | if (copy_len) 458 | smart_str_appendl(&replaced, text_c + copied_idx, copy_len); 459 | 460 | smart_str_appendl(&replaced, Z_STRVAL_P(replace), Z_STRLEN_P(replace)); 461 | last_copy_len = Z_STRLEN_P(replace); 462 | replace_count++; 463 | 464 | copied_idx = Z_LVAL_P(offset); 465 | 466 | zval_ptr_dtor(keyword); 467 | zval_ptr_dtor(offset); 468 | zval_ptr_dtor(replace); 469 | } ZEND_HASH_FOREACH_END(); 470 | 471 | if (copied_idx < text_l) { 472 | smart_str_appendl(&replaced, text_c + copied_idx, text_l - copied_idx); 473 | } 474 | smart_str_0(&replaced); 475 | 476 | /* replace */ 477 | zval_ptr_dtor(text); 478 | ZVAL_NEW_STR(text, replaced.s); 479 | 480 | finally: 481 | 482 | FREE_HASHTABLE(ht); 483 | RETURN_LONG(replace_count); 484 | } 485 | 486 | PHP_FUNCTION(akm_get_dict_list) 487 | { 488 | array_init(return_value); 489 | 490 | zval dict; 491 | zend_string *key; 492 | zval *value; 493 | zend_ulong idx; 494 | akm_trie_t 
*trie; 495 | 496 | ZEND_HASH_FOREACH_KEY_VAL(akm_dict_ht, idx, key, value) { 497 | ZVAL_NEW_STR(&dict, key); 498 | zend_hash_index_add(Z_ARRVAL_P(return_value), zend_array_count(Z_ARRVAL_P(return_value)), &dict); 499 | } ZEND_HASH_FOREACH_END(); 500 | } 501 | 502 | 503 | /* }}} */ 504 | 505 | 506 | /* {{{ php_akm_init_globals 507 | */ 508 | /* Uncomment this function if you have INI entries 509 | static void php_akm_init_globals(zend_akm_globals *akm_globals) 510 | { 511 | akm_globals->global_value = 0; 512 | akm_globals->global_string = NULL; 513 | } 514 | */ 515 | /* }}} */ 516 | 517 | /* {{{ PHP_MINIT_FUNCTION 518 | */ 519 | PHP_MINIT_FUNCTION(akm) 520 | { 521 | REGISTER_INI_ENTRIES(); 522 | 523 | if (!akm_enable) { 524 | return SUCCESS; 525 | } 526 | 527 | if (akm_dict_ht_init() < 0) { 528 | return FAILURE; 529 | } 530 | 531 | return SUCCESS; 532 | } 533 | /* }}} */ 534 | 535 | /* {{{ PHP_MSHUTDOWN_FUNCTION 536 | */ 537 | PHP_MSHUTDOWN_FUNCTION(akm) 538 | { 539 | UNREGISTER_INI_ENTRIES(); 540 | 541 | akm_dict_ht_free(); 542 | 543 | return SUCCESS; 544 | } 545 | /* }}} */ 546 | 547 | /* Remove if there's nothing to do at request start */ 548 | /* {{{ PHP_RINIT_FUNCTION 549 | */ 550 | PHP_RINIT_FUNCTION(akm) 551 | { 552 | #if defined(COMPILE_DL_AKM) && defined(ZTS) 553 | ZEND_TSRMLS_CACHE_UPDATE(); 554 | #endif 555 | return SUCCESS; 556 | } 557 | /* }}} */ 558 | 559 | /* Remove if there's nothing to do at request end */ 560 | /* {{{ PHP_RSHUTDOWN_FUNCTION 561 | */ 562 | PHP_RSHUTDOWN_FUNCTION(akm) 563 | { 564 | return SUCCESS; 565 | } 566 | /* }}} */ 567 | 568 | /* {{{ PHP_MINFO_FUNCTION 569 | */ 570 | PHP_MINFO_FUNCTION(akm) 571 | { 572 | php_info_print_table_start(); 573 | php_info_print_table_header(2, "akm support", "enabled"); 574 | php_info_print_table_end(); 575 | 576 | DISPLAY_INI_ENTRIES(); 577 | } 578 | /* }}} */ 579 | 580 | ZEND_BEGIN_ARG_INFO_EX(arginfo_akm_match, 0, 0, 2) 581 | ZEND_ARG_INFO(0, dict_name) 582 | ZEND_ARG_INFO(0, text) 583 | 
ZEND_END_ARG_INFO() 584 | 585 | ZEND_BEGIN_ARG_INFO_EX(arginfo_akm_replace, 0, 0, 3) 586 | ZEND_ARG_INFO(0, dict_name) 587 | ZEND_ARG_INFO(1, text) 588 | ZEND_ARG_INFO(0, callback) 589 | ZEND_END_ARG_INFO() 590 | 591 | /* {{{ akm_functions[] 592 | * 593 | * Every user visible function must have an entry in akm_functions[]. 594 | */ 595 | const zend_function_entry akm_functions[] = { 596 | PHP_FE(akm_get_dict_list, NULL) 597 | PHP_FE(akm_match, arginfo_akm_match) 598 | PHP_FE(akm_replace, arginfo_akm_replace) 599 | PHP_FE_END /* Must be the last line in akm_functions[] */ 600 | }; 601 | /* }}} */ 602 | 603 | /* {{{ akm_module_entry 604 | */ 605 | zend_module_entry akm_module_entry = { 606 | STANDARD_MODULE_HEADER, 607 | "akm", 608 | akm_functions, 609 | PHP_MINIT(akm), 610 | PHP_MSHUTDOWN(akm), 611 | PHP_RINIT(akm), /* Replace with NULL if there's nothing to do at request start */ 612 | PHP_RSHUTDOWN(akm), /* Replace with NULL if there's nothing to do at request end */ 613 | PHP_MINFO(akm), 614 | PHP_AKM_VERSION, 615 | STANDARD_MODULE_PROPERTIES 616 | }; 617 | /* }}} */ 618 | 619 | #ifdef COMPILE_DL_AKM 620 | #ifdef ZTS 621 | ZEND_TSRMLS_CACHE_DEFINE(); 622 | #endif 623 | ZEND_GET_MODULE(akm) 624 | #endif 625 | -------------------------------------------------------------------------------- /ahocorasick/replace.c: -------------------------------------------------------------------------------- 1 | /* 2 | * replace.c: Implements the replacement functionality 3 | * 4 | * This file is part of multifast. 5 | * 6 | Copyright 2010-2015 Kamiar Kanani 7 | 8 | multifast is free software: you can redistribute it and/or modify 9 | it under the terms of the GNU Lesser General Public License as published by 10 | the Free Software Foundation, either version 3 of the License, or 11 | (at your option) any later version. 
12 | 13 | multifast is distributed in the hope that it will be useful, 14 | but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | GNU Lesser General Public License for more details. 17 | 18 | You should have received a copy of the GNU Lesser General Public License 19 | along with multifast. If not, see . 20 | */ 21 | 22 | #include 23 | 24 | #include "node.h" 25 | #include "ahocorasick.h" 26 | 27 | 28 | /* Privates */ 29 | 30 | static void mf_repdata_do_replace 31 | (MF_REPLACEMENT_DATA_t *rd, size_t to_position); 32 | 33 | static void mf_repdata_booknominee 34 | (MF_REPLACEMENT_DATA_t *rd, struct mf_replacement_nominee *new_nom); 35 | 36 | static void mf_repdata_push_nominee 37 | (MF_REPLACEMENT_DATA_t *rd, struct mf_replacement_nominee *new_nom); 38 | 39 | static void mf_repdata_grow_noms_array 40 | (MF_REPLACEMENT_DATA_t *rd); 41 | 42 | static void mf_repdata_appendtext 43 | (MF_REPLACEMENT_DATA_t *rd, AC_TEXT_t *text); 44 | 45 | static void mf_repdata_appendfactor 46 | (MF_REPLACEMENT_DATA_t *rd, size_t from, size_t to); 47 | 48 | static void mf_repdata_savetobacklog 49 | (MF_REPLACEMENT_DATA_t *rd, size_t to_position_r); 50 | 51 | static void mf_repdata_flush 52 | (MF_REPLACEMENT_DATA_t *rd); 53 | 54 | static unsigned int mf_repdata_bookreplacements 55 | (ACT_NODE_t *node); 56 | 57 | /* Publics */ 58 | 59 | void mf_repdata_init (AC_TRIE_t *trie); 60 | void mf_repdata_reset (MF_REPLACEMENT_DATA_t *rd); 61 | void mf_repdata_release (MF_REPLACEMENT_DATA_t *rd); 62 | void mf_repdata_allocbuf (MF_REPLACEMENT_DATA_t *rd); 63 | 64 | 65 | /** 66 | * @brief Initializes the replacement data part of the trie 67 | * 68 | * @param trie 69 | *****************************************************************************/ 70 | void mf_repdata_init (AC_TRIE_t *trie) 71 | { 72 | MF_REPLACEMENT_DATA_t *rd = &trie->repdata; 73 | 74 | rd->buffer.astring = NULL; 75 | rd->buffer.length = 0; 76 | 
rd->backlog.astring = NULL; 77 | rd->backlog.length = 0; 78 | rd->has_replacement = 0; 79 | rd->curser = 0; 80 | 81 | rd->noms = NULL; 82 | rd->noms_capacity = 0; 83 | rd->noms_size = 0; 84 | 85 | rd->replace_mode = MF_REPLACE_MODE_DEFAULT; 86 | rd->trie = trie; 87 | } 88 | 89 | /** 90 | * @brief Performs finalization tasks on replacement data. 91 | * Must be called when finalizing the trie itself 92 | * 93 | * @param rd 94 | *****************************************************************************/ 95 | void mf_repdata_allocbuf (MF_REPLACEMENT_DATA_t *rd) 96 | { 97 | /* Bookmark replacement pattern for faster retrieval */ 98 | rd->has_replacement = mf_repdata_bookreplacements (rd->trie->root); 99 | 100 | if (rd->has_replacement) 101 | { 102 | rd->buffer.astring = (AC_ALPHABET_t *) 103 | malloc (MF_REPLACEMENT_BUFFER_SIZE * sizeof(AC_ALPHABET_t)); 104 | 105 | rd->backlog.astring = (AC_ALPHABET_t *) 106 | malloc (AC_PATTRN_MAX_LENGTH * sizeof(AC_ALPHABET_t)); 107 | 108 | /* Backlog length is not bigger than the max pattern length */ 109 | } 110 | } 111 | 112 | /** 113 | * @brief Bookmarks the to-be-replaced patterns for all nodes 114 | * 115 | * @param node 116 | * @return 117 | *****************************************************************************/ 118 | static unsigned int mf_repdata_bookreplacements (ACT_NODE_t *node) 119 | { 120 | size_t i; 121 | unsigned int ret; 122 | 123 | ret = node_book_replacement (node); 124 | 125 | for (i = 0; i < node->outgoing_size; i++) 126 | { 127 | /* Recursively call itself to traverse all nodes */ 128 | ret += mf_repdata_bookreplacements (node->outgoing[i].next); 129 | } 130 | 131 | return ret; 132 | } 133 | 134 | /** 135 | * @brief Resets the replacement data and prepares it for a new operation 136 | * 137 | * @param rd 138 | *****************************************************************************/ 139 | void mf_repdata_reset (MF_REPLACEMENT_DATA_t *rd) 140 | { 141 | rd->buffer.length = 0; 142 | rd->backlog.length 
= 0; 143 | rd->curser = 0; 144 | rd->noms_size = 0; 145 | } 146 | 147 | /** 148 | * @brief Release the allocated resources to the replacement data 149 | * 150 | * @param rd 151 | *****************************************************************************/ 152 | void mf_repdata_release (MF_REPLACEMENT_DATA_t *rd) 153 | { 154 | free((AC_ALPHABET_t *)rd->buffer.astring); 155 | free((AC_ALPHABET_t *)rd->backlog.astring); 156 | free(rd->noms); 157 | } 158 | 159 | /** 160 | * @brief Flushes out all the available stuff in the buffer to the user 161 | * 162 | * @param rd 163 | *****************************************************************************/ 164 | static void mf_repdata_flush (MF_REPLACEMENT_DATA_t *rd) 165 | { 166 | rd->cbf(&rd->buffer, rd->user); 167 | rd->buffer.length = 0; 168 | } 169 | 170 | /** 171 | * @brief Extends the nominees array 172 | * 173 | * @param rd 174 | *****************************************************************************/ 175 | static void mf_repdata_grow_noms_array (MF_REPLACEMENT_DATA_t *rd) 176 | { 177 | const size_t grow_factor = 128; 178 | 179 | if (rd->noms_capacity == 0) 180 | { 181 | rd->noms_capacity = grow_factor; 182 | rd->noms = (struct mf_replacement_nominee *) malloc 183 | (rd->noms_capacity * sizeof(struct mf_replacement_nominee)); 184 | rd->noms_size = 0; 185 | } 186 | else 187 | { 188 | rd->noms_capacity += grow_factor; 189 | rd->noms = (struct mf_replacement_nominee *) realloc (rd->noms, 190 | rd->noms_capacity * sizeof(struct mf_replacement_nominee)); 191 | } 192 | } 193 | 194 | /** 195 | * @brief Adds the nominee to the end of the nominee list 196 | * 197 | * @param rd 198 | * @param new_nom 199 | *****************************************************************************/ 200 | static void mf_repdata_push_nominee 201 | (MF_REPLACEMENT_DATA_t *rd, struct mf_replacement_nominee *new_nom) 202 | { 203 | struct mf_replacement_nominee *nomp; 204 | 205 | /* Extend the vector if needed */ 206 | if (rd->noms_size == 
rd->noms_capacity) 207 | mf_repdata_grow_noms_array (rd); 208 | 209 | /* Add the new nominee to the end */ 210 | nomp = &rd->noms[rd->noms_size]; 211 | nomp->pattern = new_nom->pattern; 212 | nomp->position = new_nom->position; 213 | rd->noms_size ++; 214 | } 215 | 216 | /** 217 | * @brief Tries to add the nominee to the end of the nominee list 218 | * 219 | * @param rd 220 | * @param new_nom 221 | *****************************************************************************/ 222 | static void mf_repdata_booknominee (MF_REPLACEMENT_DATA_t *rd, 223 | struct mf_replacement_nominee *new_nom) 224 | { 225 | struct mf_replacement_nominee *prev_nom; 226 | size_t prev_start_pos, prev_end_pos, new_start_pos; 227 | 228 | if (new_nom->pattern == NULL) 229 | return; /* This is not a to-be-replaced pattern; ignore it. */ 230 | 231 | new_start_pos = new_nom->position - new_nom->pattern->ptext.length; 232 | 233 | switch (rd->replace_mode) 234 | { 235 | case MF_REPLACE_MODE_LAZY: 236 | 237 | if (new_start_pos < rd->curser) 238 | return; /* Ignore the new nominee, because it overlaps with the 239 | * previous replacement */ 240 | 241 | if (rd->noms_size > 0) 242 | { 243 | prev_nom = &rd->noms[rd->noms_size - 1]; 244 | prev_end_pos = prev_nom->position; 245 | 246 | if (new_start_pos < prev_end_pos) 247 | return; 248 | } 249 | break; 250 | 251 | case MF_REPLACE_MODE_DEFAULT: 252 | case MF_REPLACE_MODE_NORMAL: 253 | default: 254 | 255 | while (rd->noms_size > 0) 256 | { 257 | prev_nom = &rd->noms[rd->noms_size - 1]; 258 | prev_start_pos = 259 | prev_nom->position - prev_nom->pattern->ptext.length; 260 | prev_end_pos = prev_nom->position; 261 | 262 | if (new_start_pos <= prev_start_pos) 263 | rd->noms_size--; /* Remove that nominee, because it is a 264 | * factor of the new nominee */ 265 | else 266 | break; /* Get out the loop and add the new nominee */ 267 | } 268 | break; 269 | } 270 | 271 | mf_repdata_push_nominee(rd, new_nom); 272 | } 273 | 274 | /** 275 | * @brief Append the 
given text to the output buffer 276 | * 277 | * @param rd 278 | * @param text 279 | *****************************************************************************/ 280 | static void mf_repdata_appendtext (MF_REPLACEMENT_DATA_t *rd, AC_TEXT_t *text) 281 | { 282 | size_t remaining_bufspace = 0; 283 | size_t remaining_text = 0; 284 | size_t copy_len = 0; 285 | size_t copy_index = 0; 286 | 287 | while (copy_index < text->length) 288 | { 289 | remaining_bufspace = MF_REPLACEMENT_BUFFER_SIZE - rd->buffer.length; 290 | remaining_text = text->length - copy_index; 291 | 292 | copy_len = (remaining_bufspace >= remaining_text)? 293 | remaining_text : remaining_bufspace; 294 | 295 | memcpy((void *)&rd->buffer.astring[rd->buffer.length], 296 | (void *)&text->astring[copy_index], 297 | copy_len * sizeof(AC_ALPHABET_t)); 298 | 299 | rd->buffer.length += copy_len; 300 | copy_index += copy_len; 301 | 302 | if (rd->buffer.length == MF_REPLACEMENT_BUFFER_SIZE) 303 | mf_repdata_flush(rd); 304 | } 305 | } 306 | 307 | /** 308 | * @brief Append a factor of the current text to the output buffer 309 | * 310 | * @param rd 311 | * @param from 312 | * @param to 313 | *****************************************************************************/ 314 | static void mf_repdata_appendfactor 315 | (MF_REPLACEMENT_DATA_t *rd, size_t from, size_t to) 316 | { 317 | AC_TEXT_t *instr = rd->trie->text; 318 | AC_TEXT_t factor; 319 | size_t backlog_base_pos; 320 | size_t base_position = rd->trie->base_position; 321 | 322 | if (to < from) 323 | return; 324 | 325 | if (base_position <= from) 326 | { 327 | /* The backlog located in the input text part */ 328 | factor.astring = &instr->astring[from - base_position]; 329 | factor.length = to - from; 330 | mf_repdata_appendtext(rd, &factor); 331 | } 332 | else 333 | { 334 | backlog_base_pos = base_position - rd->backlog.length; 335 | if (from < backlog_base_pos) 336 | return; /* shouldn't come here */ 337 | 338 | if (to < base_position) 339 | { 340 | /* The 
backlog located in the backlog part */ 341 | factor.astring = &rd->backlog.astring[from - backlog_base_pos]; 342 | factor.length = to - from; 343 | mf_repdata_appendtext (rd, &factor); 344 | } 345 | else 346 | { 347 | /* The factor is divided between backlog and input text */ 348 | 349 | /* The backlog part */ 350 | factor.astring = &rd->backlog.astring[from - backlog_base_pos]; 351 | factor.length = rd->backlog.length - from + backlog_base_pos; 352 | mf_repdata_appendtext (rd, &factor); 353 | 354 | /* The input text part */ 355 | factor.astring = instr->astring; 356 | factor.length = to - base_position; 357 | mf_repdata_appendtext (rd, &factor); 358 | } 359 | } 360 | } 361 | 362 | /** 363 | * @brief Saves the backlog part of the current text to the backlog buffer. The 364 | * backlog part is the part after @p bg_pos 365 | * 366 | * @param rd 367 | * @param bg_pos backlog position 368 | *****************************************************************************/ 369 | static void mf_repdata_savetobacklog (MF_REPLACEMENT_DATA_t *rd, size_t bg_pos) 370 | { 371 | size_t bg_pos_r; /* relative backlog position */ 372 | AC_TEXT_t *instr = rd->trie->text; 373 | size_t base_position = rd->trie->base_position; 374 | 375 | if (base_position < bg_pos) 376 | bg_pos_r = bg_pos - base_position; 377 | else 378 | bg_pos_r = 0; /* the whole input text must go to backlog */ 379 | 380 | if (instr->length == bg_pos_r) 381 | return; /* Nothing left for the backlog */ 382 | 383 | if (instr->length < bg_pos_r) 384 | return; /* unexpected : assert (instr->length >= bg_pos_r) */ 385 | 386 | /* Copy the part after bg_pos_r to the backlog buffer */ 387 | memcpy( (AC_ALPHABET_t *) 388 | &rd->backlog.astring[rd->backlog.length], 389 | &instr->astring[bg_pos_r], 390 | instr->length - bg_pos_r ); 391 | 392 | rd->backlog.length += instr->length - bg_pos_r; 393 | } 394 | 395 | /** 396 | * @brief Perform replacement operations on the non-backlog part of the current 397 | * text. 
In-range nominees will be replaced the original pattern and the result 398 | * will be pushed to the output buffer. 399 | * 400 | * @param rd 401 | * @param to_position 402 | *****************************************************************************/ 403 | static void mf_repdata_do_replace 404 | (MF_REPLACEMENT_DATA_t *rd, size_t to_position) 405 | { 406 | unsigned int index; 407 | struct mf_replacement_nominee *nom; 408 | size_t base_position = rd->trie->base_position; 409 | 410 | if (to_position < base_position) 411 | return; 412 | 413 | /* Replace the candidate patterns */ 414 | if (rd->noms_size > 0) 415 | { 416 | for (index = 0; index < rd->noms_size; index++) 417 | { 418 | nom = &rd->noms[index]; 419 | 420 | if (to_position <= (nom->position - nom->pattern->ptext.length)) 421 | break; 422 | 423 | /* Append the space before pattern */ 424 | mf_repdata_appendfactor (rd, rd->curser, /* from */ 425 | nom->position - nom->pattern->ptext.length /* to */); 426 | 427 | /* Append the replacement instead of the pattern */ 428 | mf_repdata_appendtext(rd, &nom->pattern->rtext); 429 | 430 | rd->curser = nom->position; 431 | } 432 | rd->noms_size -= index; 433 | 434 | /* Shift the array to the left to eliminate the consumed nominees */ 435 | if (rd->noms_size && index) 436 | { 437 | memcpy (&rd->noms[0], &rd->noms[index], 438 | rd->noms_size * sizeof(struct mf_replacement_nominee)); 439 | /* TODO: implement a circular queue */ 440 | } 441 | } 442 | 443 | /* Append the chunk between the last pattern and to_position */ 444 | if (to_position > rd->curser) 445 | { 446 | mf_repdata_appendfactor (rd, rd->curser, to_position); 447 | 448 | rd->curser = to_position; 449 | } 450 | 451 | if (base_position <= rd->curser) 452 | { 453 | /* we consume the whole backlog or none of it */ 454 | rd->backlog.length = 0; 455 | } 456 | } 457 | 458 | /** 459 | * @brief Replaces the patterns in the given text with their correspondence 460 | * replacement in the A.C. 
Trie 461 | * 462 | * @param thiz 463 | * @param instr 464 | * @param mode 465 | * @param callback 466 | * @param param 467 | * @return 468 | *****************************************************************************/ 469 | int multifast_replace (AC_TRIE_t *thiz, AC_TEXT_t *instr, 470 | MF_REPLACE_MODE_t mode, MF_REPLACE_CALBACK_f callback, void *param) 471 | { 472 | ACT_NODE_t *current; 473 | ACT_NODE_t *next; 474 | struct mf_replacement_nominee nom; 475 | MF_REPLACEMENT_DATA_t *rd = &thiz->repdata; 476 | 477 | size_t position_r = 0; /* Relative current position in the input string */ 478 | size_t backlog_pos = 0; /* Relative backlog position in the input string */ 479 | 480 | if (thiz->trie_open) 481 | return -1; /* _finalize() must be called first */ 482 | 483 | if (!rd->has_replacement) 484 | return -2; /* Trie doesn't have any to-be-replaced pattern */ 485 | 486 | rd->cbf = callback; 487 | rd->user = param; 488 | rd->replace_mode = mode; 489 | 490 | thiz->text = instr; /* Save the input string in a helper variable 491 | * for convenience */ 492 | 493 | current = thiz->last_node; 494 | 495 | /* Main replace loop: 496 | * Find patterns and bookmark them 497 | */ 498 | while (position_r < instr->length) 499 | { 500 | if (!(next = node_find_next_bs(current, instr->astring[position_r]))) 501 | { 502 | /* Failed to follow a pattern */ 503 | if(current->failure_node) 504 | current = current->failure_node; 505 | else 506 | position_r++; 507 | } 508 | else 509 | { 510 | current = next; 511 | position_r++; 512 | } 513 | 514 | if (current->final && next) 515 | { 516 | /* Bookmark nominee patterns for replacement */ 517 | nom.pattern = current->to_be_replaced; 518 | nom.position = thiz->base_position + position_r; 519 | 520 | mf_repdata_booknominee (rd, &nom); 521 | } 522 | } 523 | 524 | /* 525 | * At the end of input chunk, if the tail of the chunk is a prefix of a 526 | * pattern, then we must keep it in the backlog buffer and wait for the 527 | * next chunk to decide 
about it. */ 528 | 529 | backlog_pos = thiz->base_position + instr->length - current->depth; 530 | 531 | /* Now replace the patterns up to the backlog_pos point */ 532 | mf_repdata_do_replace (rd, backlog_pos); 533 | 534 | /* Save the remaining to the backlog buffer */ 535 | mf_repdata_savetobacklog (rd, backlog_pos); 536 | 537 | /* Save status variables */ 538 | thiz->last_node = current; 539 | thiz->base_position += position_r; 540 | 541 | return 0; 542 | } 543 | 544 | /** 545 | * @brief Flushes the remaining data back to the user and ends the replacement 546 | * operation. 547 | * 548 | * @param thiz 549 | * @param keep Indicates the continuity of the chunks. 0 means that the last 550 | * chunk has been fed in, and we want to end the replacement and receive the 551 | * final result. 552 | *****************************************************************************/ 553 | void multifast_rep_flush (AC_TRIE_t *thiz, int keep) 554 | { 555 | if (!keep) 556 | { 557 | mf_repdata_do_replace (&thiz->repdata, thiz->base_position); 558 | } 559 | 560 | mf_repdata_flush (&thiz->repdata); 561 | 562 | if (!keep) 563 | { 564 | mf_repdata_reset (&thiz->repdata); 565 | thiz->last_node = thiz->root; 566 | thiz->base_position = 0; 567 | } 568 | } 569 | --------------------------------------------------------------------------------