├── README.md ├── config.m4 ├── php7_wrapper.h ├── php_trie_filter.h ├── trie_filter.c ├── triefilter.php └── triefiltertest.php /README.md: -------------------------------------------------------------------------------- 1 | php-ext-trie-filter 2 | =================== 3 | 4 | A PHP extension for spam-word filtering based on a Double-Array Trie; it detects whether a spam word occurs in a text message. 5 | 6 | 关键词过滤扩展,用于检查一段文本中是否出现敏感词,基于Double-Array Trie 树实现。 7 | 8 | ## 升级历史 9 | 10 | ### 2017-08-08 11 | 1. 同时支持php5&php7 12 | 1. 新增方法: 13 | 1. trie_filter_read,从string中读取二进制字典数据 14 | 1. trie_filter_write,将当前对象导出成二进制string 15 | 1. trie_filter_delete,从当前对象中删除一个word 16 | 17 | ### 2013-06-23 18 | 1. trie_filter_search_all,一次返回所有的命中词 19 | 1. 修复内存泄露 20 | 21 | ## 依赖库 22 | 23 | [libdatrie-0.2.4 or later](http://linux.thai.net/~thep/datrie/datrie.html) 24 | 25 | ## 安装步骤 26 | 27 | 下面的$LIB_PATH为依赖库安装目录,$INSTALL_PHP_PATH为PHP安装目录。 28 | 29 | ### 安装libdatrie 30 | ``` 31 | $ tar zxvf libdatrie-0.2.4.tar.gz 32 | $ cd libdatrie-0.2.4 33 | $ make clean 34 | $ ./configure --prefix=$LIB_PATH 35 | $ make 36 | $ make install 37 | ``` 38 | ### 安装扩展 39 | ``` 40 | $ $INSTALL_PHP_PATH/bin/phpize 41 | $ ./configure --with-php-config=$INSTALL_PHP_PATH/bin/php-config --with-trie_filter=$LIB_PATH 42 | $ make 43 | $ make install 44 | ``` 45 | 然后修改php.ini,增加一行:extension=trie_filter.so,然后重启PHP。 46 | 47 | ## 使用示例 48 | ``` 49 | <?php 50 | $arrWord = array('word1', 'word2', 'word3'); 51 | $resTrie = trie_filter_new(); // create an empty trie tree 52 | foreach ($arrWord as $k => $v) { 53 | trie_filter_store($resTrie, $v); 54 | } 55 | trie_filter_save($resTrie, __DIR__ . '/blackword.tree'); 56 | 57 | $resTrie = trie_filter_load(__DIR__ . 
 '/blackword.tree'); 58 | 59 | $strContent = 'hello word2 word1'; 60 | $arrRet = trie_filter_search($resTrie, $strContent); 61 | print_r($arrRet); //Array(0 => 6, 1 => 5) 62 | echo substr($strContent, $arrRet[0], $arrRet[1]); //word2 63 | $arrRet = trie_filter_search_all($resTrie, $strContent); 64 | print_r($arrRet); //Array(0 => Array(0 => 6, 1 => 5), 1 => Array(0 => 12, 1 => 5)) 65 | 66 | $arrRet = trie_filter_search($resTrie, 'hello word'); 67 | print_r($arrRet); //Array() 68 | 69 | trie_filter_free($resTrie); 70 | ``` 71 | ## PHP版本 72 | 73 | PHP 5.2 ~ 7.1. 74 | 75 | Windows is not supported yet. 76 | 77 | ## License 78 | 79 | Apache License 2.0 80 | 81 | ## 致谢 82 | 83 | 本项目是在[用于检测敏感词的 PHP 扩展](http://blog.anbutu.com/php/php-ext-trie-filter)的基础上改写的。 84 | 85 | -------------------------------------------------------------------------------- /config.m4: -------------------------------------------------------------------------------- 1 | dnl $Id$ 2 | dnl config.m4 for extension trie_filter 3 | 4 | dnl Comments in this file start with the string 'dnl'. 5 | dnl Remove where necessary. This file will not work 6 | dnl without editing. 
7 | 8 | dnl If your extension references something external, use with: 9 | 10 | PHP_ARG_WITH(trie_filter, for trie_filter support, 11 | [ --with-trie_filter Include trie_filter support]) 12 | 13 | if test "$PHP_TRIE_FILTER" != "no"; then 14 | SEARCH_PATH="/usr/local /usr" 15 | SEARCH_FOR="/include/datrie/trie.h" 16 | if test -r $PHP_TRIE_FILTER/$SEARCH_FOR; then 17 | TRIE_FILTER_DIR=$PHP_TRIE_FILTER 18 | else 19 | AC_MSG_CHECKING([for trie_filter files in default path]) 20 | for i in $SEARCH_PATH ; do 21 | if test -r $i/$SEARCH_FOR; then 22 | TRIE_FILTER_DIR=$i 23 | AC_MSG_RESULT(found in $i) 24 | fi 25 | done 26 | fi 27 | 28 | if test -z "$TRIE_FILTER_DIR"; then 29 | AC_MSG_RESULT([not found]) 30 | AC_MSG_ERROR([Please install the libdatrie]) 31 | fi 32 | 33 | PHP_ADD_INCLUDE($TRIE_FILTER_DIR/include) 34 | 35 | LIBNAME=datrie 36 | LIBSYMBOL=trie_new_from_file 37 | 38 | PHP_CHECK_LIBRARY($LIBNAME,$LIBSYMBOL, 39 | [ 40 | PHP_ADD_LIBRARY_WITH_PATH($LIBNAME, $TRIE_FILTER_DIR/lib, TRIE_FILTER_SHARED_LIBADD) 41 | AC_DEFINE(HAVE_TRIE_FILTERLIB,1,[libdatrie found and included]) 42 | ],[ 43 | AC_MSG_ERROR([wrong libdatrie version or lib not found]) 44 | ],[ 45 | -L$TRIE_FILTER_DIR/lib -ldatrie 46 | ]) 47 | 48 | PHP_SUBST(TRIE_FILTER_SHARED_LIBADD) 49 | PHP_NEW_EXTENSION(trie_filter, trie_filter.c, $ext_shared) 50 | fi 51 | -------------------------------------------------------------------------------- /php7_wrapper.h: -------------------------------------------------------------------------------- 1 | #ifndef EXT_PHP_TRIE_FILTER_PHP7_WRAPPER_H_ 2 | #define EXT_PHP_TRIE_FILTER_PHP7_WRAPPER_H_ 3 | 4 | #include "ext/standard/php_http.h" 5 | 6 | #if PHP_MAJOR_VERSION < 7 7 | typedef int zend_size_t; 8 | 9 | #define TRIE_ZEND_REGISTER_RESOURCE ZEND_REGISTER_RESOURCE 10 | #define TRIE_ZEND_FETCH_RESOURCE ZEND_FETCH_RESOURCE 11 | #define TRIE_MAKE_STD_ZVAL(p) MAKE_STD_ZVAL(p) 12 | #define TRIE_RESOURCE_FREE(resource) zend_list_delete(Z_RESVAL_P(resource)) 13 | #else /* PHP 
Version 7 */ 14 | typedef size_t zend_size_t; 15 | typedef zend_resource zend_rsrc_list_entry; 16 | 17 | #define TRIE_ZEND_REGISTER_RESOURCE(return_value, result, le_result) ZVAL_RES(return_value,zend_register_resource(result, le_result)) 18 | #define TRIE_ZEND_FETCH_RESOURCE(rsrc, rsrc_type, passed_id, default_id, resource_type_name, resource_type) \ 19 | (rsrc = (rsrc_type) zend_fetch_resource(Z_RES_P(*passed_id), resource_type_name, resource_type)) 20 | #define TRIE_MAKE_STD_ZVAL(p) zval _stack_zval_##p; p = &(_stack_zval_##p) 21 | #define TRIE_RESOURCE_FREE(resource) zend_list_close(Z_RES_P(resource)) 22 | #endif /* PHP Version */ 23 | 24 | #endif /* EXT_PHP_TRIE_FILTER_PHP7_WRAPPER_H_ */ -------------------------------------------------------------------------------- /php_trie_filter.h: -------------------------------------------------------------------------------- 1 | /* 2 | +----------------------------------------------------------------------+ 3 | | PHP Version 5 | 4 | +----------------------------------------------------------------------+ 5 | | Copyright (c) 1997-2010 The PHP Group | 6 | +----------------------------------------------------------------------+ 7 | | This source file is subject to version 3.01 of the PHP license, | 8 | | that is bundled with this package in the file LICENSE, and is | 9 | | available through the world-wide-web at the following url: | 10 | | http://www.php.net/license/3_01.txt | 11 | | If you did not receive a copy of the PHP license and are unable to | 12 | | obtain it through the world-wide-web, please send a note to | 13 | | license@php.net so we can mail you a copy immediately. 
| 14 | +----------------------------------------------------------------------+ 15 | | Author: Lijun Wu | 16 | +----------------------------------------------------------------------+ 17 | */ 18 | 19 | /* $Id$ */ 20 | 21 | #include 22 | #include 23 | 24 | #ifndef PHP_TRIE_FILTER_H 25 | #define PHP_TRIE_FILTER_H 26 | 27 | extern zend_module_entry trie_filter_module_entry; 28 | #define phpext_trie_filter_ptr &trie_filter_module_entry 29 | 30 | #ifdef PHP_WIN32 31 | #define PHP_TRIE_FILTER_API __declspec(dllexport) 32 | #else 33 | #define PHP_TRIE_FILTER_API 34 | #endif 35 | 36 | #ifdef ZTS 37 | #include "TSRM.h" 38 | #endif 39 | 40 | #define ALPHA_CHARSET "UCS-4LE" 41 | #define PHP_TRIE_FILTER_RES_NAME "Trie tree filter" 42 | 43 | PHP_MINIT_FUNCTION(trie_filter); 44 | PHP_MSHUTDOWN_FUNCTION(trie_filter); 45 | PHP_RINIT_FUNCTION(trie_filter); 46 | PHP_RSHUTDOWN_FUNCTION(trie_filter); 47 | PHP_MINFO_FUNCTION(trie_filter); 48 | 49 | PHP_FUNCTION(trie_filter_load); 50 | PHP_FUNCTION(trie_filter_read); 51 | PHP_FUNCTION(trie_filter_search); 52 | PHP_FUNCTION(trie_filter_search_all); 53 | PHP_FUNCTION(trie_filter_new); 54 | PHP_FUNCTION(trie_filter_store); 55 | PHP_FUNCTION(trie_filter_delete); 56 | PHP_FUNCTION(trie_filter_save); 57 | PHP_FUNCTION(trie_filter_write); 58 | PHP_FUNCTION(trie_filter_free); 59 | 60 | #ifdef ZTS 61 | #define TRIE_FILTER_G(v) TSRMG(trie_filter_globals_id, zend_trie_filter_globals *, v) 62 | #else 63 | #define TRIE_FILTER_G(v) (trie_filter_globals.v) 64 | #endif 65 | 66 | #endif /* PHP_TRIE_FILTER_H */ 67 | 68 | 69 | /* 70 | * Local variables: 71 | * tab-width: 4 72 | * c-basic-offset: 4 73 | * End: 74 | * vim600: noet sw=4 ts=4 fdm=marker 75 | * vim<600: noet sw=4 ts=4 76 | */ 77 | -------------------------------------------------------------------------------- /trie_filter.c: -------------------------------------------------------------------------------- 1 | /* 2 | +----------------------------------------------------------------------+ 3 | | 
PHP Version 5 | 4 | +----------------------------------------------------------------------+ 5 | | Copyright (c) 1997-2010 The PHP Group | 6 | +----------------------------------------------------------------------+ 7 | | This source file is subject to version 3.01 of the PHP license, | 8 | | that is bundled with this package in the file LICENSE, and is | 9 | | available through the world-wide-web at the following url: | 10 | | http://www.php.net/license/3_01.txt | 11 | | If you did not receive a copy of the PHP license and are unable to | 12 | | obtain it through the world-wide-web, please send a note to | 13 | | license@php.net so we can mail you a copy immediately. | 14 | +----------------------------------------------------------------------+ 15 | | Author: Lijun Wu | 16 | +----------------------------------------------------------------------+ 17 | */ 18 | 19 | /* $Id$ */ 20 | 21 | #ifdef HAVE_CONFIG_H 22 | #include "config.h" 23 | #endif 24 | 25 | #include "php.h" 26 | #include "php_ini.h" 27 | #include "ext/standard/info.h" 28 | #include "php_trie_filter.h" 29 | 30 | #include "php7_wrapper.h" 31 | 32 | /* True global resources - no need for thread safety here */ 33 | static int le_trie_filter; 34 | 35 | /* {{{ trie_filter_functions[] 36 | * 37 | * Every user visible function must have an entry in trie_filter_functions[]. 
38 | */ 39 | zend_function_entry trie_filter_functions[] = { 40 | PHP_FE(trie_filter_load, NULL) 41 | PHP_FE(trie_filter_read, NULL) 42 | PHP_FE(trie_filter_search, NULL) 43 | PHP_FE(trie_filter_search_all, NULL) 44 | PHP_FE(trie_filter_new, NULL) 45 | PHP_FE(trie_filter_store, NULL) 46 | PHP_FE(trie_filter_delete, NULL) 47 | PHP_FE(trie_filter_save, NULL) 48 | PHP_FE(trie_filter_write, NULL) 49 | PHP_FE(trie_filter_free, NULL) 50 | {NULL, NULL, NULL} /* Must be the last line in trie_filter_functions[] */ 51 | }; 52 | /* }}} */ 53 | 54 | /* {{{ trie_filter_module_entry 55 | */ 56 | zend_module_entry trie_filter_module_entry = { 57 | #if ZEND_MODULE_API_NO >= 20010901 58 | STANDARD_MODULE_HEADER, 59 | #endif 60 | "trie_filter", 61 | trie_filter_functions, 62 | PHP_MINIT(trie_filter), 63 | PHP_MSHUTDOWN(trie_filter), 64 | NULL, 65 | NULL, 66 | PHP_MINFO(trie_filter), 67 | #if ZEND_MODULE_API_NO >= 20010901 68 | "0.1", /* Replace with version number for your extension */ 69 | #endif 70 | STANDARD_MODULE_PROPERTIES 71 | }; 72 | /* }}} */ 73 | 74 | #ifdef COMPILE_DL_TRIE_FILTER 75 | ZEND_GET_MODULE(trie_filter) 76 | #endif 77 | 78 | /* {{{ PHP_INI 79 | */ 80 | /* 81 | PHP_INI_BEGIN() 82 | PHP_INI_ENTRY("trie_filter.dict_charset", "utf-8", PHP_INI_ALL, NULL) 83 | PHP_INI_END() 84 | */ 85 | /* }}} */ 86 | #if PHP_MAJOR_VERSION < 7 87 | static void php_trie_filter_dtor(zend_rsrc_list_entry *rsrc TSRMLS_DC) 88 | #else 89 | static void php_trie_filter_dtor(zend_resource *rsrc TSRMLS_DC) 90 | #endif 91 | { 92 | Trie *trie = (Trie *)rsrc->ptr; 93 | trie_free(trie); 94 | } 95 | 96 | /* {{{ PHP_MINIT_FUNCTION 97 | */ 98 | PHP_MINIT_FUNCTION(trie_filter) 99 | { 100 | le_trie_filter = zend_register_list_destructors_ex( 101 | php_trie_filter_dtor, 102 | NULL, PHP_TRIE_FILTER_RES_NAME, module_number); 103 | return SUCCESS; 104 | } 105 | /* }}} */ 106 | 107 | /* {{{ PHP_MSHUTDOWN_FUNCTION 108 | */ 109 | PHP_MSHUTDOWN_FUNCTION(trie_filter) 110 | { 111 | return SUCCESS; 112 | } 113 | 
/* }}} */ 114 | 115 | /* {{{ PHP_MINFO_FUNCTION 116 | */ 117 | PHP_MINFO_FUNCTION(trie_filter) 118 | { 119 | php_info_print_table_start(); 120 | php_info_print_table_header(2, "trie_filter support", "enabled"); 121 | php_info_print_table_end(); 122 | } 123 | /* }}} */ 124 | 125 | /* {{{ proto resource trie_filter_load(string dict_file_path) 126 | Returns resource id, or NULL on error*/ 127 | PHP_FUNCTION(trie_filter_load) 128 | { 129 | Trie *trie; 130 | char *path; 131 | zend_size_t path_len; 132 | 133 | if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s", &path, &path_len) == FAILURE) { 134 | RETURN_NULL(); 135 | } 136 | 137 | trie = trie_new_from_file(path); 138 | if (!trie) { 139 | php_error_docref(NULL TSRMLS_CC, E_WARNING, "Unable to load %s", path); 140 | RETURN_NULL(); 141 | } 142 | TRIE_ZEND_REGISTER_RESOURCE(return_value, trie, le_trie_filter); 143 | } 144 | /* }}} */ 145 | 146 | /* {{{ proto resource trie_filter_read(string dict_bin) 147 | Returns resource id, or NULL on error*/ 148 | PHP_FUNCTION(trie_filter_read) 149 | { 150 | Trie *trie; 151 | char *path; 152 | zend_size_t path_len; 153 | FILE *fp; 154 | 155 | if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s", &path, &path_len) == FAILURE) { 156 | RETURN_NULL(); 157 | } 158 | 159 | fp = fmemopen(path, path_len, "rb"); 160 | trie = trie_fread(fp); 161 | fclose(fp); 162 | 163 | if (!trie) { 164 | php_error_docref(NULL TSRMLS_CC, E_WARNING, "Unable to read"); 165 | RETURN_NULL(); 166 | } 167 | TRIE_ZEND_REGISTER_RESOURCE(return_value, trie, le_trie_filter); 168 | } 169 | /* }}} */ 170 | 171 | static int trie_search_one(Trie *trie, const AlphaChar *text, int *offset, TrieData *length) 172 | { 173 | TrieState *s; 174 | const AlphaChar *p; 175 | const AlphaChar *base; 176 | 177 | base = text; 178 | if (! (s = trie_root(trie))) { 179 | return -1; 180 | } 181 | 182 | while (*text) { 183 | p = text; 184 | if (! 
trie_state_is_walkable(s, *p)) { 185 | trie_state_rewind(s); 186 | text++; 187 | continue; 188 | } else { 189 | trie_state_walk(s, *p++); 190 | } 191 | 192 | while (trie_state_is_walkable(s, *p) && ! trie_state_is_terminal(s)) 193 | trie_state_walk(s, *p++); 194 | 195 | if (trie_state_is_terminal(s)) { 196 | *offset = text - base; 197 | *length = p - text; 198 | trie_state_free(s); 199 | 200 | return 1; 201 | } 202 | 203 | trie_state_rewind(s); 204 | text++; 205 | } 206 | trie_state_free(s); 207 | 208 | return 0; 209 | } 210 | 211 | static int trie_search_all(Trie *trie, const AlphaChar *text, zval *data) 212 | { 213 | TrieState *s; 214 | const AlphaChar *p; 215 | const AlphaChar *base; 216 | #if PHP_MAJOR_VERSION < 7 217 | zval *word = NULL; 218 | #else 219 | zval word; 220 | #endif 221 | 222 | base = text; 223 | if (! (s = trie_root(trie))) { 224 | return -1; 225 | } 226 | 227 | while (*text) { 228 | p = text; 229 | if(! trie_state_is_walkable(s, *p)) { 230 | trie_state_rewind(s); 231 | text++; 232 | continue; 233 | } 234 | 235 | while(*p && trie_state_is_walkable(s, *p) && ! 
trie_state_is_leaf(s)) { 236 | trie_state_walk(s, *p++); 237 | if (trie_state_is_terminal(s)) { 238 | #if PHP_MAJOR_VERSION < 7 239 | MAKE_STD_ZVAL(word); 240 | array_init_size(word, 3); 241 | add_next_index_long(word, text - base); 242 | add_next_index_long(word, p - text); 243 | add_next_index_zval(data, word); 244 | #else 245 | array_init_size(&word, 3); 246 | add_next_index_long(&word, text - base); 247 | add_next_index_long(&word, p - text); 248 | add_next_index_zval(data, &word); 249 | #endif 250 | } 251 | } 252 | trie_state_rewind(s); 253 | text++; 254 | } 255 | trie_state_free(s); 256 | 257 | return 0; 258 | } 259 | 260 | /* {{{ proto array trie_filter_search(int trie_tree_identifier, string centent) 261 | Returns info about first keyword, or false on error*/ 262 | PHP_FUNCTION(trie_filter_search) 263 | { 264 | Trie *trie; 265 | zval *trie_resource; 266 | unsigned char *text; 267 | #if PHP_MAJOR_VERSION < 7 268 | int text_len; 269 | #else 270 | size_t text_len; 271 | #endif 272 | 273 | int offset = -1, i, ret; 274 | TrieData length = 0; 275 | 276 | AlphaChar *alpha_text; 277 | 278 | if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "rs", &trie_resource, &text, &text_len) == FAILURE) { 279 | RETURN_FALSE; 280 | } 281 | 282 | array_init(return_value); 283 | if (text_len < 1 || strlen(text) != text_len) { 284 | php_error_docref(NULL TSRMLS_CC, E_NOTICE, "input is empty"); 285 | return; 286 | } 287 | 288 | #if PHP_MAJOR_VERSION < 7 289 | ZEND_FETCH_RESOURCE(trie, Trie *, &trie_resource, -1, 290 | PHP_TRIE_FILTER_RES_NAME, le_trie_filter); 291 | #else 292 | trie = zend_fetch_resource(Z_RES_P(trie_resource), PHP_TRIE_FILTER_RES_NAME, le_trie_filter); 293 | #endif 294 | if (trie == NULL) { 295 | RETURN_FALSE; 296 | } 297 | 298 | alpha_text = emalloc(sizeof(AlphaChar) * (text_len + 1)); 299 | 300 | for (i = 0; i < text_len; i++) { 301 | alpha_text[i] = (AlphaChar) text[i]; 302 | } 303 | 304 | alpha_text[text_len] = TRIE_CHAR_TERM; 305 | 306 | ret = 
trie_search_one(trie, alpha_text, &offset, &length); 307 | efree(alpha_text); 308 | if (ret == 0) { 309 | return; 310 | } else if (ret == 1) { 311 | add_next_index_long(return_value, offset); 312 | add_next_index_long(return_value, length); 313 | } else { 314 | RETURN_FALSE; 315 | } 316 | } 317 | /* }}} */ 318 | 319 | /* {{{ proto array trie_filter_search_all(int trie_tree_identifier, string centent) 320 | Returns info about all keywords, or false on error*/ 321 | PHP_FUNCTION(trie_filter_search_all) 322 | { 323 | Trie *trie; 324 | zval *trie_resource; 325 | unsigned char *text; 326 | #if PHP_MAJOR_VERSION < 7 327 | int text_len; 328 | #else 329 | size_t text_len; 330 | #endif 331 | 332 | int i, ret; 333 | 334 | AlphaChar *alpha_text; 335 | 336 | if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "rs", 337 | &trie_resource, &text, &text_len) == FAILURE) { 338 | RETURN_FALSE; 339 | } 340 | 341 | array_init(return_value); 342 | if (text_len < 1 || strlen(text) != text_len) { 343 | php_error_docref(NULL TSRMLS_CC, E_NOTICE, "input is empty"); 344 | return; 345 | } 346 | 347 | #if PHP_MAJOR_VERSION < 7 348 | ZEND_FETCH_RESOURCE(trie, Trie *, &trie_resource, -1, 349 | PHP_TRIE_FILTER_RES_NAME, le_trie_filter); 350 | #else 351 | trie = (Trie*) zend_fetch_resource(Z_RES_P(trie_resource), PHP_TRIE_FILTER_RES_NAME, le_trie_filter); 352 | #endif 353 | if (trie == NULL) { 354 | RETURN_FALSE; 355 | } 356 | 357 | alpha_text = emalloc(sizeof(AlphaChar) * (text_len + 1)); 358 | 359 | for (i = 0; i < text_len; i++) { 360 | alpha_text[i] = (AlphaChar) text[i]; 361 | } 362 | 363 | alpha_text[text_len] = TRIE_CHAR_TERM; 364 | 365 | ret = trie_search_all(trie, alpha_text, return_value); 366 | efree(alpha_text); 367 | if (ret == 0) { 368 | return; 369 | } else { 370 | RETURN_FALSE; 371 | } 372 | } 373 | /* }}} */ 374 | 375 | /* {{{ proto resource trie_filter_new() 376 | Returns resource id, or NULL on error*/ 377 | PHP_FUNCTION(trie_filter_new) 378 | { 379 | Trie *trie; 380 | 
AlphaMap *alpha_map; 381 | int ret; 382 | 383 | alpha_map = alpha_map_new(); 384 | if (! alpha_map) { 385 | RETURN_NULL(); 386 | } 387 | 388 | if (alpha_map_add_range(alpha_map, 0x00, 0xff) != 0) { 389 | /* treat all strings as byte stream */ 390 | alpha_map_free(alpha_map); 391 | RETURN_NULL(); 392 | } 393 | 394 | trie = trie_new(alpha_map); 395 | alpha_map_free(alpha_map); 396 | if (! trie) { 397 | RETURN_NULL(); 398 | } 399 | 400 | #if PHP_MAJOR_VERSION < 7 401 | ZEND_REGISTER_RESOURCE(return_value, trie, le_trie_filter); 402 | #else 403 | RETURN_RES(zend_register_resource(trie, le_trie_filter)); 404 | #endif 405 | } 406 | /* }}} */ 407 | 408 | #define KEYWORD_MAX_LEN 1024 409 | /* {{{ proto bool trie_filter_store(int trie_tree_identifier, string keyword) 410 | Returns true, or false on error*/ 411 | PHP_FUNCTION(trie_filter_store) 412 | { 413 | Trie *trie; 414 | zval *trie_resource; 415 | unsigned char *keyword, *p; 416 | #if PHP_MAJOR_VERSION < 7 417 | int keyword_len, i; 418 | #else 419 | size_t keyword_len; 420 | int i; 421 | #endif 422 | AlphaChar alpha_key[KEYWORD_MAX_LEN+1]; 423 | 424 | if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "rs", 425 | &trie_resource, &keyword, &keyword_len) == FAILURE) { 426 | RETURN_FALSE; 427 | } 428 | 429 | if (keyword_len > KEYWORD_MAX_LEN || keyword_len < 1) { 430 | php_error_docref(NULL TSRMLS_CC, E_WARNING, "keyword should has [1, %d] bytes", KEYWORD_MAX_LEN); 431 | RETURN_FALSE; 432 | } 433 | 434 | #if PHP_MAJOR_VERSION < 7 435 | ZEND_FETCH_RESOURCE(trie, Trie *, &trie_resource, -1, PHP_TRIE_FILTER_RES_NAME, le_trie_filter); 436 | #else 437 | trie = zend_fetch_resource(Z_RES_P(trie_resource), PHP_TRIE_FILTER_RES_NAME, le_trie_filter); 438 | #endif 439 | 440 | if (trie == NULL) { 441 | RETURN_FALSE; 442 | } 443 | 444 | p = keyword; 445 | i = 0; 446 | while (*p && *p != '\n' && *p != '\r') { 447 | alpha_key[i++] = (AlphaChar)*p; 448 | p++; 449 | } 450 | alpha_key[i] = TRIE_CHAR_TERM; 451 | 452 | if (! 
trie_store(trie, alpha_key, -1)) { 453 | RETURN_FALSE; 454 | } 455 | RETURN_TRUE; 456 | } 457 | /* }}} */ 458 | 459 | /* {{{ proto bool trie_filter_delete(int trie_tree_identifier, string keyword) 460 | Returns true, or false on error*/ 461 | PHP_FUNCTION(trie_filter_delete) 462 | { 463 | Trie *trie; 464 | zval *trie_resource; 465 | unsigned char *keyword, *p; 466 | zend_size_t keyword_len; 467 | int i; 468 | AlphaChar alpha_key[KEYWORD_MAX_LEN+1]; 469 | 470 | if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "rs", 471 | &trie_resource, &keyword, &keyword_len) == FAILURE) { 472 | RETURN_FALSE; 473 | } 474 | if (keyword_len > KEYWORD_MAX_LEN || keyword_len < 1) { 475 | php_error_docref(NULL TSRMLS_CC, E_WARNING, "keyword should has [1, %d] bytes", KEYWORD_MAX_LEN); 476 | RETURN_FALSE; 477 | } 478 | 479 | TRIE_ZEND_FETCH_RESOURCE(trie, Trie *, &trie_resource, -1, PHP_TRIE_FILTER_RES_NAME, le_trie_filter); 480 | if (trie == NULL) { 481 | RETURN_FALSE; 482 | } 483 | 484 | p = keyword; 485 | i = 0; 486 | while (*p && *p != '\n' && *p != '\r') { 487 | alpha_key[i++] = (AlphaChar)*p; 488 | p++; 489 | } 490 | alpha_key[i] = TRIE_CHAR_TERM; 491 | 492 | if (! 
trie_delete(trie, alpha_key)) { 493 | RETURN_FALSE; 494 | } 495 | RETURN_TRUE; 496 | } 497 | /* }}} */ 498 | 499 | /* {{{ proto bool trie_filter_save(int trie_tree_identifier, string dict_path) 500 | Returns true, or false on error*/ 501 | PHP_FUNCTION(trie_filter_save) 502 | { 503 | Trie *trie; 504 | zval *trie_resource; 505 | unsigned char *filename; 506 | #if PHP_MAJOR_VERSION < 7 507 | int filename_len; 508 | #else 509 | size_t filename_len; 510 | #endif 511 | 512 | if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "rs", 513 | &trie_resource, &filename, &filename_len) == FAILURE) { 514 | RETURN_FALSE; 515 | } 516 | if (filename_len < 1 || strlen(filename) != filename_len) { 517 | php_error_docref(NULL TSRMLS_CC, E_WARNING, "save path required"); 518 | RETURN_FALSE; 519 | } 520 | 521 | #if PHP_MAJOR_VERSION < 7 522 | ZEND_FETCH_RESOURCE(trie, Trie *, &trie_resource, -1, PHP_TRIE_FILTER_RES_NAME, le_trie_filter); 523 | #else 524 | trie = zend_fetch_resource(Z_RES_P(trie_resource), PHP_TRIE_FILTER_RES_NAME, le_trie_filter); 525 | #endif 526 | 527 | if (trie == NULL) { 528 | RETURN_FALSE; 529 | } 530 | 531 | if (trie_save(trie, filename)) { 532 | RETURN_FALSE; 533 | } 534 | RETURN_TRUE; 535 | } 536 | /* }}} */ 537 | 538 | /* {{{ proto string trie_filter_write(int trie_tree_identifier) 539 | Returns true, or false on error*/ 540 | PHP_FUNCTION(trie_filter_write) 541 | { 542 | Trie *trie; 543 | zval *trie_resource; 544 | FILE *fp; 545 | int res = 0; 546 | char *p; 547 | zend_size_t plen; 548 | 549 | if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "r", &trie_resource) == FAILURE) { 550 | RETURN_FALSE; 551 | } 552 | 553 | TRIE_ZEND_FETCH_RESOURCE(trie, Trie *, &trie_resource, -1, PHP_TRIE_FILTER_RES_NAME, le_trie_filter); 554 | if (trie == NULL) { 555 | RETURN_FALSE; 556 | } 557 | 558 | fp = open_memstream(&p, &plen); 559 | res = trie_fwrite(trie, fp); 560 | fclose(fp); 561 | 562 | if (res) { 563 | RETURN_FALSE; 564 | } 565 | 566 | RETVAL_STRINGL(p, plen); 567 
| free(p); 568 | } 569 | /* }}} */ 570 | 571 | /* {{{ proto bool trie_filter_free(int trie_tree_identifier) 572 | Returns true, or false on error*/ 573 | PHP_FUNCTION(trie_filter_free) 574 | { 575 | Trie *trie; 576 | zval *trie_resource; 577 | 578 | #if PHP_MAJOR_VERSION < 7 579 | int resource_id; 580 | #endif 581 | 582 | 583 | if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "r", &trie_resource) == FAILURE) { 584 | RETURN_FALSE; 585 | } 586 | 587 | #if PHP_MAJOR_VERSION < 7 588 | ZEND_FETCH_RESOURCE(trie, Trie *, &trie_resource, -1, PHP_TRIE_FILTER_RES_NAME, le_trie_filter); 589 | resource_id = Z_RESVAL_P(trie_resource); 590 | #else 591 | trie = zend_fetch_resource(Z_RES_P(trie_resource), PHP_TRIE_FILTER_RES_NAME, le_trie_filter); 592 | #endif 593 | 594 | if (trie == NULL) { 595 | RETURN_FALSE; 596 | } 597 | 598 | #if PHP_MAJOR_VERSION < 7 599 | if (zend_list_delete(resource_id) == SUCCESS) { 600 | #else 601 | if (zend_list_close(Z_RES_P(trie_resource)) == SUCCESS) { 602 | #endif 603 | RETURN_TRUE; 604 | } 605 | RETURN_FALSE; 606 | } 607 | /* }}} */ 608 | 609 | /* 610 | * Local variables: 611 | * tab-width: 4 612 | * c-basic-offset: 4 613 | * End: 614 | * vim600: noet sw=4 ts=4 fdm=marker 615 | * vim<600: noet sw=4 ts=4 616 | */ 617 | -------------------------------------------------------------------------------- /triefilter.php: -------------------------------------------------------------------------------- 1 | 5 | * @link https://github.com/wulijun/php-ext-trie-filter 6 | */ 7 | 8 | /** 9 | * Load a trie tree from a saved trie tree file 10 | * 11 | * @param string $strDictFile Path to saved trie tree file 12 | * @return resource The trie tree handler or NULL on error. 
13 | */ 14 | function trie_filter_load($strDictFile) { 15 | 16 | } 17 | 18 | /** 19 | * Find if a spam word exists in the content 20 | * 21 | * @param resource $resTree 22 | * @param string $strContent 23 | * @return array spam word info, like Array(0 => start position, 1 => spam word len), if no one found, return empty array, false on error. 24 | * @example 25 | *
 26 |  * $arrSpamWord = trie_filter_search($resTree, $strContent);
 27 |  * if (! empty($arrSpamWord)) {
 28 |  *     echo substr($strContent, $arrSpamWord[0], $arrSpamWord[1]);
 29 |  * }
 30 |  * 
31 | */ 32 | function trie_filter_search($resTree, $strContent) { 33 | 34 | } 35 | 36 | /** 37 | * Find all spam word exists in the content 38 | * 39 | * @param resource $resTree 40 | * @param string $strContent 41 | * @return array spam word info, like Array(Array(0 => start position, 1 => spam word len)), if no one found, return empty array, false on error. 42 | * @example 43 | *
 44 |  * $arrSpamWord = trie_filter_search_all($resTree, $strContent);
 45 |  * if (! empty($arrSpamWord)) {
 46 |  *     foreach ($arrSpamWord as $arrOneWord) {
 47 |  *         echo substr($strContent, $arrOneWord[0], $arrOneWord[1]);
 48 |  *     }
 49 |  * }
 50 |  * 
51 | */ 52 | function trie_filter_search_all($resTree, $strContent) { 53 | 54 | } 55 | 56 | /** 57 | * Create an empty trie tree 58 | * 59 | * @return resource The trie tree handler or NULL on error. 60 | */ 61 | function trie_filter_new() { 62 | 63 | } 64 | 65 | /** 66 | * Add a word to the trie tree 67 | * 68 | * @param resource $resTree 69 | * @param string $strWord 70 | * @return bool true on success or false on error. 71 | */ 72 | function trie_filter_store($resTree, $strWord) { 73 | 74 | } 75 | 76 | /* {{{ proto array (int trie_tree_identifier, string dict_path) 77 | Returns true, or false on error*/ 78 | 79 | /** 80 | * Save trie tree to a file 81 | * 82 | * @param resource $resTree 83 | * @param string $strDictFile 84 | * @return bool true on success or false on error. 85 | */ 86 | function trie_filter_save($resTree, $strDictFile) { 87 | 88 | } 89 | 90 | /** 91 | * Free trie tree 92 | * 93 | * Trie tree will be destructed automaticly when script finished, however, you can free it yourself. 94 | * 95 | * @param resource $resTree 96 | * @return bool true on success or false on error. 97 | */ 98 | function trie_filter_free($resTree) { 99 | 100 | } 101 | -------------------------------------------------------------------------------- /triefiltertest.php: -------------------------------------------------------------------------------- 1 | $v) { 6 | trie_filter_store($resTrie, $v); 7 | } 8 | trie_filter_save($resTrie, __DIR__ . '/blackword.tree'); 9 | 10 | $resTrie = trie_filter_load(__DIR__ . 
'/blackword.tree'); 11 | $str='hello word2 haha word1 word4 word2'; 12 | $arrRet = trie_filter_search($resTrie, $str); 13 | print_all($str,array($arrRet)); //Array(0 => 6, 1 => 5) 14 | echo "\ntest1///////////////////\n"; 15 | $str = 'hello word2 haha word1 word4 word2'; 16 | $arrRet = trie_filter_search_all($resTrie, $str); 17 | print_all($str, $arrRet); 18 | echo "\ntest2///////////////////\n"; 19 | $str = 'hello word'; 20 | $arrRet = trie_filter_search($resTrie, $str); 21 | print_all($str, array($arrRet)); //Array() 22 | $arrRet = trie_filter_search_all($resTrie, 'hello word'); 23 | print_all($str, $arrRet); 24 | 25 | echo "\ntest3///////////////////\n"; 26 | echo "start memory=".memory_get_usage(true)."\n";date_default_timezone_set('Asia/Chongqing'); 27 | $test = array('a', 'abd', 'dad', 'pab', 'dda', 'word1f', 'cword1', 'cword1t'); 28 | foreach ($test as $v) { 29 | // echo "per start memory=".memory_get_usage(true)."\n"; 30 | $arrRet = trie_filter_search_all($resTrie, $v); 31 | // echo "per end memory=".memory_get_usage(true)."\n"; 32 | //print_all($v, $arrRet); 33 | } 34 | echo "end memory=".memory_get_usage(true)."\n"; 35 | echo date('Y-m-d H:i:s'); 36 | trie_filter_free($resTrie); 37 | function print_all($str, $res) {//print_r($res); 38 | echo "$str\n"; 39 | foreach ($res as $k => $v) { 40 | echo $k."=>{$v[0]}-{$v[1]}-".substr($str, $v[0], $v[1])."\n"; 41 | } 42 | } 43 | 44 | --------------------------------------------------------------------------------