├── README.md ├── config.m4 ├── php7_wrapper.h ├── php_trie_filter.h ├── trie_filter.c ├── triefilter.php └── triefiltertest.php /README.md: -------------------------------------------------------------------------------- 1 | php-ext-trie-filter 2 | =================== 3 | 4 | A PHP extension for spam-word filtering based on a Double-Array Trie; it detects whether a spam word exists in a text message. 5 | 6 | 关键词过滤扩展,用于检查一段文本中是否出现敏感词,基于Double-Array Trie 树实现。 7 | 8 | ## 升级历史 9 | 10 | ### 2017-08-08 11 | 1. 同时支持php5&php7 12 | 1. 新增方法: 13 | 1. trie_filter_read,从string中读取二进制字典数据 14 | 1. trie_filter_write,将当前对象导出成二进制string 15 | 1. trie_filter_delete,从当前对象中删除一个word 16 | 17 | ### 2013-06-23 18 | 1. trie_filter_search_all,一次返回所有的命中词 19 | 1. 修复内存泄露 20 | 21 | ## 依赖库 22 | 23 | [libdatrie-0.2.4 or later](http://linux.thai.net/~thep/datrie/datrie.html) 24 | 25 | ## 安装步骤 26 | 27 | 下面的$LIB_PATH为依赖库安装目录,$INSTALL_PHP_PATH为PHP安装目录。 28 | 29 | ### 安装libdatrie 30 | ``` 31 | $ tar zxvf libdatrie-0.2.4.tar.gz 32 | $ cd libdatrie-0.2.4 33 | $ make clean 34 | $ ./configure --prefix=$LIB_PATH 35 | $ make 36 | $ make install 37 | ``` 38 | ### 安装扩展 39 | ``` 40 | $ $INSTALL_PHP_PATH/bin/phpize 41 | $ ./configure --with-php-config=$INSTALL_PHP_PATH/bin/php-config --with-trie_filter=$LIB_PATH 42 | $ make 43 | $ make install 44 | ``` 45 | 然后修改php.ini,增加一行:extension=trie_filter.so,然后重启PHP。 46 | 47 | ## 使用示例 48 | ``` 49 | <?php 50 | $arrWord = array('word1', 'word2'); 51 | $resTrie = trie_filter_new(); 52 | foreach ($arrWord as $k => $v) { 53 | trie_filter_store($resTrie, $v); 54 | } 55 | trie_filter_save($resTrie, __DIR__ . '/blackword.tree'); 56 | 57 | $resTrie = trie_filter_load(__DIR__ . 
'/blackword.tree'); 58 | 59 | $strContent = 'hello word2 word1'; 60 | $arrRet = trie_filter_search($resTrie, $strContent); 61 | print_r($arrRet); //Array(0 => 6, 1 => 5) 62 | echo substr($strContent, $arrRet[0], $arrRet[1]); //word2 63 | $arrRet = trie_filter_search_all($resTrie, $strContent); 64 | print_r($arrRet); //Array(0 => Array(0 => 6, 1 => 5), 1 => Array(0 => 12, 1 => 5)) 65 | 66 | $arrRet = trie_filter_search($resTrie, 'hello word'); 67 | print_r($arrRet); //Array() 68 | 69 | trie_filter_free($resTrie); 70 | ``` 71 | ## PHP版本 72 | 73 | PHP 5.2 ~ 7.1. 74 | 75 | Windows is not supported yet. 76 | 77 | ## License 78 | 79 | Apache License 2.0 80 | 81 | ## 致谢 82 | 83 | 本项目是在[用于检测敏感词的 PHP 扩展](http://blog.anbutu.com/php/php-ext-trie-filter)的基础上改写的。 84 | 85 | -------------------------------------------------------------------------------- /config.m4: -------------------------------------------------------------------------------- 1 | dnl $Id$ 2 | dnl config.m4 for extension trie_filter 3 | 4 | dnl Comments in this file start with the string 'dnl'. 5 | dnl Remove where necessary. This file will not work 6 | dnl without editing. 
7 | 8 | dnl If your extension references something external, use with: 9 | 10 | PHP_ARG_WITH(trie_filter, for trie_filter support, 11 | [ --with-trie_filter Include trie_filter support]) 12 | 13 | if test "$PHP_TRIE_FILTER" != "no"; then 14 | SEARCH_PATH="/usr/local /usr" 15 | SEARCH_FOR="/include/datrie/trie.h" 16 | if test -r $PHP_TRIE_FILTER/$SEARCH_FOR; then 17 | TRIE_FILTER_DIR=$PHP_TRIE_FILTER 18 | else 19 | AC_MSG_CHECKING([for trie_filter files in default path]) 20 | for i in $SEARCH_PATH ; do 21 | if test -r $i/$SEARCH_FOR; then 22 | TRIE_FILTER_DIR=$i 23 | AC_MSG_RESULT(found in $i) 24 | fi 25 | done 26 | fi 27 | 28 | if test -z "$TRIE_FILTER_DIR"; then 29 | AC_MSG_RESULT([not found]) 30 | AC_MSG_ERROR([Please install the libdatrie]) 31 | fi 32 | 33 | PHP_ADD_INCLUDE($TRIE_FILTER_DIR/include) 34 | 35 | LIBNAME=datrie 36 | LIBSYMBOL=trie_new_from_file 37 | 38 | PHP_CHECK_LIBRARY($LIBNAME,$LIBSYMBOL, 39 | [ 40 | PHP_ADD_LIBRARY_WITH_PATH($LIBNAME, $TRIE_FILTER_DIR/lib, TRIE_FILTER_SHARED_LIBADD) 41 | AC_DEFINE(HAVE_TRIE_FILTERLIB,1,[libdatrie found and included]) 42 | ],[ 43 | AC_MSG_ERROR([wrong libdatrie version or lib not found]) 44 | ],[ 45 | -L$TRIE_FILTER_DIR/lib -ldatrie 46 | ]) 47 | 48 | PHP_SUBST(TRIE_FILTER_SHARED_LIBADD) 49 | PHP_NEW_EXTENSION(trie_filter, trie_filter.c, $ext_shared) 50 | fi 51 | -------------------------------------------------------------------------------- /php7_wrapper.h: -------------------------------------------------------------------------------- 1 | #ifndef EXT_PHP_TRIE_FILTER_PHP7_WRAPPER_H_ 2 | #define EXT_PHP_TRIE_FILTER_PHP7_WRAPPER_H_ 3 | 4 | #include "ext/standard/php_http.h" 5 | 6 | #if PHP_MAJOR_VERSION < 7 7 | typedef int zend_size_t; 8 | 9 | #define TRIE_ZEND_REGISTER_RESOURCE ZEND_REGISTER_RESOURCE 10 | #define TRIE_ZEND_FETCH_RESOURCE ZEND_FETCH_RESOURCE 11 | #define TRIE_MAKE_STD_ZVAL(p) MAKE_STD_ZVAL(p) 12 | #define TRIE_RESOURCE_FREE(resource) zend_list_delete(Z_RESVAL_P(resource)) 13 | #else /* PHP 
Version 7 */ 14 | typedef size_t zend_size_t; 15 | typedef zend_resource zend_rsrc_list_entry; 16 | 17 | #define TRIE_ZEND_REGISTER_RESOURCE(return_value, result, le_result) ZVAL_RES(return_value,zend_register_resource(result, le_result)) 18 | #define TRIE_ZEND_FETCH_RESOURCE(rsrc, rsrc_type, passed_id, default_id, resource_type_name, resource_type) \ 19 | (rsrc = (rsrc_type) zend_fetch_resource(Z_RES_P(*passed_id), resource_type_name, resource_type)) 20 | #define TRIE_MAKE_STD_ZVAL(p) zval _stack_zval_##p; p = &(_stack_zval_##p) 21 | #define TRIE_RESOURCE_FREE(resource) zend_list_close(Z_RES_P(resource)) 22 | #endif /* PHP Version */ 23 | 24 | #endif /* EXT_PHP_TRIE_FILTER_PHP7_WRAPPER_H_ */ -------------------------------------------------------------------------------- /php_trie_filter.h: -------------------------------------------------------------------------------- 1 | /* 2 | +----------------------------------------------------------------------+ 3 | | PHP Version 5 | 4 | +----------------------------------------------------------------------+ 5 | | Copyright (c) 1997-2010 The PHP Group | 6 | +----------------------------------------------------------------------+ 7 | | This source file is subject to version 3.01 of the PHP license, | 8 | | that is bundled with this package in the file LICENSE, and is | 9 | | available through the world-wide-web at the following url: | 10 | | http://www.php.net/license/3_01.txt | 11 | | If you did not receive a copy of the PHP license and are unable to | 12 | | obtain it through the world-wide-web, please send a note to | 13 | | license@php.net so we can mail you a copy immediately. 
| 14 | +----------------------------------------------------------------------+ 15 | | Author: Lijun Wu | 16 | +----------------------------------------------------------------------+ 17 | */ 18 | 19 | /* $Id$ */ 20 | 21 | #include 22 | #include 23 | 24 | #ifndef PHP_TRIE_FILTER_H 25 | #define PHP_TRIE_FILTER_H 26 | 27 | extern zend_module_entry trie_filter_module_entry; 28 | #define phpext_trie_filter_ptr &trie_filter_module_entry 29 | 30 | #ifdef PHP_WIN32 31 | #define PHP_TRIE_FILTER_API __declspec(dllexport) 32 | #else 33 | #define PHP_TRIE_FILTER_API 34 | #endif 35 | 36 | #ifdef ZTS 37 | #include "TSRM.h" 38 | #endif 39 | 40 | #define ALPHA_CHARSET "UCS-4LE" 41 | #define PHP_TRIE_FILTER_RES_NAME "Trie tree filter" 42 | 43 | PHP_MINIT_FUNCTION(trie_filter); 44 | PHP_MSHUTDOWN_FUNCTION(trie_filter); 45 | PHP_RINIT_FUNCTION(trie_filter); 46 | PHP_RSHUTDOWN_FUNCTION(trie_filter); 47 | PHP_MINFO_FUNCTION(trie_filter); 48 | 49 | PHP_FUNCTION(trie_filter_load); 50 | PHP_FUNCTION(trie_filter_read); 51 | PHP_FUNCTION(trie_filter_search); 52 | PHP_FUNCTION(trie_filter_search_all); 53 | PHP_FUNCTION(trie_filter_new); 54 | PHP_FUNCTION(trie_filter_store); 55 | PHP_FUNCTION(trie_filter_delete); 56 | PHP_FUNCTION(trie_filter_save); 57 | PHP_FUNCTION(trie_filter_write); 58 | PHP_FUNCTION(trie_filter_free); 59 | 60 | #ifdef ZTS 61 | #define TRIE_FILTER_G(v) TSRMG(trie_filter_globals_id, zend_trie_filter_globals *, v) 62 | #else 63 | #define TRIE_FILTER_G(v) (trie_filter_globals.v) 64 | #endif 65 | 66 | #endif /* PHP_TRIE_FILTER_H */ 67 | 68 | 69 | /* 70 | * Local variables: 71 | * tab-width: 4 72 | * c-basic-offset: 4 73 | * End: 74 | * vim600: noet sw=4 ts=4 fdm=marker 75 | * vim<600: noet sw=4 ts=4 76 | */ 77 | -------------------------------------------------------------------------------- /trie_filter.c: -------------------------------------------------------------------------------- 1 | /* 2 | +----------------------------------------------------------------------+ 3 | | 
PHP Version 5 | 4 | +----------------------------------------------------------------------+ 5 | | Copyright (c) 1997-2010 The PHP Group | 6 | +----------------------------------------------------------------------+ 7 | | This source file is subject to version 3.01 of the PHP license, | 8 | | that is bundled with this package in the file LICENSE, and is | 9 | | available through the world-wide-web at the following url: | 10 | | http://www.php.net/license/3_01.txt | 11 | | If you did not receive a copy of the PHP license and are unable to | 12 | | obtain it through the world-wide-web, please send a note to | 13 | | license@php.net so we can mail you a copy immediately. | 14 | +----------------------------------------------------------------------+ 15 | | Author: Lijun Wu | 16 | +----------------------------------------------------------------------+ 17 | */ 18 | 19 | /* $Id$ */ 20 | 21 | #ifdef HAVE_CONFIG_H 22 | #include "config.h" 23 | #endif 24 | 25 | #include "php.h" 26 | #include "php_ini.h" 27 | #include "ext/standard/info.h" 28 | #include "php_trie_filter.h" 29 | 30 | #include "php7_wrapper.h" 31 | 32 | /* True global resources - no need for thread safety here */ 33 | static int le_trie_filter; 34 | 35 | /* {{{ trie_filter_functions[] 36 | * 37 | * Every user visible function must have an entry in trie_filter_functions[]. 
38 | */ 39 | zend_function_entry trie_filter_functions[] = { 40 | PHP_FE(trie_filter_load, NULL) 41 | PHP_FE(trie_filter_read, NULL) 42 | PHP_FE(trie_filter_search, NULL) 43 | PHP_FE(trie_filter_search_all, NULL) 44 | PHP_FE(trie_filter_new, NULL) 45 | PHP_FE(trie_filter_store, NULL) 46 | PHP_FE(trie_filter_delete, NULL) 47 | PHP_FE(trie_filter_save, NULL) 48 | PHP_FE(trie_filter_write, NULL) 49 | PHP_FE(trie_filter_free, NULL) 50 | {NULL, NULL, NULL} /* Must be the last line in trie_filter_functions[] */ 51 | }; 52 | /* }}} */ 53 | 54 | /* {{{ trie_filter_module_entry 55 | */ 56 | zend_module_entry trie_filter_module_entry = { 57 | #if ZEND_MODULE_API_NO >= 20010901 58 | STANDARD_MODULE_HEADER, 59 | #endif 60 | "trie_filter", 61 | trie_filter_functions, 62 | PHP_MINIT(trie_filter), 63 | PHP_MSHUTDOWN(trie_filter), 64 | NULL, 65 | NULL, 66 | PHP_MINFO(trie_filter), 67 | #if ZEND_MODULE_API_NO >= 20010901 68 | "0.1", /* Replace with version number for your extension */ 69 | #endif 70 | STANDARD_MODULE_PROPERTIES 71 | }; 72 | /* }}} */ 73 | 74 | #ifdef COMPILE_DL_TRIE_FILTER 75 | ZEND_GET_MODULE(trie_filter) 76 | #endif 77 | 78 | /* {{{ PHP_INI 79 | */ 80 | /* 81 | PHP_INI_BEGIN() 82 | PHP_INI_ENTRY("trie_filter.dict_charset", "utf-8", PHP_INI_ALL, NULL) 83 | PHP_INI_END() 84 | */ 85 | /* }}} */ 86 | #if PHP_MAJOR_VERSION < 7 87 | static void php_trie_filter_dtor(zend_rsrc_list_entry *rsrc TSRMLS_DC) 88 | #else 89 | static void php_trie_filter_dtor(zend_resource *rsrc TSRMLS_DC) 90 | #endif 91 | { 92 | Trie *trie = (Trie *)rsrc->ptr; 93 | trie_free(trie); 94 | } 95 | 96 | /* {{{ PHP_MINIT_FUNCTION 97 | */ 98 | PHP_MINIT_FUNCTION(trie_filter) 99 | { 100 | le_trie_filter = zend_register_list_destructors_ex( 101 | php_trie_filter_dtor, 102 | NULL, PHP_TRIE_FILTER_RES_NAME, module_number); 103 | return SUCCESS; 104 | } 105 | /* }}} */ 106 | 107 | /* {{{ PHP_MSHUTDOWN_FUNCTION 108 | */ 109 | PHP_MSHUTDOWN_FUNCTION(trie_filter) 110 | { 111 | return SUCCESS; 112 | } 113 | 
/* }}} */ 114 | 115 | /* {{{ PHP_MINFO_FUNCTION 116 | */ 117 | PHP_MINFO_FUNCTION(trie_filter) 118 | { 119 | php_info_print_table_start(); 120 | php_info_print_table_header(2, "trie_filter support", "enabled"); 121 | php_info_print_table_end(); 122 | } 123 | /* }}} */ 124 | 125 | /* {{{ proto resource trie_filter_load(string dict_file_path) 126 | Returns resource id, or NULL on error*/ 127 | PHP_FUNCTION(trie_filter_load) 128 | { 129 | Trie *trie; 130 | char *path; 131 | zend_size_t path_len; 132 | 133 | if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s", &path, &path_len) == FAILURE) { 134 | RETURN_NULL(); 135 | } 136 | 137 | trie = trie_new_from_file(path); 138 | if (!trie) { 139 | php_error_docref(NULL TSRMLS_CC, E_WARNING, "Unable to load %s", path); 140 | RETURN_NULL(); 141 | } 142 | TRIE_ZEND_REGISTER_RESOURCE(return_value, trie, le_trie_filter); 143 | } 144 | /* }}} */ 145 | 146 | /* {{{ proto resource trie_filter_read(string dict_bin) 147 | Returns resource id, or NULL on error*/ 148 | PHP_FUNCTION(trie_filter_read) 149 | { 150 | Trie *trie; 151 | char *path; 152 | zend_size_t path_len; 153 | FILE *fp; 154 | 155 | if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s", &path, &path_len) == FAILURE) { 156 | RETURN_NULL(); 157 | } 158 | 159 | fp = fmemopen(path, path_len, "rb"); 160 | trie = trie_fread(fp); 161 | fclose(fp); 162 | 163 | if (!trie) { 164 | php_error_docref(NULL TSRMLS_CC, E_WARNING, "Unable to read"); 165 | RETURN_NULL(); 166 | } 167 | TRIE_ZEND_REGISTER_RESOURCE(return_value, trie, le_trie_filter); 168 | } 169 | /* }}} */ 170 | 171 | static int trie_search_one(Trie *trie, const AlphaChar *text, int *offset, TrieData *length) 172 | { 173 | TrieState *s; 174 | const AlphaChar *p; 175 | const AlphaChar *base; 176 | 177 | base = text; 178 | if (! (s = trie_root(trie))) { 179 | return -1; 180 | } 181 | 182 | while (*text) { 183 | p = text; 184 | if (! 
trie_state_is_walkable(s, *p)) { 185 | trie_state_rewind(s); 186 | text++; 187 | continue; 188 | } else { 189 | trie_state_walk(s, *p++); 190 | } 191 | 192 | while (trie_state_is_walkable(s, *p) && ! trie_state_is_terminal(s)) 193 | trie_state_walk(s, *p++); 194 | 195 | if (trie_state_is_terminal(s)) { 196 | *offset = text - base; 197 | *length = p - text; 198 | trie_state_free(s); 199 | return 1; 200 | } 201 | 202 | trie_state_rewind(s); 203 | text++; 204 | } 205 | trie_state_free(s); 206 | 207 | return 0; 208 | } 209 | 210 | static int trie_search_all(Trie *trie, const AlphaChar *text, zval *data) 211 | { 212 | TrieState *s; 213 | const AlphaChar *p; 214 | const AlphaChar *base; 215 | #if PHP_MAJOR_VERSION < 7 216 | zval *word = NULL; 217 | #else 218 | zval word; 219 | #endif 220 | 221 | base = text; 222 | if (! (s = trie_root(trie))) { 223 | return -1; 224 | } 225 | 226 | while (*text) { 227 | p = text; 228 | if(! trie_state_is_walkable(s, *p)) { 229 | trie_state_rewind(s); 230 | text++; 231 | continue; 232 | } 233 | 234 | while(*p && trie_state_is_walkable(s, *p) && ! 
trie_state_is_leaf(s)) { 235 | trie_state_walk(s, *p++); 236 | if (trie_state_is_terminal(s)) { 237 | #if PHP_MAJOR_VERSION < 7 238 | MAKE_STD_ZVAL(word); 239 | array_init_size(word, 3); 240 | add_next_index_long(word, text - base); 241 | add_next_index_long(word, p - text); 242 | add_next_index_zval(data, word); 243 | #else 244 | array_init_size(&word, 3); 245 | add_next_index_long(&word, text - base); 246 | add_next_index_long(&word, p - text); 247 | add_next_index_zval(data, &word); 248 | #endif 249 | } 250 | } 251 | trie_state_rewind(s); 252 | text++; 253 | } 254 | trie_state_free(s); 255 | 256 | return 0; 257 | } 258 | 259 | /* {{{ proto array trie_filter_search(int trie_tree_identifier, string centent) 260 | Returns info about first keyword, or false on error*/ 261 | PHP_FUNCTION(trie_filter_search) 262 | { 263 | Trie *trie; 264 | zval *trie_resource; 265 | unsigned char *text; 266 | #if PHP_MAJOR_VERSION < 7 267 | int text_len; 268 | #else 269 | size_t text_len; 270 | #endif 271 | 272 | int offset = -1, i, ret; 273 | TrieData length = 0; 274 | 275 | AlphaChar *alpha_text; 276 | 277 | if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "rs", &trie_resource, &text, &text_len) == FAILURE) { 278 | RETURN_FALSE; 279 | } 280 | 281 | array_init(return_value); 282 | if (text_len < 1 || strlen(text) != text_len) { 283 | php_error_docref(NULL TSRMLS_CC, E_NOTICE, "input is empty"); 284 | return; 285 | } 286 | 287 | #if PHP_MAJOR_VERSION < 7 288 | ZEND_FETCH_RESOURCE(trie, Trie *, &trie_resource, -1, 289 | PHP_TRIE_FILTER_RES_NAME, le_trie_filter); 290 | #else 291 | trie = zend_fetch_resource(Z_RES_P(trie_resource), PHP_TRIE_FILTER_RES_NAME, le_trie_filter); 292 | #endif 293 | if (trie == NULL) { 294 | RETURN_FALSE; 295 | } 296 | 297 | alpha_text = emalloc(sizeof(AlphaChar) * (text_len + 1)); 298 | 299 | for (i = 0; i < text_len; i++) { 300 | alpha_text[i] = (AlphaChar) text[i]; 301 | } 302 | 303 | alpha_text[text_len] = TRIE_CHAR_TERM; 304 | 305 | ret = 
trie_search_one(trie, alpha_text, &offset, &length); 306 | efree(alpha_text); 307 | if (ret == 0) { 308 | return; 309 | } else if (ret == 1) { 310 | add_next_index_long(return_value, offset); 311 | add_next_index_long(return_value, length); 312 | } else { 313 | RETURN_FALSE; 314 | } 315 | } 316 | /* }}} */ 317 | 318 | /* {{{ proto array trie_filter_search_all(int trie_tree_identifier, string centent) 319 | Returns info about all keywords, or false on error*/ 320 | PHP_FUNCTION(trie_filter_search_all) 321 | { 322 | Trie *trie; 323 | zval *trie_resource; 324 | unsigned char *text; 325 | #if PHP_MAJOR_VERSION < 7 326 | int text_len; 327 | #else 328 | size_t text_len; 329 | #endif 330 | 331 | int i, ret; 332 | 333 | AlphaChar *alpha_text; 334 | 335 | if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "rs", 336 | &trie_resource, &text, &text_len) == FAILURE) { 337 | RETURN_FALSE; 338 | } 339 | 340 | array_init(return_value); 341 | if (text_len < 1 || strlen(text) != text_len) { 342 | php_error_docref(NULL TSRMLS_CC, E_NOTICE, "input is empty"); 343 | return; 344 | } 345 | 346 | #if PHP_MAJOR_VERSION < 7 347 | ZEND_FETCH_RESOURCE(trie, Trie *, &trie_resource, -1, 348 | PHP_TRIE_FILTER_RES_NAME, le_trie_filter); 349 | #else 350 | trie = (Trie*) zend_fetch_resource(Z_RES_P(trie_resource), PHP_TRIE_FILTER_RES_NAME, le_trie_filter); 351 | #endif 352 | if (trie == NULL) { 353 | RETURN_FALSE; 354 | } 355 | 356 | alpha_text = emalloc(sizeof(AlphaChar) * (text_len + 1)); 357 | 358 | for (i = 0; i < text_len; i++) { 359 | alpha_text[i] = (AlphaChar) text[i]; 360 | } 361 | 362 | alpha_text[text_len] = TRIE_CHAR_TERM; 363 | 364 | ret = trie_search_all(trie, alpha_text, return_value); 365 | efree(alpha_text); 366 | if (ret == 0) { 367 | return; 368 | } else { 369 | RETURN_FALSE; 370 | } 371 | } 372 | /* }}} */ 373 | 374 | /* {{{ proto resource trie_filter_new() 375 | Returns resource id, or NULL on error*/ 376 | PHP_FUNCTION(trie_filter_new) 377 | { 378 | Trie *trie; 379 | 
AlphaMap *alpha_map; 380 | int ret; 381 | 382 | alpha_map = alpha_map_new(); 383 | if (! alpha_map) { 384 | RETURN_NULL(); 385 | } 386 | 387 | if (alpha_map_add_range(alpha_map, 0x00, 0xff) != 0) { 388 | /* treat all strings as byte stream */ 389 | alpha_map_free(alpha_map); 390 | RETURN_NULL(); 391 | } 392 | 393 | trie = trie_new(alpha_map); 394 | alpha_map_free(alpha_map); 395 | if (! trie) { 396 | RETURN_NULL(); 397 | } 398 | 399 | #if PHP_MAJOR_VERSION < 7 400 | ZEND_REGISTER_RESOURCE(return_value, trie, le_trie_filter); 401 | #else 402 | RETURN_RES(zend_register_resource(trie, le_trie_filter)); 403 | #endif 404 | } 405 | /* }}} */ 406 | 407 | #define KEYWORD_MAX_LEN 1024 408 | /* {{{ proto bool trie_filter_store(int trie_tree_identifier, string keyword) 409 | Returns true, or false on error*/ 410 | PHP_FUNCTION(trie_filter_store) 411 | { 412 | Trie *trie; 413 | zval *trie_resource; 414 | unsigned char *keyword, *p; 415 | #if PHP_MAJOR_VERSION < 7 416 | int keyword_len, i; 417 | #else 418 | size_t keyword_len; 419 | int i; 420 | #endif 421 | AlphaChar alpha_key[KEYWORD_MAX_LEN+1]; 422 | 423 | if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "rs", 424 | &trie_resource, &keyword, &keyword_len) == FAILURE) { 425 | RETURN_FALSE; 426 | } 427 | 428 | if (keyword_len > KEYWORD_MAX_LEN || keyword_len < 1) { 429 | php_error_docref(NULL TSRMLS_CC, E_WARNING, "keyword should has [1, %d] bytes", KEYWORD_MAX_LEN); 430 | RETURN_FALSE; 431 | } 432 | 433 | #if PHP_MAJOR_VERSION < 7 434 | ZEND_FETCH_RESOURCE(trie, Trie *, &trie_resource, -1, PHP_TRIE_FILTER_RES_NAME, le_trie_filter); 435 | #else 436 | trie = zend_fetch_resource(Z_RES_P(trie_resource), PHP_TRIE_FILTER_RES_NAME, le_trie_filter); 437 | #endif 438 | 439 | if (trie == NULL) { 440 | RETURN_FALSE; 441 | } 442 | 443 | p = keyword; 444 | i = 0; 445 | while (*p && *p != '\n' && *p != '\r') { 446 | alpha_key[i++] = (AlphaChar)*p; 447 | p++; 448 | } 449 | alpha_key[i] = TRIE_CHAR_TERM; 450 | 451 | if (! 
trie_store(trie, alpha_key, -1)) { 452 | RETURN_FALSE; 453 | } 454 | RETURN_TRUE; 455 | } 456 | /* }}} */ 457 | 458 | /* {{{ proto bool trie_filter_delete(int trie_tree_identifier, string keyword) 459 | Returns true, or false on error*/ 460 | PHP_FUNCTION(trie_filter_delete) 461 | { 462 | Trie *trie; 463 | zval *trie_resource; 464 | unsigned char *keyword, *p; 465 | zend_size_t keyword_len; 466 | int i; 467 | AlphaChar alpha_key[KEYWORD_MAX_LEN+1]; 468 | 469 | if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "rs", 470 | &trie_resource, &keyword, &keyword_len) == FAILURE) { 471 | RETURN_FALSE; 472 | } 473 | if (keyword_len > KEYWORD_MAX_LEN || keyword_len < 1) { 474 | php_error_docref(NULL TSRMLS_CC, E_WARNING, "keyword should has [1, %d] bytes", KEYWORD_MAX_LEN); 475 | RETURN_FALSE; 476 | } 477 | 478 | TRIE_ZEND_FETCH_RESOURCE(trie, Trie *, &trie_resource, -1, PHP_TRIE_FILTER_RES_NAME, le_trie_filter); 479 | if (trie == NULL) { 480 | RETURN_FALSE; 481 | } 482 | 483 | p = keyword; 484 | i = 0; 485 | while (*p && *p != '\n' && *p != '\r') { 486 | alpha_key[i++] = (AlphaChar)*p; 487 | p++; 488 | } 489 | alpha_key[i] = TRIE_CHAR_TERM; 490 | 491 | if (! 
trie_delete(trie, alpha_key)) { 492 | RETURN_FALSE; 493 | } 494 | RETURN_TRUE; 495 | } 496 | /* }}} */ 497 | 498 | /* {{{ proto bool trie_filter_save(int trie_tree_identifier, string dict_path) 499 | Returns true, or false on error*/ 500 | PHP_FUNCTION(trie_filter_save) 501 | { 502 | Trie *trie; 503 | zval *trie_resource; 504 | unsigned char *filename; 505 | #if PHP_MAJOR_VERSION < 7 506 | int filename_len; 507 | #else 508 | size_t filename_len; 509 | #endif 510 | 511 | if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "rs", 512 | &trie_resource, &filename, &filename_len) == FAILURE) { 513 | RETURN_FALSE; 514 | } 515 | if (filename_len < 1) { 516 | php_error_docref(NULL TSRMLS_CC, E_WARNING, "save path required"); 517 | RETURN_FALSE; 518 | } 519 | 520 | #if PHP_MAJOR_VERSION < 7 521 | ZEND_FETCH_RESOURCE(trie, Trie *, &trie_resource, -1, PHP_TRIE_FILTER_RES_NAME, le_trie_filter); 522 | #else 523 | trie = zend_fetch_resource(Z_RES_P(trie_resource), PHP_TRIE_FILTER_RES_NAME, le_trie_filter); 524 | #endif 525 | 526 | if (trie == NULL) { 527 | RETURN_FALSE; 528 | } 529 | 530 | if (trie_save(trie, filename)) { 531 | RETURN_FALSE; 532 | } 533 | RETURN_TRUE; 534 | } 535 | /* }}} */ 536 | 537 | /* {{{ proto string trie_filter_write(int trie_tree_identifier) 538 | Returns true, or false on error*/ 539 | PHP_FUNCTION(trie_filter_write) 540 | { 541 | Trie *trie; 542 | zval *trie_resource; 543 | FILE *fp; 544 | int res = 0; 545 | char *p; 546 | zend_size_t plen; 547 | 548 | if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "r", &trie_resource) == FAILURE) { 549 | RETURN_FALSE; 550 | } 551 | 552 | TRIE_ZEND_FETCH_RESOURCE(trie, Trie *, &trie_resource, -1, PHP_TRIE_FILTER_RES_NAME, le_trie_filter); 553 | if (trie == NULL) { 554 | RETURN_FALSE; 555 | } 556 | 557 | fp = open_memstream(&p, &plen); 558 | res = trie_fwrite(trie, fp); 559 | fclose(fp); 560 | 561 | if (res) { 562 | RETURN_FALSE; 563 | } 564 | 565 | RETVAL_STRINGL(p, plen); 566 | free(p); 567 | } 568 | /* }}} */ 
569 | 570 | /* {{{ proto bool trie_filter_free(int trie_tree_identifier) 571 | Returns true, or false on error*/ 572 | PHP_FUNCTION(trie_filter_free) 573 | { 574 | Trie *trie; 575 | zval *trie_resource; 576 | 577 | #if PHP_MAJOR_VERSION < 7 578 | int resource_id; 579 | #endif 580 | 581 | 582 | if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "r", &trie_resource) == FAILURE) { 583 | RETURN_FALSE; 584 | } 585 | 586 | #if PHP_MAJOR_VERSION < 7 587 | ZEND_FETCH_RESOURCE(trie, Trie *, &trie_resource, -1, PHP_TRIE_FILTER_RES_NAME, le_trie_filter); 588 | resource_id = Z_RESVAL_P(trie_resource); 589 | #else 590 | trie = zend_fetch_resource(Z_RES_P(trie_resource), PHP_TRIE_FILTER_RES_NAME, le_trie_filter); 591 | #endif 592 | 593 | if (trie == NULL) { 594 | RETURN_FALSE; 595 | } 596 | 597 | #if PHP_MAJOR_VERSION < 7 598 | if (zend_list_delete(resource_id) == SUCCESS) { 599 | #else 600 | if (zend_list_close(Z_RES_P(trie_resource)) == SUCCESS) { 601 | #endif 602 | RETURN_TRUE; 603 | } 604 | RETURN_FALSE; 605 | } 606 | /* }}} */ 607 | 608 | /* 609 | * Local variables: 610 | * tab-width: 4 611 | * c-basic-offset: 4 612 | * End: 613 | * vim600: noet sw=4 ts=4 fdm=marker 614 | * vim<600: noet sw=4 ts=4 615 | */ 616 | -------------------------------------------------------------------------------- /triefilter.php: -------------------------------------------------------------------------------- 1 | 5 | * @link https://github.com/wulijun/php-ext-trie-filter 6 | */ 7 | 8 | /** 9 | * Load a trie tree from a saved trie tree file 10 | * 11 | * @param string $strDictFile Path to saved trie tree file 12 | * @return resource The trie tree handler or NULL on error. 
13 | */ 14 | function trie_filter_load($strDictFile) { 15 | 16 | } 17 | 18 | /** 19 | * Find if a spam word exists in the content 20 | * 21 | * @param resource $resTree 22 | * @param string $strContent 23 | * @return array spam word info, like Array(0 => start position, 1 => spam word len), if no one found, return empty array, false on error. 24 | * @example 25 | *
 26 |  * $arrSpamWord = trie_filter_search($resTree, $strContent);
 27 |  * if (! empty($arrSpamWord)) {
 28 |  *     echo substr($strContent, $arrSpamWord[0], $arrSpamWord[1]);
 29 |  * }
 30 |  * 
31 | */ 32 | function trie_filter_search($resTree, $strContent) { 33 | 34 | } 35 | 36 | /** 37 | * Find all spam word exists in the content 38 | * 39 | * @param resource $resTree 40 | * @param string $strContent 41 | * @return array spam word info, like Array(Array(0 => start position, 1 => spam word len)), if no one found, return empty array, false on error. 42 | * @example 43 | *
 44 |  * $arrSpamWord = trie_filter_search_all($resTree, $strContent);
 45 |  * if (! empty($arrSpamWord)) {
 46 |  *     foreach ($arrSpamWord as $arrOneWord) {
 47 |  *         echo substr($strContent, $arrOneWord[0], $arrOneWord[1]);
 48 |  *     }
 49 |  * }
 50 |  * 
51 | */ 52 | function trie_filter_search_all($resTree, $strContent) { 53 | 54 | } 55 | 56 | /** 57 | * Create an empty trie tree 58 | * 59 | * @return resource The trie tree handler or NULL on error. 60 | */ 61 | function trie_filter_new() { 62 | 63 | } 64 | 65 | /** 66 | * Add a word to the trie tree 67 | * 68 | * @param resource $resTree 69 | * @param string $strWord 70 | * @return bool true on success or false on error. 71 | */ 72 | function trie_filter_store($resTree, $strWord) { 73 | 74 | } 75 | 76 | /* {{{ proto array (int trie_tree_identifier, string dict_path) 77 | Returns true, or false on error*/ 78 | 79 | /** 80 | * Save trie tree to a file 81 | * 82 | * @param resource $resTree 83 | * @param string $strDictFile 84 | * @return bool true on success or false on error. 85 | */ 86 | function trie_filter_save($resTree, $strDictFile) { 87 | 88 | } 89 | 90 | /** 91 | * Free trie tree 92 | * 93 | * The trie tree is destroyed automatically when the script finishes; however, you can also free it yourself. 94 | * 95 | * @param resource $resTree 96 | * @return bool true on success or false on error. 97 | */ 98 | function trie_filter_free($resTree) { 99 | 100 | } 101 | -------------------------------------------------------------------------------- /triefiltertest.php: -------------------------------------------------------------------------------- 1 | $v) { 6 | trie_filter_store($resTrie, $v); 7 | } 8 | trie_filter_save($resTrie, __DIR__ . '/blackword.tree'); 9 | 10 | $resTrie = trie_filter_load(__DIR__ . 
'/blackword.tree'); 11 | $str='hello word2 haha word1 word4 word2'; 12 | $arrRet = trie_filter_search($resTrie, $str); 13 | print_all($str,array($arrRet)); //Array(0 => 6, 1 => 5) 14 | echo "\ntest1///////////////////\n"; 15 | $str = 'hello word2 haha word1 word4 word2'; 16 | $arrRet = trie_filter_search_all($resTrie, $str); 17 | print_all($str, $arrRet); 18 | echo "\ntest2///////////////////\n"; 19 | $str = 'hello word'; 20 | $arrRet = trie_filter_search($resTrie, $str); 21 | print_all($str, array($arrRet)); //Array() 22 | $arrRet = trie_filter_search_all($resTrie, 'hello word'); 23 | print_all($str, $arrRet); 24 | 25 | echo "\ntest3///////////////////\n"; 26 | echo "start memory=".memory_get_usage(true)."\n";date_default_timezone_set('Asia/Chongqing'); 27 | $test = array('a', 'abd', 'dad', 'pab', 'dda', 'word1f', 'cword1', 'cword1t'); 28 | foreach ($test as $v) { 29 | // echo "per start memory=".memory_get_usage(true)."\n"; 30 | $arrRet = trie_filter_search_all($resTrie, $v); 31 | // echo "per end memory=".memory_get_usage(true)."\n"; 32 | //print_all($v, $arrRet); 33 | } 34 | echo "end memory=".memory_get_usage(true)."\n"; 35 | echo date('Y-m-d H:i:s'); 36 | trie_filter_free($resTrie); 37 | function print_all($str, $res) {//print_r($res); 38 | echo "$str\n"; 39 | foreach ($res as $k => $v) { 40 | echo $k."=>{$v[0]}-{$v[1]}-".substr($str, $v[0], $v[1])."\n"; 41 | } 42 | } 43 | 44 | --------------------------------------------------------------------------------