├── .gitignore ├── README.md ├── composer.json ├── phpunit.xml ├── src ├── DfaFilter │ ├── Exceptions │ │ ├── PdsBusinessException.php │ │ └── PdsSystemException.php │ ├── HashMap.php │ ├── SensitiveHelper.php │ └── functions.php └── README.md └── tests ├── BaseTest.php ├── ProTest.php ├── bootstrap.php └── data ├── keyWord.txt └── words.txt /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | .idea/ 3 | 4 | /vendor/ 5 | 6 | /composer.lock -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # php-DFA-filterWord 2 | php实现基于确定有穷自动机算法的敏感词过滤 https://packagist.org/packages/lustre/php-dfa-sensitive 3 | 4 | ## 安装&使用流程 5 | ### Download and install Composer: 6 | curl -sS https://getcomposer.org/installer | php 7 | > 要检查 Composer 是否正常工作,只需要通过 php 来执行 PHAR 8 | 9 | php composer.phar 10 | 11 | ### 安装扩展 12 | 13 | composer require lustre/php-dfa-sensitive 14 | 15 | * 注意:如果你在使用composer安装时,出现 16 | Could not find package lustre/php-dfa-sensitive at any version for your minimum-stability (stable). Check the package spelling or your minimum-stability 请在你的composer.json中加入"minimum-stability": "dev" 17 | 18 | 19 | 20 | 21 | #### 如果你需要手动引入 22 | 23 | require './vendor/autoload.php'; 24 | 25 | use DfaFilter\SensitiveHelper; 26 | 27 | ### 构建敏感词库树 28 | 场景一: 可以拿到不同(用户)词库数组 29 | 30 | // 获取敏感词库索引数组 31 | $wordData = array( 32 | '察象蚂', 33 | '拆迁灭', 34 | '车牌隐', 35 | '成人电', 36 | '成人卡通', 37 | ...... 
38 | ); 39 | 40 | // get one helper 41 | $handle = SensitiveHelper::init()->setTree($wordData); 42 | 43 | 场景二: 全站使用一套敏感词库 44 | 45 | // 获取敏感词库文件路径 46 | $wordFilePath = 'tests/data/words.txt'; 47 | 48 | // get one helper 49 | $handle = SensitiveHelper::init()->setTreeByFile($wordFilePath); 50 | 51 | ### 检测是否含有敏感词 52 | 53 | $islegal = $handle->islegal($content); 54 | ### 敏感词过滤 55 | 56 | // 敏感词替换为*为例(会替换为相同字符长度的*) 57 | $filterContent = $handle->replace($content, '*', true); 58 | 59 | // 或敏感词替换为***为例 60 | $filterContent = $handle->replace($content, '***'); 61 | 62 | ### 标记敏感词 63 | $markedContent = $handle->mark($content, '', ''); 64 | 65 | ### 获取文字中的敏感词 66 | 67 | // 获取内容中所有的敏感词 68 | $sensitiveWordGroup = $handle->getBadWord($content); 69 | // 仅获取一个敏感词(第二个参数为匹配类型,第三个参数为返回数量上限) 70 | $sensitiveWordGroup = $handle->getBadWord($content, 1, 1); 71 | 72 | ### 如何使用单元测试进行测试 73 | #### 安装PHPUnit 74 | ```bash 75 | $ wget https://phar.phpunit.de/phpunit.phar 76 | 77 | $ chmod +x phpunit.phar 78 | 79 | $ mv phpunit.phar /usr/local/bin/phpunit 80 | ``` 81 | #### 使用composer自动加载php命名空间 82 | 83 | ```bash 84 | $ composer update 85 | ``` 86 | ### 运行单元测试 87 | ```bash 88 | $ phpunit tests/BaseTest.php 89 | ``` 90 | 91 | *如果大家有更好的建议,请大家多多指正,O(∩_∩)O谢谢* 92 | 93 | *你们的star是我的动力* 94 | -------------------------------------------------------------------------------- /composer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "lustre/php-dfa-sensitive", 3 | "description": "To achieve the filtering of sensitive words, based on the determination of finite automata (DFA) algorithm.", 4 | "type": "library", 5 | "license": "MIT", 6 | "authors": [ 7 | { 8 | "name": "Lustre", 9 | "email": "firelustre@163.com" 10 | } 11 | ], 12 | "minimum-stability": "stable", 13 | "require": { 14 | "php": ">=5.3", 15 | "ext-mbstring": "*" 16 | }, 17 | "autoload": { 18 | "psr-4": { 19 | "DfaFilter\\": "src/DfaFilter" 20 | } 21 | }, 22 | "require-dev": { 23 | "phpunit/phpunit": "^6.4" 24 | } 25 | } 
26 | -------------------------------------------------------------------------------- /phpunit.xml: -------------------------------------------------------------------------------- 1 | 5 | 6 | 7 | tests 8 | 9 | 10 | -------------------------------------------------------------------------------- /src/DfaFilter/Exceptions/PdsBusinessException.php: -------------------------------------------------------------------------------- 1 | hashTable)) { 32 | $this->hashTable[$key] = $value; 33 | return null; 34 | } 35 | $_temp = $this->hashTable[$key]; 36 | $this->hashTable[$key] = $value; 37 | return $_temp; 38 | } 39 | 40 | /** 41 | * 根据key获取对应的value 42 | * 43 | * @param $key 44 | * @return mixed|null 45 | */ 46 | public function get($key) 47 | { 48 | if (array_key_exists($key, $this->hashTable)) { 49 | return $this->hashTable[$key]; 50 | } 51 | return null; 52 | } 53 | 54 | /** 55 | * 删除指定key的键值对 56 | * 57 | * @param $key 58 | * @return mixed|null 59 | */ 60 | public function remove($key) 61 | { 62 | $temp_table = array(); 63 | if (array_key_exists($key, $this->hashTable)) { 64 | $tempValue = $this->hashTable[$key]; 65 | while ($curValue = current($this->hashTable)) { 66 | if (! (key($this->hashTable) == $key)) { 67 | $temp_table[key($this->hashTable)] = $curValue; 68 | } 69 | next($this->hashTable); 70 | } 71 | $this->hashTable = null; 72 | $this->hashTable = $temp_table; 73 | return $tempValue; 74 | } 75 | return null; 76 | } 77 | 78 | /** 79 | * 获取HashMap的所有键值 80 | * 81 | * @return array 82 | */ 83 | public function keys() 84 | { 85 | return array_keys($this->hashTable); 86 | } 87 | 88 | /** 89 | * 获取HashMap的所有value值 90 | * 91 | * @return array 92 | */ 93 | public function values() 94 | { 95 | return array_values($this->hashTable); 96 | } 97 | 98 | /** 99 | * 将一个HashMap的值全部put到当前HashMap中 100 | * 101 | * @param \DfaFilter\HashMap $map 102 | */ 103 | public function putAll($map) 104 | { 105 | if (! 
$map->isEmpty() && $map->size() > 0) { 106 | $keys = $map->keys(); 107 | foreach ($keys as $key) { 108 | $this->put($key, $map->get($key)); 109 | } 110 | } 111 | 112 | return ; 113 | } 114 | 115 | /** 116 | * 移除HashMap中所有元素 117 | * 118 | * @return bool 119 | */ 120 | public function removeAll() 121 | { 122 | $this->hashTable = null; 123 | return true; 124 | } 125 | 126 | /** 127 | * 判断HashMap中是否包含指定的值 128 | * 129 | * @param $value 130 | * @return bool 131 | */ 132 | public function containsValue($value) 133 | { 134 | while ($curValue = current($this->hashTable)) { 135 | if ($curValue == $value) { 136 | return true; 137 | } 138 | next($this->hashTable); 139 | } 140 | return false; 141 | } 142 | 143 | /** 144 | * 判断HashMap中是否包含指定的键key 145 | * 146 | * @param $key 147 | * @return bool 148 | */ 149 | public function containsKey($key) 150 | { 151 | if (array_key_exists($key, $this->hashTable)) { 152 | return true; 153 | } else { 154 | return false; 155 | } 156 | } 157 | 158 | /** 159 | * 获取HashMap中元素个数 160 | * 161 | * @return int 162 | */ 163 | public function size() 164 | { 165 | return count($this->hashTable); 166 | } 167 | 168 | /** 169 | * 判断HashMap是否为空 170 | * 171 | * @return bool 172 | */ 173 | public function isEmpty() 174 | { 175 | return (count($this->hashTable) == 0); 176 | } 177 | } 178 | -------------------------------------------------------------------------------- /src/DfaFilter/SensitiveHelper.php: -------------------------------------------------------------------------------- 1 | wordTree = $this->wordTree ?: new HashMap(); 72 | 73 | foreach ($this->yieldToReadFile($filepath) as $word) { 74 | $this->buildWordToTree(trim($word)); 75 | } 76 | 77 | return $this; 78 | } 79 | 80 | 81 | /** 82 | * 构建铭感词树【数组模式】 83 | * 84 | * @param null $sensitiveWords 85 | * 86 | * @return $this 87 | * @throws \DfaFilter\Exceptions\PdsBusinessException 88 | */ 89 | public function setTree($sensitiveWords = null) 90 | { 91 | if (empty($sensitiveWords)) { 92 | throw new 
PdsBusinessException('词库不能为空', PdsBusinessException::EMPTY_WORD_POOL); 93 | } 94 | 95 | $this->wordTree = new HashMap(); 96 | 97 | foreach ($sensitiveWords as $word) { 98 | $this->buildWordToTree($word); 99 | } 100 | return $this; 101 | } 102 | 103 | /** 104 | * 检测文字中的敏感词 105 | * 106 | * @param string $content 待检测内容 107 | * @param int $matchType 匹配类型 [默认为最小匹配规则] 108 | * @param int $wordNum 需要获取的敏感词数量 [默认获取全部] 109 | * @return array 110 | * @throws \DfaFilter\Exceptions\PdsSystemException 111 | */ 112 | public function getBadWord($content, $matchType = 1, $wordNum = 0) 113 | { 114 | $this->contentLength = mb_strlen($content, 'utf-8'); 115 | $badWordList = array(); 116 | for ($length = 0; $length < $this->contentLength; $length++) { 117 | $matchFlag = 0; 118 | $flag = false; 119 | $tempMap = $this->wordTree; 120 | for ($i = $length; $i < $this->contentLength; $i++) { 121 | $keyChar = mb_substr($content, $i, 1, 'utf-8'); 122 | 123 | // 获取指定节点树 124 | $nowMap = $tempMap->get($keyChar); 125 | 126 | // 不存在节点树,直接返回 127 | if (empty($nowMap)) { 128 | break; 129 | } 130 | 131 | // 存在,则判断是否为最后一个 132 | $tempMap = $nowMap; 133 | 134 | // 找到相应key,偏移量+1 135 | $matchFlag++; 136 | 137 | // 如果为最后一个匹配规则,结束循环,返回匹配标识数 138 | if (false === $nowMap->get('ending')) { 139 | continue; 140 | } 141 | 142 | $flag = true; 143 | 144 | // 最小规则,直接退出 145 | if (1 === $matchType) { 146 | break; 147 | } 148 | } 149 | 150 | if (! 
$flag) { 151 | $matchFlag = 0; 152 | } 153 | 154 | // 找到相应key 155 | if ($matchFlag <= 0) { 156 | continue; 157 | } 158 | 159 | $badWordList[] = mb_substr($content, $length, $matchFlag, 'utf-8'); 160 | 161 | // 有返回数量限制 162 | if ($wordNum > 0 && count($badWordList) == $wordNum) { 163 | return $badWordList; 164 | } 165 | 166 | // 需匹配内容标志位往后移 167 | $length = $length + $matchFlag - 1; 168 | } 169 | return $badWordList; 170 | } 171 | 172 | /** 173 | * 替换敏感字字符 174 | * 175 | * @param $content 文本内容 176 | * @param string $replaceChar 替换字符 177 | * @param bool $repeat true=>重复替换为敏感词相同长度的字符 178 | * @param int $matchType 179 | * 180 | * @return mixed 181 | * @throws \DfaFilter\Exceptions\PdsBusinessException 182 | * @throws \DfaFilter\Exceptions\PdsSystemException 183 | */ 184 | public function replace($content, $replaceChar = '', $repeat = false, $matchType = 1) 185 | { 186 | if (empty($content)) { 187 | throw new PdsBusinessException('请填写检测的内容', PdsBusinessException::EMPTY_CONTENT); 188 | } 189 | 190 | $badWordList = self::$badWordList ? 
self::$badWordList : $this->getBadWord($content, $matchType); 191 | 192 | // 未检测到敏感词,直接返回 193 | if (empty($badWordList)) { 194 | return $content; 195 | } 196 | 197 | foreach ($badWordList as $badWord) { 198 | $hasReplacedChar = $replaceChar; 199 | if ($repeat) { 200 | $hasReplacedChar = $this->dfaBadWordConversChars($badWord, $replaceChar); 201 | } 202 | $content = str_replace($badWord, $hasReplacedChar, $content); 203 | } 204 | return $content; 205 | } 206 | 207 | /** 208 | * 标记敏感词 209 | * 210 | * @param $content 文本内容 211 | * @param string $sTag 标签开头,如 212 | * @param string $eTag 标签结束,如 213 | * @param int $matchType 214 | * 215 | * @return mixed 216 | * @throws \DfaFilter\Exceptions\PdsBusinessException 217 | * @throws \DfaFilter\Exceptions\PdsSystemException 218 | */ 219 | public function mark($content, $sTag, $eTag, $matchType = 1) 220 | { 221 | if (empty($content)) { 222 | throw new PdsBusinessException('请填写检测的内容', PdsBusinessException::EMPTY_CONTENT); 223 | } 224 | 225 | $badWordList = self::$badWordList ? self::$badWordList : $this->getBadWord($content, $matchType); 226 | 227 | // 未检测到敏感词,直接返回 228 | if (empty($badWordList)) { 229 | return $content; 230 | } 231 | $badWordList = array_unique($badWordList); 232 | 233 | foreach ($badWordList as $badWord) { 234 | $replaceChar = $sTag . $badWord . 
$eTag; 235 | $content = str_replace($badWord, $replaceChar, $content); 236 | } 237 | return $content; 238 | } 239 | 240 | /** 241 | * 被检测内容是否合法 242 | * 243 | * @param $content 244 | * 245 | * @return bool 246 | * @throws \DfaFilter\Exceptions\PdsSystemException 247 | */ 248 | public function islegal($content) 249 | { 250 | $this->contentLength = mb_strlen($content, 'utf-8'); 251 | 252 | for ($length = 0; $length < $this->contentLength; $length++) { 253 | $matchFlag = 0; 254 | 255 | $tempMap = $this->wordTree; 256 | for ($i = $length; $i < $this->contentLength; $i++) { 257 | $keyChar = mb_substr($content, $i, 1, 'utf-8'); 258 | 259 | // 获取指定节点树 260 | $nowMap = $tempMap->get($keyChar); 261 | 262 | // 不存在节点树,直接返回 263 | if (empty($nowMap)) { 264 | break; 265 | } 266 | 267 | // 找到相应key,偏移量+1 268 | $tempMap = $nowMap; 269 | $matchFlag++; 270 | 271 | // 如果为最后一个匹配规则,结束循环,返回匹配标识数 272 | if (false === $nowMap->get('ending')) { 273 | continue; 274 | } 275 | 276 | return true; 277 | } 278 | 279 | // 找到相应key 280 | if ($matchFlag <= 0) { 281 | continue; 282 | } 283 | 284 | // 需匹配内容标志位往后移 285 | $length = $length + $matchFlag - 1; 286 | } 287 | return false; 288 | } 289 | 290 | protected function yieldToReadFile($filepath) 291 | { 292 | $fp = fopen($filepath, 'r'); 293 | while (! 
feof($fp)) { 294 | yield fgets($fp); 295 | } 296 | fclose($fp); 297 | } 298 | 299 | // 将单个敏感词构建成树结构 300 | protected function buildWordToTree($word = '') 301 | { 302 | if ('' === $word) { 303 | return; 304 | } 305 | $tree = $this->wordTree; 306 | 307 | $wordLength = mb_strlen($word, 'utf-8'); 308 | for ($i = 0; $i < $wordLength; $i++) { 309 | $keyChar = mb_substr($word, $i, 1, 'utf-8'); 310 | 311 | // 获取子节点树结构 312 | $tempTree = $tree->get($keyChar); 313 | 314 | if ($tempTree) { 315 | $tree = $tempTree; 316 | } else { 317 | // 设置标志位 318 | $newTree = new HashMap(); 319 | $newTree->put('ending', false); 320 | 321 | // 添加到集合 322 | $tree->put($keyChar, $newTree); 323 | $tree = $newTree; 324 | } 325 | 326 | // 到达最后一个节点 327 | if ($i == $wordLength - 1) { 328 | $tree->put('ending', true); 329 | } 330 | } 331 | 332 | return; 333 | } 334 | 335 | /** 336 | * 敏感词替换为对应长度的字符 337 | * @param $word 338 | * @param $char 339 | * 340 | * @return string 341 | * @throws \DfaFilter\Exceptions\PdsSystemException 342 | */ 343 | protected function dfaBadWordConversChars($word, $char) 344 | { 345 | $str = ''; 346 | $length = mb_strlen($word, 'utf-8'); 347 | for ($counter = 0; $counter < $length; ++$counter) { 348 | $str .= $char; 349 | } 350 | 351 | return $str; 352 | } 353 | } 354 | -------------------------------------------------------------------------------- /src/DfaFilter/functions.php: -------------------------------------------------------------------------------- 1 | 要检查 Composer 是否正常工作,只需要通过 php 来执行 PHAR 8 | 9 | php composer.phar 10 | 11 | ### 安装扩展 12 | 13 | composer require lustre/php-dfa-sensitive 14 | 15 | * 注意:如果你在使用composer安装时,出现 16 | Could not find package lustre/php-dfa-sensitive at any version for your minimum-stability (stable). 
Check the package spelling or your minimum-stability 请在你的composer.json中加入"minimum-stability": "dev" 17 | 18 | 19 | 20 | 21 | #### 如果你需要手动引入 22 | 23 | require './vendor/autoload.php'; 24 | 25 | use DfaFilter\SensitiveHelper; 26 | 27 | ### 获取敏感词库 28 | 29 | // 获取敏感词库索引数组 30 | $wordData = array( 31 | '察象蚂', 32 | '拆迁灭', 33 | '车牌隐', 34 | '成人电', 35 | '成人卡通', 36 | ...... 37 | ); 38 | 39 | ### 检测是否含有敏感词 40 | 41 | $islegal = SensitiveHelper::init()->setTree($wordData)->islegal($content); 42 | ### 敏感词过滤 43 | 44 | // 敏感词替换为***为例 45 | $filterContent = SensitiveHelper::init()->setTree($wordData)->replace($content, '***'); 46 | 47 | ### 获取文字中的敏感词 48 | 49 | // 获取内容中所有的敏感词 50 | $sensitiveWordGroup = SensitiveHelper::init()->setTree($wordData)->getBadWord($content); 51 | // 仅获取一个敏感词(第二个参数为匹配类型,第三个参数为返回数量上限) 52 | $sensitiveWordGroup = SensitiveHelper::init()->setTree($wordData)->getBadWord($content, 1, 1); 53 | 54 | 目前已知使用平台:[广电云](http://www.guangdianyun.tv/) 节目直播页面在线聊天功能支持 55 | 56 | 57 | *如果大家有更好的建议,请大家多多指正,O(∩_∩)O谢谢* 58 | -------------------------------------------------------------------------------- /tests/BaseTest.php: -------------------------------------------------------------------------------- 1 | wordData = explode(',', $wordPool); 22 | } 23 | 24 | public function testGetBadWord() 25 | { 26 | $content = '这是一段测试语句,请忽略赌球网, 第二个敏感词是三级片'; 27 | 28 | // 过滤,其中【赌球网】在词库中 29 | $filterContent = SensitiveHelper::init() 30 | ->setTree($this->wordData) 31 | ->getBadWord($content); 32 | 33 | // 返回规定数量的敏感词,其中【赌球网,三级片】在词库中 34 | $badWords = SensitiveHelper::init() 35 | ->setTree($this->wordData) 36 | ->getBadWord($content, 1, 2); 37 | 38 | $this->assertEquals('赌球网', $filterContent[0]); 39 | $this->assertEquals('三级片', $badWords[1]); 40 | } 41 | 42 | public function testFilterWord() 43 | { 44 | $content = '这是一段测试语句,请忽略赌球网'; 45 | 46 | // 过滤,其中【赌球网】在词库中 47 | $filterContent = SensitiveHelper::init() 48 | ->setTree($this->wordData) 49 | ->replace($content,'*'); 50 | 51 | $this->assertEquals('这是一段测试语句,请忽略*',$filterContent); 
52 | } 53 | } -------------------------------------------------------------------------------- /tests/ProTest.php: -------------------------------------------------------------------------------- 1 | wordPoolPath = 'tests/data/words.txt'; 22 | } 23 | 24 | public function testGetBadWord() 25 | { 26 | $sTime = microtime(true); 27 | $content = '这是一段测试语句,请忽略赌球网, 第二个敏感词是三级片'; 28 | 29 | // 过滤,其中【赌球网】在词库中 30 | $filterContent = SensitiveHelper::init() 31 | ->setTreeByFile($this->wordPoolPath) 32 | ->getBadWord($content); 33 | 34 | // 返回规定数量的敏感词,其中【赌球网,三级片】在词库中 35 | $badWords = SensitiveHelper::init() 36 | ->setTreeByFile($this->wordPoolPath) 37 | ->getBadWord($content, 1, 2); 38 | 39 | $eTime = microtime(true); 40 | 41 | echo ($eTime - $sTime) * 1000 . 'ms' . PHP_EOL; 42 | 43 | $this->assertEquals('赌球网', $filterContent[0]); 44 | $this->assertEquals('三级片', $badWords[1]); 45 | } 46 | 47 | public function testFilterWord() 48 | { 49 | $content = '这是一段测试语句,请忽略赌球网'; 50 | 51 | // 过滤,其中【赌球网】在词库中 52 | $filterContent = SensitiveHelper::init() 53 | ->setTreeByFile($this->wordPoolPath) 54 | ->replace($content,'*'); 55 | 56 | $this->assertEquals('这是一段测试语句,请忽略*', $filterContent); 57 | 58 | 59 | // 过滤,其中【赌球网】在词库中 60 | $filterContent = SensitiveHelper::init() 61 | ->setTreeByFile($this->wordPoolPath) 62 | ->replace($content,'*', true); 63 | 64 | $this->assertEquals('这是一段测试语句,请忽略***', $filterContent); 65 | } 66 | 67 | public function testMarkWord() 68 | { 69 | $content = '这是一段测试语句,请忽略赌球网'; 70 | 71 | // 过滤,其中【赌球网】在词库中 72 | $markedContent = SensitiveHelper::init() 73 | ->setTreeByFile($this->wordPoolPath) 74 | ->mark($content,'', ''); 75 | 76 | $this->assertEquals('这是一段测试语句,请忽略赌球网', $markedContent); 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /tests/bootstrap.php: -------------------------------------------------------------------------------- 1 |