├── .gitignore
├── README.md
├── composer.json
├── phpunit.xml
├── src
├── DfaFilter
│ ├── Exceptions
│ │ ├── PdsBusinessException.php
│ │ └── PdsSystemException.php
│ ├── HashMap.php
│ ├── SensitiveHelper.php
│ └── functions.php
└── README.md
└── tests
├── BaseTest.php
├── ProTest.php
├── bootstrap.php
└── data
├── keyWord.txt
└── words.txt
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | .idea/
3 |
4 | /vendor/
5 |
6 | /composer.lock
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # php-DFA-filterWord
2 | php实现基于确定有穷自动机算法的敏感词过滤 https://packagist.org/packages/lustre/php-dfa-sensitive
3 |
4 | ## 安装&使用流程
5 | ### Download and install Composer:
6 | curl -sS https://getcomposer.org/installer | php
7 | > 要检查 Composer 是否正常工作,只需要通过 php 来执行 PHAR
8 |
9 | php composer.phar
10 |
11 | ### 安装扩展
12 |
13 | composer require lustre/php-dfa-sensitive
14 |
15 | * 注意:如果你在使用composer安装时,出现
16 | Could not find package lustre/php-dfa-sensitive at any version for your minimum-stability (stable). Check the package spelling or your minimum-stability 请在你的composer.json中加入"minimum-stability": "dev"
17 |
18 |
19 |
20 |
21 | #### 如果你需要手动引入
22 |
23 | require './vendor/autoload.php';
24 |
25 | use DfaFilter\SensitiveHelper;
26 |
27 | ### 构建敏感词库树
28 | 场景一: 可以拿到不同(用户)词库数组
29 |
30 | // 获取敏感词库索引数组
31 | $wordData = array(
32 | '察象蚂',
33 | '拆迁灭',
34 | '车牌隐',
35 | '成人电',
36 | '成人卡通',
37 | ......
38 | );
39 |
40 | // get one helper
41 | $handle = SensitiveHelper::init()->setTree($wordData);
42 |
43 | 场景二: 全站使用一套敏感词库
44 |
45 | // 获取敏感词库文件路径
46 | $wordFilePath = 'tests/data/words.txt';
47 |
48 | // get one helper
49 | $handle = SensitiveHelper::init()->setTreeByFile($wordFilePath);
50 |
51 | ### 检测是否含有敏感词
52 |
53 | $islegal = $handle->islegal($content);
54 | ### 敏感词过滤
55 |
56 | // 敏感词替换为*为例(会替换为相同字符长度的*)
57 | $filterContent = $handle->replace($content, '*', true);
58 |
59 | // 或敏感词替换为***为例
60 | $filterContent = $handle->replace($content, '***');
61 |
62 | ### 标记敏感词
63 | $markedContent = $handle->mark($content, '<mark>', '</mark>');
64 |
65 | ### 获取文字中的敏感词
66 |
67 | // 获取内容中所有的敏感词
68 | $sensitiveWordGroup = $handle->getBadWord($content);
69 | // 仅获取一个敏感词(第二个参数为匹配类型,第三个参数为需要获取的敏感词数量)
70 | $sensitiveWordGroup = $handle->getBadWord($content, 1, 1);
71 |
72 | ### 如何使用单元测试进行测试
73 | #### 安装PHPUnit
74 | ```bash
75 | $ wget https://phar.phpunit.de/phpunit.phar
76 |
77 | $ chmod +x phpunit.phar
78 |
79 | $ mv phpunit.phar /usr/local/bin/phpunit
80 | ```
81 | #### 使用composer自动加载php命名空间
82 |
83 | ```bash
84 | $ composer update
85 | ```
86 | ### 运行单元测试
87 | ```bash
88 | $ phpunit tests/BaseTest.php
89 | ```
90 |
91 | *如果大家有更好的建议,请大家多多指正,O(∩_∩)O谢谢*
92 |
93 | *你们的star是我的动力*
94 |
--------------------------------------------------------------------------------
/composer.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "lustre/php-dfa-sensitive",
3 | "description": "To achieve the filtering of sensitive words, based on the determination of finite automata (DFA) algorithm.",
4 | "type": "library",
5 | "license": "MIT",
6 | "authors": [
7 | {
8 | "name": "Lustre",
9 | "email": "firelustre@163.com"
10 | }
11 | ],
12 | "minimum-stability": "stable",
13 | "require": {
14 | "php": ">=5.3",
15 | "ext-mbstring": "*"
16 | },
17 | "autoload": {
18 | "psr-4": {
19 | "DfaFilter\\": "src/DfaFilter"
20 | }
21 | },
22 | "require-dev": {
23 | "phpunit/phpunit": "^6.4"
24 | }
25 | }
26 |
--------------------------------------------------------------------------------
/phpunit.xml:
--------------------------------------------------------------------------------
1 |
5 |
6 |
7 | tests
8 |
9 |
10 |
--------------------------------------------------------------------------------
/src/DfaFilter/Exceptions/PdsBusinessException.php:
--------------------------------------------------------------------------------
1 | hashTable)) {
32 | $this->hashTable[$key] = $value;
33 | return null;
34 | }
35 | $_temp = $this->hashTable[$key];
36 | $this->hashTable[$key] = $value;
37 | return $_temp;
38 | }
39 |
/**
 * Look up the value stored under the given key.
 *
 * @param mixed $key Key to look up.
 * @return mixed|null The stored value, or null when the key is not present.
 */
public function get($key)
{
    return array_key_exists($key, $this->hashTable)
        ? $this->hashTable[$key]
        : null;
}
53 |
/**
 * Remove the entry stored under the given key.
 *
 * The previous implementation rebuilt the table with a current()/next()
 * loop, which stops at the first falsy value (for example false, 0 or '')
 * and therefore silently discarded every entry after it. unset() removes
 * exactly one entry and leaves the rest untouched.
 *
 * @param mixed $key Key of the entry to remove.
 * @return mixed|null The value that was removed, or null when the key was not present.
 */
public function remove($key)
{
    if (! array_key_exists($key, $this->hashTable)) {
        return null;
    }

    $removedValue = $this->hashTable[$key];
    unset($this->hashTable[$key]);

    return $removedValue;
}
77 |
/**
 * List every key currently held by the map.
 *
 * @return array The keys, in the order array_keys() reports them.
 */
public function keys()
{
    $allKeys = array_keys($this->hashTable);

    return $allKeys;
}
87 |
/**
 * List every value currently held by the map.
 *
 * @return array The values, re-indexed from zero by array_values().
 */
public function values()
{
    $allValues = array_values($this->hashTable);

    return $allValues;
}
97 |
/**
 * Copy every entry of another map into this one.
 *
 * Entries are written through put(), so keys that already exist here are
 * overwritten with the value from $map.
 *
 * @param \DfaFilter\HashMap $map Source map; nothing happens when it is empty.
 */
public function putAll($map)
{
    if ($map->isEmpty()) {
        return;
    }

    foreach ($map->keys() as $sourceKey) {
        $this->put($sourceKey, $map->get($sourceKey));
    }
}
114 |
/**
 * Remove every entry from the map.
 *
 * The table is reset to an empty array rather than null: the other methods
 * of this class pass it straight to count(), array_keys() and
 * array_key_exists(), all of which warn or throw when handed null.
 *
 * @return bool Always true.
 */
public function removeAll()
{
    $this->hashTable = array();

    return true;
}
125 |
/**
 * Tell whether the map holds the given value.
 *
 * The previous current()/next() loop ended at the first falsy value, could
 * never find a falsy value, and left the internal array pointer advanced so
 * that a second call started from the wrong position. in_array() scans the
 * whole table and has none of these problems.
 *
 * The comparison is deliberately loose (==), matching the original behaviour.
 *
 * @param mixed $value Value to search for.
 * @return bool True when at least one entry compares equal to $value.
 */
public function containsValue($value)
{
    return in_array($value, $this->hashTable);
}
142 |
/**
 * Tell whether the map has an entry for the given key.
 *
 * @param mixed $key Key to test.
 * @return bool True when the key exists, even if its value is null.
 */
public function containsKey($key)
{
    return array_key_exists($key, $this->hashTable);
}
157 |
/**
 * Count the entries held by the map.
 *
 * @return int Number of key/value pairs.
 */
public function size()
{
    $entryCount = count($this->hashTable);

    return $entryCount;
}
167 |
/**
 * Tell whether the map holds no entries.
 *
 * @return bool True when the entry count is zero.
 */
public function isEmpty()
{
    return 0 === count($this->hashTable);
}
177 | }
178 |
--------------------------------------------------------------------------------
/src/DfaFilter/SensitiveHelper.php:
--------------------------------------------------------------------------------
1 | wordTree = $this->wordTree ?: new HashMap();
72 |
73 | foreach ($this->yieldToReadFile($filepath) as $word) {
74 | $this->buildWordToTree(trim($word));
75 | }
76 |
77 | return $this;
78 | }
79 |
80 |
/**
 * Build the sensitive-word tree from an in-memory list of words.
 *
 * Any tree built earlier is discarded and replaced by a new one.
 *
 * @param array|null $sensitiveWords Words to index.
 *
 * @return $this
 * @throws \DfaFilter\Exceptions\PdsBusinessException When the word list is empty.
 */
public function setTree($sensitiveWords = null)
{
    if (! empty($sensitiveWords)) {
        $this->wordTree = new HashMap();

        foreach ($sensitiveWords as $entry) {
            $this->buildWordToTree($entry);
        }

        return $this;
    }

    throw new PdsBusinessException('词库不能为空', PdsBusinessException::EMPTY_WORD_POOL);
}
102 |
/**
 * Collect the sensitive words that occur in a piece of text.
 *
 * The text is scanned left to right. From each start position the word tree
 * is walked one character at a time; when a node flagged as 'ending' is
 * reached a complete word has been found. After a hit, scanning resumes
 * right after the matched word.
 *
 * Fix over the previous version: in non-minimal mode the reported length
 * used to count every character walked, including characters consumed after
 * the last complete word, so the returned string could carry trailing
 * characters that belong to no word. The length is now taken at the last
 * 'ending' node only.
 *
 * @param string $content   Text to inspect.
 * @param int    $matchType 1 stops at the first (shortest) complete word from a
 *                          start position; any other value keeps walking and
 *                          keeps the longest complete word.
 * @param int    $wordNum   Maximum number of words to return; 0 means no limit.
 * @return array Matched words in order of appearance (duplicates are kept).
 * @throws \DfaFilter\Exceptions\PdsSystemException
 */
public function getBadWord($content, $matchType = 1, $wordNum = 0)
{
    $this->contentLength = mb_strlen($content, 'utf-8');
    $badWordList = array();

    for ($start = 0; $start < $this->contentLength; $start++) {
        // Length of the longest complete word found from $start (0 = none).
        $matchedLength = 0;
        $node = $this->wordTree;

        for ($i = $start; $i < $this->contentLength; $i++) {
            $node = $node->get(mb_substr($content, $i, 1, 'utf-8'));

            // No branch for this character: nothing longer can match.
            if (empty($node)) {
                break;
            }

            // Still inside a word, keep walking.
            if (false === $node->get('ending')) {
                continue;
            }

            // A complete word ends here; remember how long it is.
            $matchedLength = $i - $start + 1;

            // Minimal matching is satisfied by the first complete word.
            if (1 === $matchType) {
                break;
            }
        }

        if ($matchedLength <= 0) {
            continue;
        }

        $badWordList[] = mb_substr($content, $start, $matchedLength, 'utf-8');

        // Honour the optional limit on the number of results.
        if ($wordNum > 0 && count($badWordList) == $wordNum) {
            return $badWordList;
        }

        // Resume right after the matched word (the loop adds the final +1).
        $start += $matchedLength - 1;
    }

    return $badWordList;
}
171 |
/**
 * Replace every sensitive word in the text.
 *
 * The word list comes from the static $badWordList when that is truthy,
 * otherwise from getBadWord(). NOTE(review): where the static list gets
 * populated is not visible in this part of the file.
 *
 * @param string $content     Text to filter.
 * @param string $replaceChar Replacement string.
 * @param bool   $repeat      When true, $replaceChar is repeated once per
 *                            character of the word; otherwise the whole word
 *                            is replaced by a single $replaceChar.
 * @param int    $matchType   Passed through to getBadWord().
 *
 * @return mixed The filtered text, or the untouched text when nothing matched.
 * @throws \DfaFilter\Exceptions\PdsBusinessException When $content is empty.
 * @throws \DfaFilter\Exceptions\PdsSystemException
 */
public function replace($content, $replaceChar = '', $repeat = false, $matchType = 1)
{
    if (empty($content)) {
        throw new PdsBusinessException('请填写检测的内容', PdsBusinessException::EMPTY_CONTENT);
    }

    $hits = self::$badWordList ?: $this->getBadWord($content, $matchType);

    // Nothing matched: hand the text back unchanged.
    if (empty($hits)) {
        return $content;
    }

    foreach ($hits as $hit) {
        $substitute = $repeat
            ? $this->dfaBadWordConversChars($hit, $replaceChar)
            : $replaceChar;
        $content = str_replace($hit, $substitute, $content);
    }

    return $content;
}
206 |
/**
 * Wrap every sensitive word in the text with an opening and closing tag.
 *
 * The word list comes from the static $badWordList when that is truthy,
 * otherwise from getBadWord(). Duplicates are dropped first so that a word
 * occurring several times is not wrapped more than once.
 *
 * @param string $content   Text to annotate.
 * @param string $sTag      String placed in front of each sensitive word.
 * @param string $eTag      String placed after each sensitive word.
 * @param int    $matchType Passed through to getBadWord().
 *
 * @return mixed The annotated text, or the untouched text when nothing matched.
 * @throws \DfaFilter\Exceptions\PdsBusinessException When $content is empty.
 * @throws \DfaFilter\Exceptions\PdsSystemException
 */
public function mark($content, $sTag, $eTag, $matchType = 1)
{
    if (empty($content)) {
        throw new PdsBusinessException('请填写检测的内容', PdsBusinessException::EMPTY_CONTENT);
    }

    $hits = self::$badWordList ?: $this->getBadWord($content, $matchType);

    // Nothing matched: hand the text back unchanged.
    if (empty($hits)) {
        return $content;
    }

    foreach (array_unique($hits) as $hit) {
        $content = str_replace($hit, $sTag . $hit . $eTag, $content);
    }

    return $content;
}
239 |
/**
 * Tell whether the text contains at least one sensitive word.
 *
 * NOTE: despite the method name, the return value is true when a sensitive
 * word IS found and false when the text is clean.
 *
 * Fix over the previous version: after walking a prefix that never reached
 * a complete word, the scan used to jump past the walked characters, so a
 * word starting inside that prefix was missed (with the words "abc" and
 * "bd", the text "abd" was reported clean). Every start position is now
 * examined.
 *
 * @param string $content Text to inspect.
 *
 * @return bool True when a sensitive word is present, false otherwise.
 * @throws \DfaFilter\Exceptions\PdsSystemException
 */
public function islegal($content)
{
    $this->contentLength = mb_strlen($content, 'utf-8');

    for ($start = 0; $start < $this->contentLength; $start++) {
        $node = $this->wordTree;

        for ($i = $start; $i < $this->contentLength; $i++) {
            $node = $node->get(mb_substr($content, $i, 1, 'utf-8'));

            // No branch for this character: no word starts at $start.
            if (empty($node)) {
                break;
            }

            // Reaching a node flagged as a word end means a word was found.
            if (false !== $node->get('ending')) {
                return true;
            }
        }
    }

    return false;
}
289 |
/**
 * Lazily read a file, yielding one line at a time.
 *
 * Lines are yielded exactly as fgets() returns them, i.e. including the
 * trailing line break when there is one. The handle is closed in a finally
 * block so that it is released even when the consumer stops iterating early
 * or an exception is thrown while iterating.
 *
 * @param string $filepath Path of the file to read.
 * @return \Generator Yields each line as a string.
 * @throws \RuntimeException When the file cannot be opened.
 */
protected function yieldToReadFile($filepath)
{
    $fp = fopen($filepath, 'r');
    if (false === $fp) {
        // Without this guard feof(false) would never report end-of-file.
        throw new \RuntimeException(sprintf('Unable to open word file: %s', $filepath));
    }

    try {
        // fgets() returns false at end-of-file, which ends the loop without
        // yielding a bogus false value to the caller.
        while (false !== ($line = fgets($fp))) {
            yield $line;
        }
    } finally {
        fclose($fp);
    }
}
298 |
/**
 * Insert a single sensitive word into the word tree.
 *
 * Each character becomes one level of nested HashMap nodes. Every node
 * carries an 'ending' flag, which is set to true on the node of the word's
 * last character.
 *
 * @param string $word Word to insert; an empty string is ignored.
 */
protected function buildWordToTree($word = '')
{
    if ('' === $word) {
        return;
    }

    $node = $this->wordTree;
    $lastIndex = mb_strlen($word, 'utf-8') - 1;

    for ($pos = 0; $pos <= $lastIndex; $pos++) {
        $char = mb_substr($word, $pos, 1, 'utf-8');
        $child = $node->get($char);

        if (! $child) {
            // First time this character appears at this depth: create its node.
            $child = new HashMap();
            $child->put('ending', false);
            $node->put($char, $child);
        }

        $node = $child;

        // The node of the final character marks the end of a complete word.
        if ($pos == $lastIndex) {
            $node->put('ending', true);
        }
    }
}
334 |
/**
 * Build a mask made of $char repeated once per character of $word.
 *
 * The length is counted in UTF-8 characters, not bytes.
 *
 * @param string $word Word whose character count sets the mask length.
 * @param string $char String to repeat.
 *
 * @return string The mask; an empty string when $word has no characters.
 * @throws \DfaFilter\Exceptions\PdsSystemException
 */
protected function dfaBadWordConversChars($word, $char)
{
    return str_repeat((string) $char, mb_strlen($word, 'utf-8'));
}
353 | }
354 |
--------------------------------------------------------------------------------
/src/DfaFilter/functions.php:
--------------------------------------------------------------------------------
1 | 要检查 Composer 是否正常工作,只需要通过 php 来执行 PHAR
8 |
9 | php composer.phar
10 |
11 | ### 安装扩展
12 |
13 | composer require lustre/php-dfa-sensitive
14 |
15 | * 注意:如果你在使用composer安装时,出现
16 | Could not find package lustre/php-dfa-sensitive at any version for your minimum-stability (stable). Check the package spelling or your minimum-stability 请在你的composer.json中加入"minimum-stability": "dev"
17 |
18 |
19 |
20 |
21 | #### 如果你需要手动引入
22 |
23 | require './vendor/autoload.php';
24 |
25 | use DfaFilter\SensitiveHelper;
26 |
27 | ### 获取敏感词库
28 |
29 | // 获取敏感词库索引数组
30 | $wordData = array(
31 | '察象蚂',
32 | '拆迁灭',
33 | '车牌隐',
34 | '成人电',
35 | '成人卡通',
36 | ......
37 | );
38 |
39 | ### 检测是否含有敏感词
40 |
41 | $islegal = SensitiveHelper::init()->setTree($wordData)->islegal($content);
42 | ### 敏感词过滤
43 |
44 | // 敏感词替换为***为例
45 | $filterContent = SensitiveHelper::init()->setTree($wordData)->replace($content, '***');
46 |
47 | ### 获取文字中的敏感词
48 |
49 | // 获取内容中所有的敏感词
50 | $sensitiveWordGroup = SensitiveHelper::init()->setTree($wordData)->getBadWord($content);
51 | // 仅获取一个敏感词(第二个参数为匹配类型,第三个参数为需要获取的敏感词数量)
52 | $sensitiveWordGroup = SensitiveHelper::init()->setTree($wordData)->getBadWord($content, 1, 1);
53 |
54 | 目前已知使用平台:[广电云](http://www.guangdianyun.tv/) 节目直播页面在线聊天功能支持
55 |
56 |
57 | *如果大家有更好的建议,请大家多多指正,O(∩_∩)O谢谢*
58 |
--------------------------------------------------------------------------------
/tests/BaseTest.php:
--------------------------------------------------------------------------------
1 | wordData = explode(',', $wordPool);
22 | }
23 |
/**
 * getBadWord() on an array-built tree: checks the first word of an
 * unlimited search and the second word of a search limited to two results.
 */
public function testGetBadWord()
{
    $text = '这是一段测试语句,请忽略赌球网, 第二个敏感词是三级片';

    // Unlimited search.
    $helper = SensitiveHelper::init()->setTree($this->wordData);
    $allWords = $helper->getBadWord($text);

    // Minimal matching, at most two results.
    $limitedHelper = SensitiveHelper::init()->setTree($this->wordData);
    $firstTwoWords = $limitedHelper->getBadWord($text, 1, 2);

    $this->assertEquals('赌球网', $allWords[0]);
    $this->assertEquals('三级片', $firstTwoWords[1]);
}
41 |
/**
 * replace() on an array-built tree: the sensitive word is swapped for a
 * single '*'.
 */
public function testFilterWord()
{
    $text = '这是一段测试语句,请忽略赌球网';

    $helper = SensitiveHelper::init()->setTree($this->wordData);
    $filtered = $helper->replace($text, '*');

    $this->assertEquals('这是一段测试语句,请忽略*', $filtered);
}
53 | }
--------------------------------------------------------------------------------
/tests/ProTest.php:
--------------------------------------------------------------------------------
1 | wordPoolPath = 'tests/data/words.txt';
22 | }
23 |
/**
 * getBadWord() on a file-built tree: checks the first word of an unlimited
 * search and the second word of a search limited to two results, and prints
 * the elapsed time of both searches in milliseconds.
 */
public function testGetBadWord()
{
    $startedAt = microtime(true);
    $text = '这是一段测试语句,请忽略赌球网, 第二个敏感词是三级片';

    // Unlimited search.
    $helper = SensitiveHelper::init()->setTreeByFile($this->wordPoolPath);
    $allWords = $helper->getBadWord($text);

    // Minimal matching, at most two results.
    $limitedHelper = SensitiveHelper::init()->setTreeByFile($this->wordPoolPath);
    $firstTwoWords = $limitedHelper->getBadWord($text, 1, 2);

    $finishedAt = microtime(true);
    $elapsedMs = ($finishedAt - $startedAt) * 1000;

    echo $elapsedMs . 'ms' . PHP_EOL;

    $this->assertEquals('赌球网', $allWords[0]);
    $this->assertEquals('三级片', $firstTwoWords[1]);
}
46 |
/**
 * replace() on a file-built tree, once with a single replacement string and
 * once with the replacement repeated per character of the word.
 */
public function testFilterWord()
{
    $text = '这是一段测试语句,请忽略赌球网';

    // The whole word becomes one '*'.
    $singleHelper = SensitiveHelper::init()->setTreeByFile($this->wordPoolPath);
    $filtered = $singleHelper->replace($text, '*');

    $this->assertEquals('这是一段测试语句,请忽略*', $filtered);

    // One '*' per character of the word.
    $repeatHelper = SensitiveHelper::init()->setTreeByFile($this->wordPoolPath);
    $filtered = $repeatHelper->replace($text, '*', true);

    $this->assertEquals('这是一段测试语句,请忽略***', $filtered);
}
66 |
/**
 * mark() on a file-built tree: the sensitive word is wrapped with the given
 * opening and closing strings.
 */
public function testMarkWord()
{
    $text = '这是一段测试语句,请忽略赌球网';

    $helper = SensitiveHelper::init()->setTreeByFile($this->wordPoolPath);
    $marked = $helper->mark($text, '', '');

    $this->assertEquals('这是一段测试语句,请忽略赌球网', $marked);
}
78 | }
79 |
--------------------------------------------------------------------------------
/tests/bootstrap.php:
--------------------------------------------------------------------------------
1 |