├── LICENSE.md
├── README.md
├── composer.json
└── src
├── Bootstrap.php
├── HashMap.php
├── Install.php
├── Sensitive.php
├── SensitiveException.php
├── config
├── SensitiveWord.txt
└── plugin
│ └── isszz
│ └── webman-sensitive
│ ├── app.php
│ └── bootstrap.php
├── facade
└── Sensitive.php
├── helpers.php
└── word
└── SensitiveWord.txt
/LICENSE.md:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) isszz
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in
13 | all copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 | THE SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # webman-sensitive
2 | Webman 敏感词检测,过滤,标记
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 | ## 安装
13 |
14 | ```shell
15 | composer require isszz/webman-sensitive
16 | ```
17 |
18 | ## 配置
19 | ```php
20 | return [
21 | 'enable' => true,
22 |
23 | // 支持file,array,也可以指向自己敏感词库文件路径
24 | // file模式时,敏感词库位于webman根目录的config/plugin/isszz/webman-sensitive/SensitiveWord.txt,也可以指向自定义的词库文件路径
25 | 'mode' => 'file',
26 |
27 | 'config' => [
28 | 'repeat' => true, // 重复替换为敏感词相同长度的字符
29 | 'replace_char' => '*', // 替换字符
30 | // 标记敏感词时使用的标签名,例如 mark 会生成 <mark>敏感词</mark>
31 | 'mark' => 'mark',
32 | ],
33 |
34 | // 干扰因子
35 | 'interference_factors' => [
36 | ' ', '&', '*', '/', '|', '@', '.', '^', '~', '$',
37 | ],
38 |
39 | // 数组模式敏感词
40 | 'sensitive_words' => [
41 | '工口',
42 | '里番',
43 | '性感美女',
44 | ]
45 | ];
46 |
47 | ```
48 |
49 | ## 使用
50 |
51 | facade方式
52 | ```php
53 | use isszz\sensitive\facade\Sensitive;
54 |
55 | class Index
56 | {
57 | public function add()
58 | {
59 | // 设置干扰因子
60 | Sensitive::interferenceFactor(['(', ')', ',', ',', ';', ';', '。']);
61 |
62 | // 添加一个额外的敏感词,words参数支持单敏感词,多词也可以用|分割,或者直接传入多个敏感词数组
63 | // words = 性感美女|分隔符
64 | // words = ['性感美女', '数组']
65 | Sensitive::add(words: '性感美女');
66 |
67 | // 删除的敏感词,words参数同添加的格式一样
68 | // 第二个参数once为true时,只针对当次: is,replace,mark,操作生效
69 | Sensitive::remove(words: '性感美女', once: true);
70 |
71 | // 检测
72 | if (Sensitive::is(content: '检测语句')) {
73 | return json(['code' => 1, 'msg' => '输入内容包含敏感词,请注意用词。']);
74 | }
75 |
76 | // 替换
77 | $replaced = Sensitive::add(words: '垃圾')->replace(content: '替换语句垃圾要被替换', replaceChar: '*', repeat: false);
78 |
79 | // 标记敏感词
80 | $marked = Sensitive::add(words: '尼玛')->mark(content: '标记的内容,这里尼玛要被标记', tag: 'bad');
81 |
82 | // 提取内容中的所有敏感词
83 | $badWords = Sensitive::add('狗逼')->get('提取内容中的所有敏感词,狗逼,还有SB都会被提取');
84 |
85 | // 自定义敏感词库
86 | // 文件方式
87 | Sensitive::custom('/config/SensitiveWord.txt')
88 | ->is('检测尼玛的语句');
89 |
90 | // 数组方式
91 | Sensitive::custom([
92 | '垃圾', '尼玛',
93 | //...
94 | ])->is('检测尼玛的语句');
95 |
96 | // 文件词库模式,可以添加新敏感词到词库文件
97 | // data参数可以是一个数组也可以是用|分割敏感词的字符串
98 | // append参数为true是追加模式,false时先提取词库,再去重,然后合并写入
99 | Sensitive::addWordToFile(data: '狗逼|傻缺', append: false);
100 | }
101 | }
102 |
103 | ```
104 | 依赖注入方式
105 | ```php
106 | use isszz\sensitive\Sensitive;
107 |
108 | class Index
109 | {
110 | public function add(Sensitive $sensitive)
111 | {
112 | // 设置干扰因子
113 | $sensitive->interferenceFactor(['(', ')', ',', ',', ';', ';', '。']);
114 | // ...
115 | }
116 | }
117 | ```
118 | 助手函数方式
119 | ```php
120 | class Index
121 | {
122 | public function add()
123 | {
124 | // 设置干扰因子,后返回的Sensitive实例可使用:is,replace,mark
125 | sensitive_interference_factor(['(', ')', ',', ',', ';', ';', '。'])
126 | ->is('检测语句尼玛');
127 |
128 | // 添加敏感词,后返回的Sensitive实例可使用:is,replace,mark
129 | sensitive_add(words: '性感美女')
130 | ->mark('你是一个性感美女,你说是不是?');
131 |
132 | // 移除敏感词,后返回的Sensitive实例可使用:is,replace,mark
133 | // 第二个参数once为true时,只针对当次: is,replace,mark,操作生效
134 | sensitive_remove(words: '工口', once: true)
135 | ->mark('你这个SB是不是想看工口类的动漫?哈哈!');
136 |
137 | // 检测敏感词
138 | if (sensitive_is('检测语句尼玛')) {
139 | return json(['code' => 1, 'msg' => '输入内容包含敏感词,请注意用词。']);
140 | }
141 |
142 | // replaceChar用来设置替换敏感词所使用的字符
143 | // repeat为true时根据检测出的敏感词长度设置replaceChar
144 | $replaced = sensitive_replace(content: '替换语句垃圾要被替换', replaceChar: '*', repeat: true);
145 | // tag参数用来设置包裹敏感词的标签名,例如: 这里<bad>SB</bad>要被标记
146 | $marked = sensitive_mark(content: '标记的内容,这里SB要被标记', tag: 'bad');
147 |
148 | // 提取内容中的所有敏感词
149 | $badWords = sensitive_get('谁是SB,谁是狗逼,谁是傻缺');
150 |
151 | // 自定义敏感词库
152 | // 文件方式
153 | sensitive_custom('/config/SensitiveWord.txt')
154 | ->is('检测尼玛的语句');
155 |
156 | // 数组方式
157 | sensitive_custom([
158 | '垃圾', '尼玛',
159 | //...
160 | ])->is('检测尼玛的语句');
161 |
162 | // 文件词库模式,可以添加新敏感词到词库文件
163 | // data参数可以是一个数组也可以是用|分割敏感词的字符串
164 | // append参数为true是追加模式,false时先提取词库,再去重,然后合并写入
165 | sensitive_add_word_to_file(data: '狗逼|傻缺', append: false);
166 |
167 | }
168 | }
169 | ```
--------------------------------------------------------------------------------
/composer.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "isszz/webman-sensitive",
3 | "description": "Webman 敏感词检测,过滤,标记",
4 | "license": "MIT",
5 | "authors": [
6 | {
7 | "name": "isszz",
8 | "email": "isszz@qq.com"
9 | }
10 | ],
11 | "require": {
12 | "php": ">=8.0.0",
13 | "workerman/webman-framework": ">=1.4"
14 | },
15 | "autoload": {
16 | "files": [
17 | "src/helpers.php"
18 | ],
19 | "psr-4": {
20 | "isszz\\sensitive\\": "src"
21 | }
22 | },
23 | "config": {
24 | "sort-packages": true
25 | },
26 | "minimum-stability": "dev",
27 | "extra": {
28 | "branch-alias": {
29 | "dev-master": "dev-dev"
30 | }
31 | }
32 | }
33 |
--------------------------------------------------------------------------------
/src/Bootstrap.php:
--------------------------------------------------------------------------------
1 | {$name}(... $arguments);
37 | }
38 | }
39 |
--------------------------------------------------------------------------------
/src/HashMap.php:
--------------------------------------------------------------------------------
1 | containsKey($key)) {
25 | $this->hashTable[$key] = $value;
26 | return null;
27 | }
28 |
29 | $_temp = $this->hashTable[$key];
30 | $this->hashTable[$key] = $value;
31 |
32 | return $_temp;
33 | }
34 |
/**
 * Look up the value stored under a key.
 *
 * @param mixed $key Key to look up.
 * @return mixed|null The stored value, or null when the key is absent.
 */
public function get($key)
{
    return $this->containsKey($key) ? $this->hashTable[$key] : null;
}
49 |
/**
 * Remove the entry stored under a key.
 *
 * The entry is dropped with unset(), which uses PHP's own key normalisation.
 * A strict comparison of the requested key against the stored keys would miss
 * numeric-string keys such as '1', because PHP stores them as integers.
 *
 * @param mixed $key Key of the entry to remove.
 * @return mixed|null The removed value, or null when the key was absent.
 */
public function remove($key)
{
    if (!$this->containsKey($key)) {
        return null;
    }

    $removed = $this->hashTable[$key];
    unset($this->hashTable[$key]);

    return $removed;
}
72 |
/**
 * List every key currently held by the map.
 *
 * @return array Keys in storage order.
 */
public function keys()
{
    $allKeys = array_keys($this->hashTable);

    return $allKeys;
}
82 |
/**
 * List every value currently held by the map.
 *
 * @return array Values in storage order, re-indexed from zero.
 */
public function values()
{
    $allValues = array_values($this->hashTable);

    return $allValues;
}
92 |
/**
 * Copy every entry of another map into this one.
 *
 * Entries are written through put(), so existing keys are overwritten.
 *
 * @param mixed $map Source map; must offer isEmpty(), size(), keys() and get().
 */
public function putAll($map)
{
    // Nothing to copy from an empty source.
    if ($map->isEmpty() || $map->size() <= 0) {
        return;
    }

    foreach ($map->keys() as $sourceKey) {
        $this->put($sourceKey, $map->get($sourceKey));
    }

    return;
}
109 |
/**
 * Remove every entry from the map.
 *
 * The table is reset to an empty array rather than null, so that size(),
 * isEmpty(), containsKey() and the other array-based methods keep working
 * afterwards (count() and array_key_exists() reject null on PHP 8).
 *
 * @return bool Always true.
 */
public function removeAll()
{
    $this->hashTable = [];

    return true;
}
120 |
/**
 * Tell whether the map holds the given value.
 *
 * Uses a loose (==) comparison. The whole table is searched; a scan driven by
 * current()/next() would stop at the first falsy value and would depend on
 * the array's internal pointer position.
 *
 * @param mixed $value Value to search for.
 * @return bool True when at least one entry loosely equals $value.
 */
public function containsValue($value)
{
    return in_array($value, $this->hashTable);
}
139 |
/**
 * Tell whether the map holds an entry for the given key.
 *
 * A key whose stored value is null still counts as present.
 *
 * @param mixed $key Key to test.
 * @return bool True when the key exists.
 */
public function containsKey($key)
{
    $present = array_key_exists($key, $this->hashTable);

    return $present;
}
150 |
/**
 * Count the entries in the map.
 *
 * @return int Number of stored entries.
 */
public function size()
{
    $entryCount = count($this->hashTable);

    return $entryCount;
}
160 |
/**
 * Tell whether the map has no entries.
 *
 * @return bool True when the map is empty.
 */
public function isEmpty()
{
    return 0 == count($this->hashTable);
}
170 | }
171 |
--------------------------------------------------------------------------------
/src/Install.php:
--------------------------------------------------------------------------------
1 | 'config/plugin/isszz/webman-sensitive',
15 | ];
16 |
/**
 * Plugin install entry point; delegates to installByRelation().
 *
 * @return void
 */
public static function install()
{
    // Late static binding, so a subclass override is honoured.
    static::installByRelation();
}
25 |
/**
 * Plugin uninstall entry point; delegates to uninstallByRelation().
 *
 * @return void
 */
public static function uninstall()
{
    // Bound to this class (self), not to a subclass.
    self::uninstallByRelation();
}
34 |
/**
 * Copy every source path listed in $pathRelation to its destination under
 * the application base path, creating the destination's parent directory
 * first when it does not exist.
 *
 * @return void
 */
public static function installByRelation()
{
    foreach (static::$pathRelation as $source => $dest) {
        $slashPos = strrpos($dest, '/');

        // A destination with a directory part needs that directory to exist.
        if ($slashPos) {
            $parentDir = base_path() . '/' . substr($dest, 0, $slashPos);
            if (!is_dir($parentDir)) {
                mkdir($parentDir, 0777, true);
            }
        }

        copy_dir(__DIR__ . '/' . $source, base_path() . '/' . $dest);
        echo "Create $dest";
    }
}
52 |
/**
 * Delete every destination listed in $pathRelation from the application
 * base path. Destinations that are neither a directory nor a file are left
 * alone.
 *
 * @return void
 */
public static function uninstallByRelation()
{
    foreach (static::$pathRelation as $source => $dest) {
        $target = base_path() . '/' . $dest;

        if (is_dir($target) || is_file($target)) {
            echo "Remove $dest";

            if (is_file($target) || is_link($target)) {
                // Plain files and links are unlinked directly.
                unlink($target);
            } else {
                remove_dir($target);
            }
        }
    }
}
72 | }
73 |
--------------------------------------------------------------------------------
/src/Sensitive.php:
--------------------------------------------------------------------------------
1 | config = $config['config'];
64 |
65 | // 配置中的干扰因子
66 | $this->interferenceFactors = $config['interference_factors'] ?? [];
67 |
68 | if ($mode == 'array') {
69 | $this->setTree($config['sensitive_words'] ?? []);
70 | } else {
71 | $this->setFile(
72 | is_file($mode) ? $mode : config_path('plugin') . DIRECTORY_SEPARATOR .'isszz'. DIRECTORY_SEPARATOR .'webman-sensitive'. DIRECTORY_SEPARATOR .'SensitiveWord.txt'
73 | );
74 | }
75 | }
76 |
/**
 * Shorthand for check().
 *
 * @param string $content Text to inspect.
 *
 * @return bool Whatever check() returns for $content.
 * @throws \isszz\sensitive\SensitiveException
 */
public function is(string $content)
{
    $result = $this->check($content);

    return $result;
}
89 |
/**
 * Tell whether the content contains at least one sensitive word.
 *
 * Every character position is tried as a possible word start. Characters
 * registered as interference factors are skipped while walking the word tree.
 *
 * Words removed with once = true are restored before returning, on both the
 * "found" and the "not found" path. No start position is skipped after a
 * failed partial match, so a word beginning inside the failed prefix is still
 * detected.
 *
 * @param string $content Text to inspect.
 *
 * @return bool True when a sensitive word is found, false otherwise.
 * @throws \isszz\sensitive\SensitiveException Propagated from recoverRemove().
 */
public function check(string $content)
{
    $this->contentLength = sensitive_mb_strlen($content, 'utf-8');
    $found = false;

    for ($start = 0; $start < $this->contentLength && !$found; $start++) {
        $node = $this->wordTree;

        for ($i = $start; $i < $this->contentLength; $i++) {
            $keyChar = mb_substr($content, $i, 1, 'utf-8');

            // Interference characters never take part in the tree lookup.
            if ($this->checkInterferenceFactor($keyChar)) {
                continue;
            }

            $node = $node->get($keyChar);

            // No child for this character: no word starts at $start.
            if (empty($node)) {
                break;
            }

            // Reached the end marker of a stored word.
            if ($node->get('isEnd') !== false) {
                $found = true;
                break;
            }
        }
    }

    $this->recoverRemove();

    return $found;
}
146 |
/**
 * Replace every sensitive word found in the content.
 *
 * All words are replaced in a single strtr() pass: longer words win over
 * shorter ones that they contain, and text that has already been replaced is
 * never scanned again.
 *
 * @param string $content     Text to process; must not be the empty string.
 * @param string $replaceChar Replacement character; '' falls back to the
 *                            configured 'replace_char' (or '*' if unset).
 * @param bool   $repeat      When true, the replacement is repeated to the
 *                            length of the word. When false, the configured
 *                            'repeat' value is used instead.
 * @param int    $matchType   Match type passed to get(); 1 is minimal matching.
 *
 * @return mixed The content with sensitive words replaced.
 * @throws \isszz\sensitive\SensitiveException When $content is empty.
 */
public function replace(string $content, string $replaceChar = '', bool $repeat = false, $matchType = 1)
{
    // Compare with '' explicitly: empty('0') would reject the valid text "0".
    if ($content === '') {
        throw new SensitiveException('Please fill in the content of the test', 1);
    }

    // '0' is a legitimate replacement character, so only '' triggers the fallback.
    if ($replaceChar === '') {
        $replaceChar = $this->config['replace_char'] ?? '*';
    }

    if (!$repeat) {
        $repeat = $this->config['repeat'] ?? false;
    }

    $badWordList = $this->badWordList ? $this->badWordList : $this->get($content, $matchType);

    // Nothing detected: hand the content back untouched.
    if (empty($badWordList)) {
        return $content;
    }

    $replacements = [];
    foreach ($badWordList as $badWord) {
        if ($badWord === '') {
            continue;
        }

        $replacements[$badWord] = $repeat
            ? $this->dfaBadWordConversChars($badWord, $replaceChar)
            : $replaceChar;
    }

    return $replacements ? strtr($content, $replacements) : $content;
}
189 |
/**
 * Wrap every sensitive word found in the content in an HTML-style tag pair,
 * e.g. <mark>word</mark>.
 *
 * All words are wrapped in a single strtr() pass, so a word that also occurs
 * inside the tag name or inside another word cannot corrupt markup that has
 * already been inserted.
 *
 * @param string $content   Text to process; must not be the empty string.
 * @param string $tag       Tag name; '' falls back to the configured 'mark'
 *                          value (or 'mark' if unset).
 * @param int    $matchType Match type passed to get(); 1 is minimal matching.
 *
 * @return mixed The content with sensitive words wrapped.
 * @throws \isszz\sensitive\SensitiveException When $content is empty.
 */
public function mark(string $content, string $tag = '', $matchType = 1)
{
    // Compare with '' explicitly: empty('0') would reject the valid text "0".
    if ($content === '') {
        throw new SensitiveException('Please fill in the content of the test', 1);
    }

    if (!$tag) {
        $tag = $this->config['mark'] ?? 'mark';
    }

    $sTag = '<' . $tag . '>';
    $eTag = '</' . $tag . '>';

    $badWordList = $this->badWordList ? $this->badWordList : $this->get($content, $matchType);

    // Nothing detected: hand the content back untouched.
    if (empty($badWordList)) {
        return $content;
    }

    // Keying by word also removes duplicates.
    $replacements = [];
    foreach ($badWordList as $badWord) {
        if ($badWord !== '') {
            $replacements[$badWord] = $sTag . $badWord . $eTag;
        }
    }

    return $replacements ? strtr($content, $replacements) : $content;
}
230 |
/**
 * Extract the sensitive words that occur in the content.
 *
 * Each position is tried as a word start. Interference characters are skipped
 * during the tree walk but counted in the matched span; leading ones are
 * trimmed from the reported word by ltrimInterferenceFactorBadWord().
 *
 * The reported span always ends at the last character that completed a word.
 * In non-minimal matching the scan continues after a complete word, and the
 * extra characters consumed by a longer candidate that never completes (or by
 * trailing interference characters) are not included.
 *
 * Words removed with once = true are restored before returning, including on
 * the early return taken when $wordNum is reached.
 *
 * @param string $content   Text to inspect.
 * @param int    $matchType 1 stops at the first complete word (minimal
 *                          matching); any other value keeps extending.
 * @param int    $wordNum   Maximum number of words to return; 0 means all.
 *
 * @return array List of sensitive words found, in order of appearance.
 * @throws \isszz\sensitive\SensitiveException Propagated from recoverRemove().
 */
public function get(string $content, $matchType = 1, $wordNum = 0)
{
    $this->contentLength = sensitive_mb_strlen($content, 'utf-8');
    $badWordList = [];

    for ($start = 0; $start < $this->contentLength; $start++) {
        $scanned = 0;       // characters consumed from $start so far
        $matchedLength = 0; // span length at the last completed word
        $node = $this->wordTree;

        for ($i = $start; $i < $this->contentLength; $i++) {
            $keyChar = mb_substr($content, $i, 1, 'utf-8');

            // Interference characters are consumed without a tree lookup.
            if ($this->checkInterferenceFactor($keyChar)) {
                $scanned++;
                continue;
            }

            $child = $node->get($keyChar);

            // No child for this character: stop extending.
            if (empty($child)) {
                break;
            }

            $node = $child;
            $scanned++;

            if ($child->get('isEnd') === false) {
                continue;
            }

            // A stored word ends here; remember how far it reaches.
            $matchedLength = $scanned;

            // Minimal matching stops at the first complete word.
            if ($matchType === 1) {
                break;
            }
        }

        if ($matchedLength > 0) {
            $badWordList[] = $this->ltrimInterferenceFactorBadWord(
                mb_substr($content, $start, $matchedLength, 'utf-8')
            );

            // Requested number of words reached.
            if ($wordNum > 0 && count($badWordList) == $wordNum) {
                $this->recoverRemove();

                return $badWordList;
            }

            // Continue right after the matched span.
            $start += $matchedLength - 1;
        }
    }

    $this->recoverRemove();

    return $badWordList;
}
308 |
/**
 * Add extra sensitive words to the word tree.
 *
 * @param string|array $words A single word, several words joined with '|',
 *                            or an array of words.
 *
 * @return $this
 * @throws \isszz\sensitive\SensitiveException When the word tree is not set.
 */
public function add(string|array $words)
{
    if (!$this->wordTree) {
        throw new SensitiveException('Please initialize Sensitive first', 6);
    }

    $wordList = (is_string($words) && str_contains($words, '|'))
        ? explode('|', $words)
        : (array) $words;

    foreach ($wordList as $entry) {
        $this->buildWordToTree($entry);
    }

    return $this;
}
332 |
/**
 * Remove sensitive words from the word tree.
 *
 * @param string|array $words A single word, several words joined with '|',
 *                            or an array of words.
 * @param bool         $once  Passed through to removeToTree() for each word.
 *
 * @return $this
 * @throws \isszz\sensitive\SensitiveException When the word tree is not set.
 */
public function remove(string|array $words, bool $once = false)
{
    if (!$this->wordTree) {
        throw new SensitiveException('Please initialize Sensitive first', 6);
    }

    $wordList = (is_string($words) && str_contains($words, '|'))
        ? explode('|', $words)
        : (array) $words;

    foreach ($wordList as $entry) {
        $this->removeToTree($entry, $once);
    }

    return $this;
}
357 |
/**
 * Remove a single word from the word tree.
 *
 * The tree is walked along the characters of the word and, if the word is
 * stored, its end marker is cleared. Only that word stops matching; other
 * words that share characters or a prefix with it are left intact. Deleting
 * the word's characters from the root of the tree would instead drop every
 * word that starts with any of those characters.
 *
 * @param string $word Word to remove.
 * @param bool   $once When true and the word was stored, it is queued in
 *                     $removeList so that recoverRemove() adds it back.
 *
 * @return $this
 */
public function removeToTree(string $word, bool $once = false)
{
    $length = sensitive_mb_strlen($word, 'utf-8');
    $node = $this->wordTree;

    for ($i = 0; $i < $length; $i++) {
        $node = $node->get(mb_substr($word, $i, 1, 'utf-8'));

        // Path breaks off: the word is not stored, nothing to remove.
        if (empty($node)) {
            return $this;
        }
    }

    // Empty word, or the path exists only as a prefix of longer words.
    if ($length < 1 || $node->get('isEnd') !== true) {
        return $this;
    }

    $node->put('isEnd', false);

    // Queue for restoration after the next detection run.
    if ($once === true) {
        $this->removeList[] = $word;
    }

    return $this;
}
375 |
/**
 * Re-add the words queued in $removeList and clear the queue.
 *
 * @return mixed True when words were restored, false when the queue was empty.
 */
public function recoverRemove()
{
    if ($this->removeList) {
        $this->add($this->removeList);
        $this->removeList = [];

        return true;
    }

    return false;
}
392 |
/**
 * Load a custom word source: a string is treated as a file path and handed
 * to setFile(), an array is handed to setTree().
 *
 * @param string|array $custom File path or list of words.
 *
 * @return $this
 * @throws \isszz\sensitive\SensitiveException
 */
public function custom(string|array $custom)
{
    if (is_string($custom)) {
        $this->setFile($custom);
    } else {
        // The parameter type leaves array as the only other possibility.
        $this->setTree($custom);
    }

    return $this;
}
413 |
/**
 * Load sensitive words from a file, one word per line, into the word tree.
 *
 * An existing tree is kept and extended; a new one is created only when
 * none exists yet.
 *
 * @param string $file Path of the word file.
 *
 * @return $this
 * @throws \isszz\sensitive\SensitiveException When the file does not exist.
 */
public function setFile(string $file)
{
    if (!is_file($file)) {
        throw new SensitiveException('The sensitive words file does not exist', 3);
    }

    $file = $this->getSensitiveWordPath($file);

    if (!$this->wordTree) {
        $this->wordTree = new HashMap;
    }

    foreach ($this->yieldToReadFile($file) as $line) {
        $this->buildWordToTree(trim($line));
    }

    return $this;
}
438 |
/**
 * Load sensitive words from an array into the word tree.
 *
 * An existing tree is kept and extended; a new one is created only when
 * none exists yet.
 *
 * @param array|null $sensitiveWords List of words.
 *
 * @return $this
 * @throws \isszz\sensitive\SensitiveException When the list is empty or null.
 */
public function setTree(array|null $sensitiveWords = null)
{
    if (empty($sensitiveWords)) {
        throw new SensitiveException('The sensitive words cannot be empty', 2);
    }

    if (!$this->wordTree) {
        $this->wordTree = new HashMap;
    }

    foreach ($sensitiveWords as $entry) {
        $this->buildWordToTree(trim($entry));
    }

    return $this;
}
461 |
/**
 * Register additional interference factors (characters ignored while
 * matching). New entries are merged with the existing ones and duplicates
 * are dropped.
 *
 * @param array $interferenceFactors Characters to add.
 *
 * @return $this
 */
public function interferenceFactor(array $interferenceFactors)
{
    $merged = array_merge($this->interferenceFactors, $interferenceFactors);
    $this->interferenceFactors = array_unique($merged);

    return $this;
}
475 |
/**
 * Strip leading interference factors from a detected word.
 *
 * The word is processed character by character (UTF-8), so multi-byte
 * interference factors are handled as whole characters. A byte-wise ltrim()
 * with a concatenated character list could cut the first bytes off a
 * multi-byte word. Only characters registered as interference factors are
 * removed.
 *
 * @param string $word Detected word, possibly preceded by interference factors.
 *
 * @return string The word without leading interference factors.
 */
public function ltrimInterferenceFactorBadWord(string $word)
{
    while ($word !== '' && $this->checkInterferenceFactor(mb_substr($word, 0, 1, 'utf-8'))) {
        $word = mb_substr($word, 1, null, 'utf-8');
    }

    return $word;
}
492 |
/**
 * Write new sensitive words to the word file.
 *
 * The target is the configured 'mode' value when it is an existing file,
 * otherwise config/plugin/isszz/webman-sensitive/SensitiveWord.txt.
 *
 * @param string|array $data   A word, several words joined with '|', or an
 *                             array of words. Empty entries are dropped.
 * @param bool         $append True appends the words to the file. False reads
 *                             the current words, merges in the new ones,
 *                             removes duplicates and rewrites the file.
 *
 * @return bool True when the file was written.
 * @throws \isszz\sensitive\SensitiveException In array mode, or when the
 *                                             word file does not exist.
 */
public function addWordToFile(string|array $data, bool $append = true)
{
    $mode = config('plugin.isszz.webman-sensitive.app.mode', 'file');

    if ($mode == 'array') {
        throw new SensitiveException('Array mode cannot be added', 8);
    }

    if (is_file($mode)) {
        $file = $mode;
    } else {
        $file = implode(DIRECTORY_SEPARATOR, [
            config_path('plugin'),
            'isszz',
            'webman-sensitive',
            'SensitiveWord.txt',
        ]);
    }

    if (!is_file($file)) {
        throw new SensitiveException('Sensitive thesaurus file does not exist', 7);
    }

    $file = $this->getSensitiveWordPath($file);

    if (is_string($data) && str_contains($data, '|')) {
        $data = explode('|', $data);
    }

    $newWords = array_filter((array) $data);

    if ($append === true) {
        // Append mode: add the words after the current file content.
        $payload = PHP_EOL . implode(PHP_EOL, $newWords);
        $written = file_put_contents($file, $payload, FILE_APPEND) !== false;
    } else {
        // Rewrite mode: merge with the existing words and drop duplicates.
        $existing = array_map('trim', iterator_to_array($this->yieldToReadFile($file), false));
        $merged = array_unique(array_merge($existing, $newWords));
        $written = file_put_contents($file, implode(PHP_EOL, $merged)) !== false;
    }

    // Refresh the copy kept for phar execution.
    if ($written) {
        $this->getSensitiveWordPath($file, true);
    }

    return $written;
}
543 |
/**
 * Lazily read the word file, yielding one cleaned line at a time.
 *
 * Single quotes, spaces, commas and PHP_EOL are removed from every line.
 * The file handle is closed in a finally block, so it is released even when
 * the consumer stops iterating early. The loop ends as soon as fgets()
 * reports end of file or a read error.
 *
 * @param string $file Path of the word file.
 *
 * @return \Generator Yields the cleaned lines as strings.
 * @throws \isszz\sensitive\SensitiveException When the file cannot be opened.
 */
protected function yieldToReadFile(string $file)
{
    $handle = fopen($file, 'r');

    if (!$handle) {
        throw new SensitiveException('Read file failed', 4);
    }

    try {
        while (($line = fgets($handle)) !== false) {
            yield str_replace(['\'', ' ', PHP_EOL, ','], '', $line);
        }
    } finally {
        fclose($handle);
    }
}
570 |
/**
 * Insert one word into the word tree, one character per level, and flag the
 * node of its last character with isEnd = true. An empty word is ignored.
 *
 * @param string $word Word to insert.
 */
protected function buildWordToTree(string $word = '')
{
    if ($word === '') {
        return;
    }

    $node = $this->wordTree;
    $charCount = sensitive_mb_strlen($word, 'utf-8');

    for ($pos = 0; $pos < $charCount; $pos++) {
        $char = mb_substr($word, $pos, 1, 'utf-8');
        $child = $node->get($char);

        if (!$child) {
            // First time this character appears at this level: create its node.
            $child = new HashMap;
            $child->put('isEnd', false);
            $node->put($char, $child);
        }

        $node = $child;

        // Last character: mark the end of a complete word.
        if ($pos == $charCount - 1) {
            $node->put('isEnd', true);
        }
    }

    return;
}
609 |
/**
 * Build the replacement for a word by repeating the replacement character
 * once per character of the word.
 *
 * @param mixed $word Word being replaced.
 * @param mixed $char Replacement character.
 *
 * @return string $char repeated to the length of $word.
 */
protected function dfaBadWordConversChars($word, $char)
{
    $repeatCount = sensitive_mb_strlen($word, 'utf-8');

    return str_repeat((string) $char, $repeatCount);
}
629 |
/**
 * Tell whether a character is one of the registered interference factors.
 * The comparison is loose, not strict.
 *
 * @param string $word Character to test.
 *
 * @return bool True when the character is an interference factor.
 */
protected function checkInterferenceFactor(string $word)
{
    $isFactor = in_array($word, $this->interferenceFactors, false);

    return $isFactor;
}
641 |
/**
 * Resolve the path from which the word file should be read.
 *
 * Outside a running phar the given path is returned unchanged. Inside a
 * phar the file is copied to the system temp directory (under its base
 * name) and the path of that copy is returned. The copy is made when the
 * path has not been resolved before, when the copy is missing, or when
 * $update is true.
 *
 * @param mixed $path   Path of the word file.
 * @param mixed $update Pass true to force a fresh copy.
 * @return string Path to read the word file from.
 */
protected static function getSensitiveWordPath($path, $update = false)
{
    static $pathMaps = [];

    $inPhar = class_exists(\Phar::class, false) && \Phar::running();
    if (!$inPhar) {
        return $path;
    }

    $tmpPath = sys_get_temp_dir() ?: '/tmp';
    $filePath = "$tmpPath/" . basename($path);
    clearstatcache();

    $copyIsUsable = isset($pathMaps[$path]) && is_file($filePath);
    if (!$copyIsUsable || $update === true) {
        file_put_contents($filePath, file_get_contents($path));
        $pathMaps[$path] = $filePath;
    }

    return $pathMaps[$path];
}
664 | }
665 |
--------------------------------------------------------------------------------
/src/SensitiveException.php:
--------------------------------------------------------------------------------
1 | true,
5 |
6 | // 支持file,array,也可以指向自己敏感词库文件路径
7 | // file模式时,敏感词库位于webman根目录的config/plugin/isszz/webman-sensitive/SensitiveWord.txt,也可以指向自定义的词库文件路径
8 | 'mode' => 'file',
9 | 'config' => [
10 | 'repeat' => true, // 重复替换为敏感词相同长度的字符
11 | 'replace_char' => '*', // 替换字符
12 | // 标记敏感词时使用的标签名,例如 mark 会生成 <mark>敏感词</mark>
13 | 'mark' => 'mark',
14 | ],
15 |
16 | // 干扰因子
17 | 'interference_factors' => [
18 | ' ', '&', '*', '/', '|', '@', '.', '^', '~', '$',
19 | ],
20 |
21 | // 数组模式敏感词
22 | 'sensitive_words' => [
23 | '工口',
24 | '里番',
25 | '性感美女',
26 | ]
27 | ];
28 |
--------------------------------------------------------------------------------
/src/config/plugin/isszz/webman-sensitive/bootstrap.php:
--------------------------------------------------------------------------------
1 | {$name}(... $arguments);
22 | }
23 | }
24 |
--------------------------------------------------------------------------------
/src/helpers.php:
--------------------------------------------------------------------------------
1 |