├── doc ├── issues │ ├── v1-基本敏感词的去重.md │ ├── v4-用户自定义白名单.md │ ├── v3-用户自定义敏感词.md │ ├── roadmap │ │ ├── v009-自定义黑名单的处理.md │ │ ├── v010-自定义白名单的处理.md │ │ ├── v006-繁简体转换实现.md │ │ ├── v005-数字的转换实现.md │ │ ├── v016-自定义降噪处理.md │ │ ├── v015-镜像反转处理.md │ │ ├── v014-形近字的处理.md │ │ ├── v004-实现标点英文全角半角转换.md │ │ ├── v012-停顿词的处理.md │ │ ├── v007-重复词的处理.md │ │ ├── v014-声近字的处理.md │ │ └── v011-邮箱网址Regex检测实现.md │ ├── issues.md │ ├── 关联框架.md │ ├── v5-基本敏感词的标签.md │ └── v2-基本敏感词的简化.md └── 发布流程.md ├── .coveralls.yml ├── src ├── test │ ├── resources │ │ ├── sensitive_word_allow.txt │ │ ├── sensitive_word_deny.txt │ │ └── dict_tag_test.txt │ └── java │ │ └── com │ │ └── github │ │ └── houbb │ │ └── sensitive │ │ └── word │ │ ├── bugs │ │ ├── package-info.java │ │ ├── b20211211 │ │ │ ├── MyWordAllow.java │ │ │ ├── MyWordDeny.java │ │ │ └── MySensitiveTest.java │ │ ├── b32 │ │ │ ├── MyWordDenyChineseNum.java │ │ │ └── MyWordDenyChineseTest.java │ │ ├── b31 │ │ │ ├── BugWeixieTest.java │ │ │ └── Bug31Test.java │ │ ├── b29 │ │ │ └── Bug29Test.java │ │ ├── b55 │ │ │ └── Bug55Test.java │ │ └── b118 │ │ │ └── Bug118Test.java │ │ ├── support │ │ ├── package-info.java │ │ ├── format │ │ │ └── package-info.java │ │ ├── resultcondition │ │ │ ├── MyWordTag.java │ │ │ └── WordTagsTest.java │ │ └── handler │ │ │ └── WordResultHandlerTest.java │ │ ├── define │ │ ├── MyWordAllow.java │ │ ├── MyWordDeny.java │ │ └── SensitiveWordBsDefineTest.java │ │ ├── spring │ │ ├── annotation │ │ │ ├── Component.java │ │ │ ├── Autowired.java │ │ │ ├── Bean.java │ │ │ └── Configuration.java │ │ ├── database │ │ │ ├── MyDdWordAllow.java │ │ │ └── MyDdWordDeny.java │ │ ├── service │ │ │ └── SensitiveWordService.java │ │ └── SpringSensitiveWordConfig.java │ │ ├── bs │ │ ├── SensitiveWordBsDestroyTest.java │ │ ├── SensitiveWordBsChineseTest.java │ │ ├── SensitiveWordBsEnglishTest.java │ │ ├── SensitiveWordBsUserDefineTest.java │ │ ├── SensitiveWordBsRepeatTest.java │ │ ├── SensitiveWordBsDataTest.java │ │ ├── 
SensitiveWordBsSystemDictTest.java │ │ ├── SensitiveWordBsIpv4Test.java │ │ ├── SensitiveWordBsNumLenTest.java │ │ ├── SensitiveWordBsNumTest.java │ │ ├── SensitiveWordBsUrlNoPrefixTest.java │ │ ├── SensitiveWordBsEmailTest.java │ │ ├── SensitiveWordBsUrlTest.java │ │ ├── SensitiveWordBsIgnoreCharTest.java │ │ ├── SensitiveWordBsAllowTest.java │ │ └── SensitiveWordBsTagTest.java │ │ ├── data │ │ ├── WordCountDto.java │ │ ├── DictRemoveSingleTest.java │ │ ├── DictRemoveTwoEnglishTest.java │ │ ├── DictRemoveCommonITUsageTest.java │ │ ├── DictNumTest.java │ │ ├── StopWordTest.java │ │ ├── DataUtil.java │ │ └── NumUtilTest.java │ │ ├── replace │ │ └── MyWordReplace.java │ │ ├── memory │ │ └── DataMemoryTest.java │ │ └── benchmark │ │ ├── CharUtilPerfTest.java │ │ └── BenchmarkTimesTest.java └── main │ └── java │ └── com │ └── github │ └── houbb │ └── sensitive │ └── word │ ├── package-info.java │ ├── support │ ├── package-info.java │ ├── combine │ │ ├── package-info.java │ │ ├── check │ │ │ ├── WordCheckCombines.java │ │ │ ├── AbstractWordCheckCombine.java │ │ │ └── WordCheckCombine.java │ │ ├── allowdeny │ │ │ ├── WordAllowDenyCombines.java │ │ │ ├── WordAllowDenyCombine.java │ │ │ └── AbstractWordAllowDenyCombine.java │ │ └── format │ │ │ ├── WordFormatCombines.java │ │ │ ├── AbstractWordFormatCombine.java │ │ │ └── WordFormatCombine.java │ ├── warmup │ │ ├── WordWarmUps.java │ │ └── WordWarmUpDefault.java │ ├── tag │ │ ├── NoneWordTag.java │ │ ├── WordTagMap.java │ │ ├── WordTagSystem.java │ │ ├── AbstractWordTag.java │ │ ├── FileWordTag.java │ │ ├── AbstractWordTagInit.java │ │ └── WordTagLines.java │ ├── format │ │ ├── mapping │ │ │ ├── WordFormatTexts.java │ │ │ ├── AbstractWordFormatText.java │ │ │ └── WordFormatTextDefault.java │ │ ├── WordFormatNone.java │ │ ├── WordFormatIgnoreCase.java │ │ ├── WordFormatIgnoreWidth.java │ │ ├── WordFormatIgnoreChineseStyle.java │ │ ├── WordFormatArray.java │ │ ├── WordFormatIgnoreEnglishStyleC2C.java │ │ ├── 
WordFormatIgnoreEnglishStyle.java │ │ ├── WordFormats.java │ │ ├── WordFormatIgnoreNumStyle.java │ │ └── WordFormatIgnoreNumStyleC2C.java │ ├── ignore │ │ ├── NoneSensitiveWordCharIgnore.java │ │ ├── SensitiveWordCharIgnores.java │ │ ├── AbstractSensitiveWordCharIgnore.java │ │ └── SpecialCharSensitiveWordCharIgnore.java │ ├── deny │ │ ├── WordDenyEmpty.java │ │ ├── WordDenySystem.java │ │ ├── WordDenyInit.java │ │ └── WordDenys.java │ ├── allow │ │ ├── WordAllowEmpty.java │ │ ├── WordAllowSystem.java │ │ ├── WordAllowInit.java │ │ └── WordAllows.java │ ├── resultcondition │ │ ├── WordResultConditionAlwaysTrue.java │ │ ├── AbstractWordResultCondition.java │ │ ├── WordResultConditionInit.java │ │ ├── WordResultConditionWordTagsMatch.java │ │ ├── WordResultConditionEnglishWordNumMatch.java │ │ ├── WordResultConditionEnglishWordMatch.java │ │ └── WordResultConditions.java │ ├── data │ │ ├── WordDatas.java │ │ ├── WordDataTreeNode.java │ │ └── AbstractWordData.java │ ├── check │ │ ├── WordCheckUrlNoPrefix.java │ │ ├── WordCheckNone.java │ │ ├── WordCheckNum.java │ │ ├── WordCheckArray.java │ │ ├── WordCheckInit.java │ │ ├── WordCheckResult.java │ │ ├── WordCheckEmail.java │ │ ├── AbstractWordCheck.java │ │ ├── WordChecks.java │ │ ├── WordCheckUrl.java │ │ └── WordCheckIPV4.java │ ├── result │ │ ├── AbstractWordResultHandler.java │ │ ├── WordTagsDto.java │ │ ├── WordResultHandlerRaw.java │ │ ├── WordResultHandlerWord.java │ │ ├── WordResultHandlers.java │ │ ├── WordResultHandlerWordTags.java │ │ ├── WordResult.java │ │ └── WordLengthResult.java │ └── replace │ │ ├── WordReplaces.java │ │ └── WordReplaceChar.java │ ├── api │ ├── package-info.java │ ├── ISensitiveWordDestroy.java │ ├── IWordAllow.java │ ├── IWordDeny.java │ ├── IWordTag.java │ ├── IWordFormatText.java │ ├── combine │ │ ├── IWordCheckCombine.java │ │ ├── IWordFormatCombine.java │ │ └── IWordAllowDenyCombine.java │ ├── IWordFormat.java │ ├── IWordResultHandler.java │ ├── ISensitiveWordCharIgnore.java │ ├── 
IWordReplace.java │ ├── IWordResult.java │ ├── IWordWarmUp.java │ ├── IWordResultCondition.java │ ├── IWordCheck.java │ ├── IWordData.java │ ├── ISensitiveWord.java │ └── context │ │ └── InnerSensitiveWordContext.java │ ├── constant │ ├── package-info.java │ ├── enums │ │ ├── WordContainsTypeEnum.java │ │ ├── WordValidModeEnum.java │ │ ├── WordTypeEnum.java │ │ └── WordTagType.java │ └── WordConst.java │ ├── core │ └── SensitiveWords.java │ ├── exception │ └── SensitiveWordException.java │ ├── utils │ ├── InnerStreamUtils.java │ ├── InnerWordCharUtils.java │ ├── InnerWordTagUtils.java │ ├── InnerCharUtils.java │ └── InnerWordFormatUtils.java │ └── collection │ └── Char2CharMap.java ├── WECHAT.png ├── lmxxf_reword.png ├── .idea └── vcs.xml ├── .travis.yml ├── cgit.sh ├── cgit.bat ├── .gitignore ├── release.bat ├── release_rm.sh └── release.sh /doc/issues/v1-基本敏感词的去重.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.coveralls.yml: -------------------------------------------------------------------------------- 1 | service_name: travis-ci -------------------------------------------------------------------------------- /doc/issues/v4-用户自定义白名单.md: -------------------------------------------------------------------------------- 1 | # 用户自定义 2 | 3 | 自定义白名单 -------------------------------------------------------------------------------- /src/test/resources/sensitive_word_allow.txt: -------------------------------------------------------------------------------- 1 | gender -------------------------------------------------------------------------------- /src/test/resources/sensitive_word_deny.txt: -------------------------------------------------------------------------------- 1 | 敏感词 2 | 自定义敏感词 3 | -------------------------------------------------------------------------------- /doc/issues/v3-用户自定义敏感词.md: 
-------------------------------------------------------------------------------- 1 | # 用户自定义 2 | 3 | 自定义敏感词 4 | 5 | 可以指定类别。 -------------------------------------------------------------------------------- /WECHAT.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/houbb/sensitive-word/HEAD/WECHAT.png -------------------------------------------------------------------------------- /doc/issues/roadmap/v009-自定义黑名单的处理.md: -------------------------------------------------------------------------------- 1 | 初始化构造的时候,加入即可。 2 | 3 | 4 | 这个后期可以添加一个开关控制。 -------------------------------------------------------------------------------- /doc/issues/roadmap/v010-自定义白名单的处理.md: -------------------------------------------------------------------------------- 1 | 如果敏感词在这个列表中,则认为没有命中,直接跳过。 2 | 3 | 获取在构造的时候,直接移除。 -------------------------------------------------------------------------------- /lmxxf_reword.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/houbb/sensitive-word/HEAD/lmxxf_reword.png -------------------------------------------------------------------------------- /src/test/resources/dict_tag_test.txt: -------------------------------------------------------------------------------- 1 | 五星红旗 政治,国家 2 | 毛主席 政治,国家,伟人 3 | 天安门 政治,国家,地址 4 | 0售 广告,网络 -------------------------------------------------------------------------------- /doc/issues/roadmap/v006-繁简体转换实现.md: -------------------------------------------------------------------------------- 1 | # 在遍历的时候 2 | 3 | 如果是中文,则直接进行替换。 4 | 5 | # 忽略英文的写法样式 6 | 7 | -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/package-info.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word; 
-------------------------------------------------------------------------------- /doc/issues/roadmap/v005-数字的转换实现.md: -------------------------------------------------------------------------------- 1 | # 转换为数字 2 | 3 | 所有中文/符号转换为数字。 4 | 5 | # 是否为多个数字的判断 6 | 7 | 连续超过 6 位的数字。 -------------------------------------------------------------------------------- /src/test/java/com/github/houbb/sensitive/word/bugs/package-info.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.bugs; -------------------------------------------------------------------------------- /doc/issues/roadmap/v016-自定义降噪处理.md: -------------------------------------------------------------------------------- 1 | 有时候噪音是恶意插入的,程序本身难以辨认。 2 | 3 | 比如: 4 | 5 | ``` 6 | 123我是噪音456我是噪音789 7 | ```` 8 | 9 | -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/support/package-info.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.support; -------------------------------------------------------------------------------- /src/test/java/com/github/houbb/sensitive/word/support/package-info.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.support; -------------------------------------------------------------------------------- /src/test/java/com/github/houbb/sensitive/word/support/format/package-info.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.support.format; -------------------------------------------------------------------------------- /doc/issues/roadmap/v015-镜像反转处理.md: -------------------------------------------------------------------------------- 1 | # 这里比较消耗性能 2 | 3 | 主要针对国骂+政治非常敏感的个别名词。 4 | 5 | 你大爷 6 | 7 | 
一句话如果反转之后是敏感词,那应该就是敏感词。 8 | 9 | 这个不是很着急用。 -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/api/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * api 定义 3 | * @since 0.0.1 4 | */ 5 | package com.github.houbb.sensitive.word.api; -------------------------------------------------------------------------------- /doc/issues/issues.md: -------------------------------------------------------------------------------- 1 | # 转换 2 | 3 | ## 拼音 4 | 5 | 首字母 6 | 7 | ## 繁简体 8 | 9 | ## 名称翻转 10 | 11 | ## stop-word 12 | 13 | ## 重复词 14 | 15 | ffffuuuucccckkk 16 | -------------------------------------------------------------------------------- /doc/issues/roadmap/v014-形近字的处理.md: -------------------------------------------------------------------------------- 1 | # 形近字 2 | 3 | 比如:王 玉 这种。 4 | 5 | 这种相对而言比较难,需要有一张完整的近似表。 6 | 7 | # 组合字 8 | 9 | 甚至包含偏旁部首: 10 | 11 | 如 `法`==》【氵去】【水去】等等。 12 | 13 | 这种可以通过原来的字直接进行拆分。 -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/support/combine/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * @author d 3 | * @since 1.0.0 4 | */ 5 | package com.github.houbb.sensitive.word.support.combine; 6 | -------------------------------------------------------------------------------- /doc/issues/roadmap/v004-实现标点英文全角半角转换.md: -------------------------------------------------------------------------------- 1 | # 字符 2 | 3 | 全部使用小写+半角的形式匹配。 4 | 5 | ## 忽略大小写 6 | 7 | if(Character.isLetter) { 8 | ignoreCase=true 9 | ignoreWidth=true 10 | } 11 | 12 | 13 | -------------------------------------------------------------------------------- /doc/issues/roadmap/v012-停顿词的处理.md: -------------------------------------------------------------------------------- 1 | # 标点符号 2 | 3 | 无论中文英文数字,其中特殊符号一定是停顿词。 4 | 5 | 
可以这么粗略地认为。 6 | 7 | # 英文 8 | 9 | 核心是英文停顿词。 10 | 11 | # 中文 12 | 13 | 如果是数字,则中文就会成为停顿词。 14 | 15 | 有这些字符,直接跳过。 -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: java 2 | jdk: 3 | - oraclejdk7 4 | install: mvn install -DskipTests=true -Dmaven.javadoc.skip=true 5 | script: mvn test 6 | after_success: 7 | - mvn clean cobertura:cobertura coveralls:report 8 | 9 | -------------------------------------------------------------------------------- /doc/issues/关联框架.md: -------------------------------------------------------------------------------- 1 | stop-word 2 | 3 | 拼音 4 | 5 | 繁简体转换 6 | 7 | 全角半角转换 8 | 9 | 重复词 10 | 11 | # 其他 12 | 13 | 中文英文转换(待定) 14 | 15 | 手写 Regex 16 | 17 | 分词 18 | 19 | ## 核心原理 20 | 21 | DFA 算法 22 | 23 | 根据有穷状态机去处理。 -------------------------------------------------------------------------------- /doc/issues/roadmap/v007-重复词的处理.md: -------------------------------------------------------------------------------- 1 | ffffuuuuccckkk 2 | 3 | 直接认为是 4 | 5 | f xxx 6 | 7 | x 如果和上一个字符一样,则直接忽略。 8 | 9 | # 细节 10 | 11 | 当开启的时候,如果在敏感词获取的时候,如果下一个字没有找到,则进行去重。 12 | 13 | 即如果当前字符和上一个字符完全一样,则直接跳过。(仅仅在没有匹配的场景下) -------------------------------------------------------------------------------- /cgit.sh: -------------------------------------------------------------------------------- 1 | # 提交 2 | 3 | git pull 4 | git add . 5 | git commit -m "[Feature] add for new" 6 | git push 7 | git status 8 | 9 | # 1. 赋予权限: chmod +x ./cgit.sh 10 | # 2. 
执行: ./cgit.sh 11 | # Last Update Time: 2018-11-21 21:55:38 12 | # Author: houbb -------------------------------------------------------------------------------- /cgit.bat: -------------------------------------------------------------------------------- 1 | :: 用于提交当前变更(windows) 2 | :: author: houbb 3 | :: LastUpdateTime: 2018-11-22 09:08:52 4 | :: 用法:双击运行,或者当前路径 cmd 直接输入 .\cgit.bat 5 | 6 | git pull 7 | git add . 8 | git commit -m "[Feature] add for new" 9 | git push 10 | git status 11 | 12 | -------------------------------------------------------------------------------- /doc/issues/v5-基本敏感词的标签.md: -------------------------------------------------------------------------------- 1 | 一个词可以有多个标签。 2 | 3 | 为基本词语做标签标记。 4 | 5 | # privacy 6 | 7 | 个人隐私 8 | 9 | # sex 10 | 11 | 色情 12 | 13 | # violence 14 | 15 | 暴力 16 | 17 | # political 18 | 19 | 政治 20 | 21 | # cult 22 | 23 | 邪教 24 | 25 | # ad 26 | 27 | 广告 -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/constant/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | *

project: sensitive-word-package-info

3 | *

create on 2020/1/7 22:46

4 | * 5 | * @author Administrator 6 | * @since 1.0.0 7 | */ 8 | package com.github.houbb.sensitive.word.constant; -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/api/ISensitiveWordDestroy.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.api; 2 | 3 | public interface ISensitiveWordDestroy { 4 | 5 | /** 6 | * 资源的销毁 7 | * @since 0.16.0 8 | */ 9 | void destroy(); 10 | 11 | } 12 | -------------------------------------------------------------------------------- /doc/issues/v2-基本敏感词的简化.md: -------------------------------------------------------------------------------- 1 | # 简化部分信息 2 | 3 | ## 纯数字 4 | 5 | 移除 6 | 7 | ## 去重 8 | 9 | 移除大量重复的信息。 10 | 11 | 提取出关键的敏感词语即可。 12 | 13 | ## 包含 stop-word 的信息 14 | 15 | 移除 stop-word 之后进行相关的处理。 16 | 17 | ## 数字 18 | 19 | 0123456789 20 | 21 | 对应的任意写法。 22 | 23 | https://github.com/toolgood 思想值得借鉴。 -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/api/IWordAllow.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.api; 2 | 3 | import java.util.List; 4 | 5 | /** 6 | * 允许的内容-返回的内容不被当做敏感词 7 | * @author binbin.hou 8 | * @since 0.0.13 9 | */ 10 | public interface IWordAllow { 11 | 12 | /** 13 | * 获取结果 14 | * @return 结果 15 | * @since 0.0.13 16 | */ 17 | List allow(); 18 | 19 | } 20 | -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/api/IWordDeny.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.api; 2 | 3 | import java.util.List; 4 | 5 | /** 6 | * 拒绝出现的数据-返回的内容被当做是敏感词 7 | * @author binbin.hou 8 | * @since 0.0.13 9 | */ 10 | public interface IWordDeny { 11 | 12 | /** 13 | * 获取结果 14 
| * @return 结果 15 | * @since 0.0.13 16 | */ 17 | List deny(); 18 | 19 | } 20 | -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/api/IWordTag.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.api; 2 | 3 | import java.util.Set; 4 | 5 | /** 6 | * 获取脏词的标签,便于分类 7 | * 8 | * @author dh 9 | * @since 0.10.0 10 | */ 11 | public interface IWordTag { 12 | 13 | /** 14 | * 查询标签列表 15 | * @param word 脏词 16 | * @return 结果 17 | */ 18 | Set getTag(final String word); 19 | 20 | } 21 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # maven ignore 2 | target/ 3 | *.jar 4 | *.war 5 | *.zip 6 | *.tar 7 | *.tar.gz 8 | 9 | # eclipse ignore 10 | .settings/ 11 | .project 12 | .classpath 13 | 14 | # idea ignore 15 | .idea/ 16 | *.ipr 17 | *.iml 18 | *.iws 19 | 20 | # temp ignore 21 | *.log 22 | *.cache 23 | *.diff 24 | *.patch 25 | *.tmp 26 | *.java~ 27 | *.properties~ 28 | *.xml~ 29 | 30 | # system ignore 31 | .DS_Store 32 | Thumbs.db 33 | 34 | 35 | *.jfr -------------------------------------------------------------------------------- /src/test/java/com/github/houbb/sensitive/word/bugs/b20211211/MyWordAllow.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.bugs.b20211211; 2 | 3 | import com.github.houbb.sensitive.word.api.IWordAllow; 4 | 5 | import java.util.Arrays; 6 | import java.util.List; 7 | 8 | public class MyWordAllow implements IWordAllow { 9 | 10 | @Override 11 | public List allow() { 12 | return Arrays.asList("五星红旗"); 13 | } 14 | 15 | } 16 | -------------------------------------------------------------------------------- /src/test/java/com/github/houbb/sensitive/word/bugs/b20211211/MyWordDeny.java: 
-------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.bugs.b20211211; 2 | 3 | import com.github.houbb.sensitive.word.api.IWordDeny; 4 | 5 | import java.util.Arrays; 6 | import java.util.List; 7 | 8 | public class MyWordDeny implements IWordDeny { 9 | 10 | @Override 11 | public List deny() { 12 | return Arrays.asList("尼玛"); 13 | } 14 | 15 | 16 | } 17 | -------------------------------------------------------------------------------- /src/test/java/com/github/houbb/sensitive/word/bugs/b32/MyWordDenyChineseNum.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.bugs.b32; 2 | 3 | import com.github.houbb.sensitive.word.api.IWordDeny; 4 | 5 | import java.util.Arrays; 6 | import java.util.List; 7 | 8 | public class MyWordDenyChineseNum implements IWordDeny { 9 | 10 | @Override 11 | public List deny() { 12 | return Arrays.asList("三三九乘元功", "一军两策"); 13 | } 14 | 15 | } 16 | -------------------------------------------------------------------------------- /doc/issues/roadmap/v014-声近字的处理.md: -------------------------------------------------------------------------------- 1 | 主要是两个引用: 2 | 3 | (1)中文生成拼音 4 | 5 | 例如国骂的各种简写,拼音。 6 | 7 | 感觉比较合适 8 | 9 | (2)数字 10 | 11 | 对于数字,除却象形,最常用的就是谐音。 12 | 13 | ## 不可变性 14 | 15 | 这个涉及到拼音的 DFA 树构建,可能需要 wordMap 提供一个添加的接口。 16 | 17 | 这个需要在初始化的时候,直接指定。而且不可变化。 18 | 19 | # 数据的来源 20 | 21 | ## dict 在变为数字之前 22 | 23 | 全部变为 dict_pinyin.txt 保存一份。 24 | 25 | 现在的转化为数字的,也生成一份拼音。 26 | 27 | 然后将二者进行合并。 28 | 29 | ## 拼音的处理 30 | 31 | 拼音的处理只是形声字。 32 | 33 | 还可以有象形字,所以第一份包含中文写法的字段很重要。 -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/constant/enums/WordContainsTypeEnum.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.constant.enums; 2 | 3 | /** 4 | * 单词包含类别 5 | * @since 
0.4.0 6 | */ 7 | public enum WordContainsTypeEnum { 8 | 9 | /** 10 | * 包含+前缀 11 | */ 12 | CONTAINS_PREFIX, 13 | 14 | /** 15 | * 包含+且是结尾 16 | */ 17 | CONTAINS_END, 18 | 19 | /** 20 | * 不存在 21 | */ 22 | NOT_FOUND, 23 | 24 | } 25 | -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/support/warmup/WordWarmUps.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.support.warmup; 2 | 3 | import com.github.houbb.sensitive.word.api.IWordWarmUp; 4 | import com.github.houbb.sensitive.word.bs.SensitiveWordBs; 5 | 6 | /** 7 | * 预热策略 8 | * @since 0.29.0 9 | */ 10 | public final class WordWarmUps { 11 | 12 | public static IWordWarmUp defaults() { 13 | return new WordWarmUpDefault(); 14 | } 15 | 16 | } 17 | -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/support/tag/NoneWordTag.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.support.tag; 2 | 3 | import java.util.Collections; 4 | import java.util.Set; 5 | 6 | /** 7 | * 空标签 8 | * 9 | * word tag1,tag2 10 | * @since 0.10.0 11 | */ 12 | public class NoneWordTag extends AbstractWordTag { 13 | 14 | @Override 15 | protected Set doGetTag(String word) { 16 | return Collections.emptySet(); 17 | } 18 | 19 | } 20 | -------------------------------------------------------------------------------- /src/test/java/com/github/houbb/sensitive/word/bugs/b31/BugWeixieTest.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.bugs.b31; 2 | 3 | import com.github.houbb.sensitive.word.core.SensitiveWordHelper; 4 | import org.junit.Assert; 5 | import org.junit.Test; 6 | 7 | public class BugWeixieTest { 8 | 9 | @Test 10 | public void lettersTest() { 11 | String text = 
"我受到了威胁救救我"; 12 | 13 | System.out.println(SensitiveWordHelper.findAll(text)); 14 | } 15 | 16 | } 17 | -------------------------------------------------------------------------------- /src/test/java/com/github/houbb/sensitive/word/define/MyWordAllow.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.define; 2 | 3 | import com.github.houbb.sensitive.word.api.IWordAllow; 4 | 5 | import java.util.Arrays; 6 | import java.util.List; 7 | 8 | /** 9 | * @author binbin.hou 10 | * @since 0.0.14 11 | */ 12 | public class MyWordAllow implements IWordAllow { 13 | 14 | @Override 15 | public List allow() { 16 | return Arrays.asList("测试"); 17 | } 18 | 19 | } 20 | -------------------------------------------------------------------------------- /src/test/java/com/github/houbb/sensitive/word/define/MyWordDeny.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.define; 2 | 3 | import com.github.houbb.sensitive.word.api.IWordDeny; 4 | 5 | import java.util.Arrays; 6 | import java.util.List; 7 | 8 | /** 9 | * @author binbin.hou 10 | * @since 0.0.14 11 | */ 12 | public class MyWordDeny implements IWordDeny { 13 | 14 | @Override 15 | public List deny() { 16 | return Arrays.asList("我的自定义敏感词"); 17 | } 18 | 19 | } 20 | -------------------------------------------------------------------------------- /src/test/java/com/github/houbb/sensitive/word/spring/annotation/Component.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.spring.annotation; 2 | 3 | import java.lang.annotation.ElementType; 4 | import java.lang.annotation.Retention; 5 | import java.lang.annotation.RetentionPolicy; 6 | import java.lang.annotation.Target; 7 | 8 | /** 9 | * @author binbin.hou 10 | * @since 1.0.0 11 | */ 12 | @Target(ElementType.TYPE) 13 | @Retention(RetentionPolicy.RUNTIME) 14 | 
public @interface Component { 15 | } 16 | -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/support/combine/check/WordCheckCombines.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.support.combine.check; 2 | 3 | import com.github.houbb.sensitive.word.api.combine.IWordCheckCombine; 4 | 5 | /** 6 | * @author d 7 | * @since 1.0.0 8 | */ 9 | public final class WordCheckCombines { 10 | 11 | private WordCheckCombines(){} 12 | 13 | public static IWordCheckCombine defaults() { 14 | return new WordCheckCombine(); 15 | } 16 | 17 | } 18 | -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/core/SensitiveWords.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.core; 2 | 3 | import com.github.houbb.sensitive.word.api.ISensitiveWord; 4 | 5 | /** 6 | * 策略工具类 7 | * @since 0.3.2 8 | */ 9 | public final class SensitiveWords { 10 | 11 | private SensitiveWords(){} 12 | 13 | /** 14 | * 默认策略 15 | * @return 策略 16 | */ 17 | public static ISensitiveWord defaults() { 18 | return SensitiveWord.getInstance(); 19 | } 20 | 21 | } 22 | -------------------------------------------------------------------------------- /src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsDestroyTest.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.bs; 2 | 3 | import org.junit.Test; 4 | 5 | /** 6 | * 资源的销毁 7 | * 8 | * @since 0.16.0 9 | */ 10 | public class SensitiveWordBsDestroyTest { 11 | 12 | @Test 13 | public void destroyTest() { 14 | SensitiveWordBs wordBs = SensitiveWordBs.newInstance() 15 | .init(); 16 | // 后续因为一些原因移除了对应信息,希望释放内存。 17 | wordBs.destroy(); 18 | } 19 | 20 | } 21 | 
-------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/support/format/mapping/WordFormatTexts.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.support.format.mapping; 2 | 3 | import com.github.houbb.sensitive.word.api.IWordFormatText; 4 | 5 | /** 6 | * 格式化工具类 7 | * @author binbin.hou 8 | * @since 0.28.0 9 | */ 10 | public final class WordFormatTexts { 11 | 12 | private WordFormatTexts(){} 13 | 14 | public static IWordFormatText defaults() { 15 | return new WordFormatTextDefault(); 16 | } 17 | 18 | } 19 | -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/support/combine/allowdeny/WordAllowDenyCombines.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.support.combine.allowdeny; 2 | 3 | import com.github.houbb.sensitive.word.api.combine.IWordAllowDenyCombine; 4 | 5 | /** 6 | * @author d 7 | * @since 1.0.0 8 | */ 9 | public final class WordAllowDenyCombines { 10 | 11 | private WordAllowDenyCombines(){} 12 | 13 | public static IWordAllowDenyCombine defaults() { 14 | return new WordAllowDenyCombine(); 15 | } 16 | 17 | } 18 | -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/constant/enums/WordValidModeEnum.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.constant.enums; 2 | 3 | /** 4 | *

project: sensitive-word-ValidModeEnum

5 | *

create on 2020/1/7 22:46

6 | * 7 | * @author Administrator 8 | * @since 0.0.1 9 | */ 10 | public enum WordValidModeEnum { 11 | 12 | /** 13 | * 快速失败 14 | * @since 0.0.1 15 | */ 16 | FAIL_FAST, 17 | 18 | /** 19 | * 全部遍历 20 | * @since 0.0.1 21 | */ 22 | FAIL_OVER 23 | } 24 | -------------------------------------------------------------------------------- /doc/issues/roadmap/v011-邮箱网址Regex检测实现.md: -------------------------------------------------------------------------------- 1 | # 是否为邮箱 check 2 | 3 | 4 | 暂时先使用基本的正则表达式, 5 | 6 | ================== 7 | 8 | 网址等等 9 | 10 | URL 初期可以不做。 11 | 12 | Image-URL 检测,避免替换错误。 13 | 14 | 如果 image-url 中包含数字,直接替换就会导致问题。 15 | 16 | 针对不同的信息脱敏,则需要知道对应的检测代码是什么。 17 | 18 | jpg 19 | png 20 | jpeg 21 | gif 22 | 23 | ## 是否脱敏的配置 24 | 25 | - 敏感词 √ 26 | 27 | - url × 28 | 29 | - 数字 √ 30 | 31 | # 是否为 URL check 32 | 33 | 可以直接开辟另一道验证方式。 34 | 35 | 直接 regex+全文检索实现。 36 | 37 | # 前提 38 | 39 | 首先实现 Regex 40 | 41 | 这里也可以支持 allow_regex/deny_regex 42 | 43 | -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/support/ignore/NoneSensitiveWordCharIgnore.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.support.ignore; 2 | 3 | import com.github.houbb.sensitive.word.api.context.InnerSensitiveWordContext; 4 | 5 | /** 6 | * 特殊字符忽略 7 | * @since 0.11.0 8 | */ 9 | public class NoneSensitiveWordCharIgnore extends AbstractSensitiveWordCharIgnore { 10 | 11 | @Override 12 | protected boolean doIgnore(int ix, String text, InnerSensitiveWordContext innerContext) { 13 | return false; 14 | } 15 | 16 | } 17 | -------------------------------------------------------------------------------- /src/test/java/com/github/houbb/sensitive/word/bugs/b29/Bug29Test.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.bugs.b29; 2 | 3 | import 
com.github.houbb.sensitive.word.core.SensitiveWordHelper; 4 | import org.junit.Assert; 5 | import org.junit.Test; 6 | 7 | public class Bug29Test { 8 | 9 | @Test 10 | public void test() { 11 | String text = "生日快乐"; 12 | 13 | Assert.assertFalse(SensitiveWordHelper.contains(text)); 14 | Assert.assertEquals("[]", SensitiveWordHelper.findAll(text).toString()); 15 | } 16 | 17 | } 18 | -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/support/combine/format/WordFormatCombines.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.support.combine.format; 2 | 3 | import com.github.houbb.sensitive.word.api.combine.IWordFormatCombine; 4 | 5 | /** 6 | * @author d 7 | * @since 0.8.0 8 | */ 9 | public final class WordFormatCombines { 10 | 11 | /** 12 | * 默认策略 13 | * @return 策略 14 | * @since 0.8.0 15 | */ 16 | public static IWordFormatCombine defaults() { 17 | return new WordFormatCombine(); 18 | } 19 | 20 | } 21 | -------------------------------------------------------------------------------- /src/test/java/com/github/houbb/sensitive/word/bugs/b31/Bug31Test.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.bugs.b31; 2 | 3 | import com.github.houbb.sensitive.word.core.SensitiveWordHelper; 4 | import org.junit.Assert; 5 | import org.junit.Test; 6 | 7 | public class Bug31Test { 8 | 9 | @Test 10 | public void lettersTest() { 11 | String text = "你是SB吧"; 12 | 13 | Assert.assertTrue(SensitiveWordHelper.contains(text)); 14 | Assert.assertEquals("[SB]", SensitiveWordHelper.findAll(text).toString()); 15 | } 16 | 17 | } 18 | -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/api/IWordFormatText.java: -------------------------------------------------------------------------------- 1 
| package com.github.houbb.sensitive.word.api; 2 | 3 | import java.util.Map; 4 | 5 | /** 6 | * 单词整体格式化 7 | * 8 | * @author binbin.hou 9 | * @since 0.28.0 10 | */ 11 | public interface IWordFormatText { 12 | 13 | /** 14 | * 针对 text 格式化映射,提升对整体的控制力 15 | * 16 | * @param text 原始 文本 17 | * @param context 上下文 18 | * @return 格式化后的 char 19 | * @since 0.28.0 20 | */ 21 | Map format(final String text, final IWordContext context); 22 | 23 | } 24 | -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/api/combine/IWordCheckCombine.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.api.combine; 2 | 3 | import com.github.houbb.sensitive.word.api.IWordCheck; 4 | import com.github.houbb.sensitive.word.api.IWordContext; 5 | 6 | /** 7 | * @author d 8 | * @since 0.8.0 9 | */ 10 | public interface IWordCheckCombine { 11 | 12 | /** 13 | * 初始化敏感检测策略 14 | * @param context 上下文 15 | * 16 | * @return 实现 17 | * @since 0.8.0 18 | */ 19 | IWordCheck initWordCheck(final IWordContext context); 20 | 21 | } 22 | -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/api/combine/IWordFormatCombine.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.api.combine; 2 | 3 | import com.github.houbb.sensitive.word.api.IWordFormat; 4 | import com.github.houbb.sensitive.word.api.IWordContext; 5 | 6 | /** 7 | * @author d 8 | * @since 0.8.0 9 | */ 10 | public interface IWordFormatCombine { 11 | 12 | /** 13 | * 初始化 charFormat 14 | * @param context 上下文 15 | * @return 结果 16 | * @since 0.8.0 17 | */ 18 | IWordFormat initWordFormat(final IWordContext context); 19 | 20 | } 21 | -------------------------------------------------------------------------------- 
/src/main/java/com/github/houbb/sensitive/word/api/IWordFormat.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.api; 2 | 3 | /** 4 | * 单词格式化 5 | * (1)忽略大小写 6 | * (2)忽略全角半角 7 | * (3)忽略停顿词 8 | * (4)忽略数字转换。 9 | * 10 | * @author binbin.hou 11 | * @since 0.0.5 12 | */ 13 | public interface IWordFormat { 14 | 15 | /** 16 | * 针对 char 格式化 17 | * @param original 原始 char 18 | * @param context 上下文 19 | * @return 格式化后的 char 20 | * @since 0.0.5 21 | */ 22 | char format(final char original, 23 | final IWordContext context); 24 | 25 | } 26 | -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/api/IWordResultHandler.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.api; 2 | 3 | /** 4 | * 敏感词的结果处理 5 | * @author binbin.hou 6 | * @since 0.1.0 7 | */ 8 | public interface IWordResultHandler { 9 | 10 | /** 11 | * 对于结果的处理 12 | * @param wordResult 结果 13 | * @param wordContext 上下文 14 | * @param originalText 原始文本 15 | * @return 处理结果 16 | * @since 0.1.0 17 | */ 18 | R handle(final IWordResult wordResult, 19 | final IWordContext wordContext, 20 | final String originalText); 21 | 22 | } 23 | -------------------------------------------------------------------------------- /src/test/java/com/github/houbb/sensitive/word/bugs/b55/Bug55Test.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.bugs.b55; 2 | 3 | import com.github.houbb.sensitive.word.bs.SensitiveWordBs; 4 | import org.junit.Assert; 5 | import org.junit.Test; 6 | 7 | public class Bug55Test { 8 | 9 | @Test 10 | public void test() { 11 | SensitiveWordBs sensitiveWordBs = SensitiveWordBs.newInstance() 12 | .init(); 13 | final String text = "以个人账户或现金收取资金、现场或即时交付本金即给予部分提成、分红、利息;"; 14 | Assert.assertEquals("[]", 
sensitiveWordBs.findAll(text).toString()); 15 | } 16 | 17 | } 18 | -------------------------------------------------------------------------------- /src/test/java/com/github/houbb/sensitive/word/spring/database/MyDdWordAllow.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.spring.database; 2 | 3 | import com.github.houbb.sensitive.word.api.IWordAllow; 4 | import com.github.houbb.sensitive.word.spring.annotation.Component; 5 | 6 | import java.util.Arrays; 7 | import java.util.List; 8 | 9 | /** 10 | * @author binbin.hou 11 | * @since 1.0.0 12 | */ 13 | @Component 14 | public class MyDdWordAllow implements IWordAllow { 15 | 16 | @Override 17 | public List allow() { 18 | // 数据库查询 19 | return Arrays.asList("学习"); 20 | } 21 | 22 | } 23 | -------------------------------------------------------------------------------- /src/test/java/com/github/houbb/sensitive/word/spring/database/MyDdWordDeny.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.spring.database; 2 | 3 | import com.github.houbb.sensitive.word.api.IWordDeny; 4 | import com.github.houbb.sensitive.word.spring.annotation.Component; 5 | 6 | import java.util.Arrays; 7 | import java.util.List; 8 | 9 | /** 10 | * @author binbin.hou 11 | * @since 1.0.0 12 | */ 13 | @Component 14 | public class MyDdWordDeny implements IWordDeny { 15 | 16 | @Override 17 | public List deny() { 18 | // 数据库返回的各种信息 19 | return Arrays.asList("广告"); 20 | } 21 | 22 | } 23 | -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/api/ISensitiveWordCharIgnore.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.api; 2 | 3 | import com.github.houbb.sensitive.word.api.context.InnerSensitiveWordContext; 4 | 5 | /** 6 | * 是否忽略某一个字符 7 | * 
@since 0.11.0 8 | */ 9 | public interface ISensitiveWordCharIgnore { 10 | 11 | /** 12 | * 是否忽略当前字符 13 | * @param ix 下标志 14 | * @param text 字符串 15 | * @param innerContext 上下文 16 | * @return 结果 17 | */ 18 | boolean ignore(final int ix, 19 | final String text, 20 | InnerSensitiveWordContext innerContext); 21 | 22 | } 23 | -------------------------------------------------------------------------------- /src/test/java/com/github/houbb/sensitive/word/spring/annotation/Autowired.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.spring.annotation; 2 | 3 | import java.lang.annotation.ElementType; 4 | import java.lang.annotation.Retention; 5 | import java.lang.annotation.RetentionPolicy; 6 | import java.lang.annotation.Target; 7 | 8 | /** 9 | * @author binbin.hou 10 | * @since 1.0.0 11 | */ 12 | @Target({ElementType.CONSTRUCTOR, ElementType.METHOD, ElementType.PARAMETER, ElementType.FIELD, ElementType.ANNOTATION_TYPE}) 13 | @Retention(RetentionPolicy.RUNTIME) 14 | public @interface Autowired { 15 | 16 | boolean required() default true; 17 | 18 | } 19 | -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/api/IWordReplace.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.api; 2 | 3 | /** 4 | * 敏感词替换策略 5 | * 6 | * @author binbin.hou 7 | * @since 0.2.0 8 | */ 9 | public interface IWordReplace { 10 | 11 | /** 12 | * 替换 13 | *

14 | * 説明:废弃以前的字符串返回,减少对象创建,提升性能。 15 | * 16 | * @param stringBuilder 字符串连接器 17 | * @param rawText 原始字符串 18 | * @param wordResult 当前的敏感词结果 19 | * @param wordContext 上下文 20 | * @since 0.4.0 21 | */ 22 | void replace(final StringBuilder stringBuilder, final String rawText, final IWordResult wordResult, final IWordContext wordContext); 23 | 24 | } 25 | -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/support/deny/WordDenyEmpty.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.support.deny; 2 | 3 | import com.github.houbb.heaven.annotation.ThreadSafe; 4 | import com.github.houbb.heaven.util.io.StreamUtil; 5 | import com.github.houbb.sensitive.word.api.IWordDeny; 6 | 7 | import java.util.ArrayList; 8 | import java.util.Collections; 9 | import java.util.List; 10 | 11 | /** 12 | * 空实现 13 | * @author binbin.hou 14 | * @since 0.19.0 15 | */ 16 | @ThreadSafe 17 | public class WordDenyEmpty implements IWordDeny { 18 | 19 | @Override 20 | public List deny() { 21 | return new ArrayList<>(); 22 | } 23 | 24 | } 25 | -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/support/allow/WordAllowEmpty.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.support.allow; 2 | 3 | import com.github.houbb.heaven.annotation.ThreadSafe; 4 | import com.github.houbb.heaven.util.io.StreamUtil; 5 | import com.github.houbb.sensitive.word.api.IWordAllow; 6 | 7 | import java.util.ArrayList; 8 | import java.util.Collections; 9 | import java.util.List; 10 | 11 | /** 12 | * 空列表 13 | * @author binbin.hou 14 | * @since 0.19.0 15 | */ 16 | @ThreadSafe 17 | public class WordAllowEmpty implements IWordAllow { 18 | 19 | @Override 20 | public List allow() { 21 | return new ArrayList<>(); 22 | } 23 | 24 
| } 25 | -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/support/ignore/SensitiveWordCharIgnores.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.support.ignore; 2 | 3 | import com.github.houbb.sensitive.word.api.ISensitiveWordCharIgnore; 4 | 5 | /** 6 | * @since 0.11.0 7 | */ 8 | public class SensitiveWordCharIgnores { 9 | 10 | public static ISensitiveWordCharIgnore specialChars() { 11 | return new SpecialCharSensitiveWordCharIgnore(); 12 | } 13 | 14 | public static ISensitiveWordCharIgnore none() { 15 | return new NoneSensitiveWordCharIgnore(); 16 | } 17 | 18 | public static ISensitiveWordCharIgnore defaults() { 19 | return none(); 20 | } 21 | 22 | } 23 | -------------------------------------------------------------------------------- /src/test/java/com/github/houbb/sensitive/word/spring/annotation/Bean.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.spring.annotation; 2 | 3 | import java.lang.annotation.ElementType; 4 | import java.lang.annotation.Retention; 5 | import java.lang.annotation.RetentionPolicy; 6 | import java.lang.annotation.Target; 7 | 8 | /** 9 | * @author binbin.hou 10 | * @since 1.0.0 11 | */ 12 | @Target({ElementType.METHOD, ElementType.ANNOTATION_TYPE}) 13 | @Retention(RetentionPolicy.RUNTIME) 14 | public @interface Bean { 15 | 16 | String[] value() default {}; 17 | 18 | String[] name() default {}; 19 | 20 | String initMethod() default ""; 21 | 22 | String destroyMethod() default ""; 23 | 24 | } 25 | -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/support/resultcondition/WordResultConditionAlwaysTrue.java: -------------------------------------------------------------------------------- 1 | package 
com.github.houbb.sensitive.word.support.resultcondition; 2 | 3 | import com.github.houbb.sensitive.word.api.IWordContext; 4 | import com.github.houbb.sensitive.word.api.IWordResult; 5 | import com.github.houbb.sensitive.word.constant.enums.WordValidModeEnum; 6 | 7 | /** 8 | * 恒为真 9 | * 10 | * @since 0.13.0 11 | */ 12 | public class WordResultConditionAlwaysTrue extends AbstractWordResultCondition { 13 | 14 | @Override 15 | protected boolean doMatch(IWordResult wordResult, String text, WordValidModeEnum modeEnum, IWordContext context) { 16 | return true; 17 | } 18 | 19 | } 20 | -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/support/data/WordDatas.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.support.data; 2 | 3 | import com.github.houbb.sensitive.word.api.IWordData; 4 | 5 | /** 6 | * 敏感词 map 7 | * 8 | * @author binbin.hou 9 | * @since 0.3.0 10 | */ 11 | public final class WordDatas { 12 | 13 | private WordDatas(){} 14 | 15 | /** 16 | * 默认策略 17 | * @return 策略 18 | * @since 0.3.0 19 | */ 20 | public static IWordData defaults() { 21 | return tree(); 22 | } 23 | 24 | /** 25 | * 树模式 26 | * @return 树 27 | * @since 0.7.0 28 | */ 29 | public static IWordData tree() { 30 | return new WordDataTree(); 31 | } 32 | 33 | } 34 | -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/api/IWordResult.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.api; 2 | 3 | /** 4 | * 敏感词的结果 5 | * @author binbin.hou 6 | * @since 0.1.0 7 | */ 8 | public interface IWordResult { 9 | 10 | /** 11 | * 开始下标 12 | * @return 开始下标 13 | * @since 0.1.0 14 | */ 15 | int startIndex(); 16 | 17 | /** 18 | * 结束下标 19 | * @return 结束下标 20 | * @since 0.1.0 21 | */ 22 | int endIndex(); 23 | 24 | /** 25 | 
* 类别 26 | * @return 类别 27 | * @since 0.14.0 28 | */ 29 | String type(); 30 | 31 | /** 32 | * 实际匹配的单词,方便统一的标签等处理,实际问题排查等 33 | * @return 结果 34 | * @since 0.25.1 35 | */ 36 | String word(); 37 | 38 | } 39 | -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/api/IWordWarmUp.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.api; 2 | 3 | import com.github.houbb.sensitive.word.bs.SensitiveWordBs; 4 | 5 | import java.util.List; 6 | 7 | /** 8 | * 提前预热,触发类加载、JIT 优化等 9 | * @author binbin.hou 10 | * @since 0.29.0 11 | */ 12 | public interface IWordWarmUp { 13 | 14 | /** 15 | * 预热 16 | * @param sensitiveWordBs 引导类本身 17 | * @param wordContext 上下文 18 | * @param wordAllowList 允许 19 | * @param wordDenyList 禁止 20 | */ 21 | void warmUp(final SensitiveWordBs sensitiveWordBs, 22 | final IWordContext wordContext, 23 | final List wordAllowList, 24 | final List wordDenyList); 25 | 26 | } 27 | -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/support/ignore/AbstractSensitiveWordCharIgnore.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.support.ignore; 2 | 3 | import com.github.houbb.sensitive.word.api.ISensitiveWordCharIgnore; 4 | import com.github.houbb.sensitive.word.api.context.InnerSensitiveWordContext; 5 | 6 | /** 7 | * 抽象实现 8 | * @since 0.11.0 9 | */ 10 | public abstract class AbstractSensitiveWordCharIgnore implements ISensitiveWordCharIgnore { 11 | 12 | protected abstract boolean doIgnore(int ix, String text, InnerSensitiveWordContext innerContext); 13 | 14 | @Override 15 | public boolean ignore(int ix, String text, InnerSensitiveWordContext innerContext) { 16 | return doIgnore(ix, text, innerContext); 17 | } 18 | 19 | } 20 | 
-------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/api/IWordResultCondition.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.api; 2 | 3 | import com.github.houbb.sensitive.word.constant.enums.WordValidModeEnum; 4 | 5 | /** 6 | * 敏感词的结果是否匹配 7 | * @author binbin.hou 8 | * @since 0.13.0 9 | */ 10 | public interface IWordResultCondition { 11 | 12 | /** 13 | * 是否匹配 14 | * @param wordResult 根据词匹配的结果 15 | * @param text 原始文本 16 | * @param modeEnum 枚举类别 17 | * @param context 上下文 18 | * @return 是否匹配 19 | * @since 0.13.0 20 | */ 21 | boolean match(final IWordResult wordResult, 22 | final String text, 23 | final WordValidModeEnum modeEnum, 24 | final IWordContext context); 25 | 26 | } 27 | -------------------------------------------------------------------------------- /doc/发布流程.md: -------------------------------------------------------------------------------- 1 | # push to mvn center 2 | 3 | 确认版本为 release 4 | 5 | ``` 6 | mvn clean deploy -P release 7 | ``` 8 | 9 | # commit to github 10 | 11 | ``` 12 | git push 13 | ``` 14 | 15 | # merge to master 16 | 17 | ``` 18 | git checkout master 19 | git pull 20 | git checkout branch 21 | git rebase master (用rebase合并主干的修改,如果有冲突在此时解决) 22 | git checkout master 23 | git merge branch 24 | git push 25 | ``` 26 | 27 | # create new branch & checkout 28 | 29 | ``` 30 | git branch release_XXX 31 | git checkout release_XXX 32 | ``` 33 | 34 | # modify project version 35 | 36 | ``` 37 | mvn versions:set -DgroupId=com.github.houbb -DartifactId=paradise* -DoldVersion=1.1.1 -DnewVersion=1.1.2-SNAPSHOT 38 | mvn -N versions:update-child-modules 39 | mvn versions:commit 40 | ``` 41 | 42 | -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/constant/enums/WordTypeEnum.java: 
-------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.constant.enums; 2 | 3 | /** 4 | * 单词类别包含类别 5 | * @since 0.14.0 6 | */ 7 | public enum WordTypeEnum { 8 | WORD("WORD", "敏感词"), 9 | EMAIL("EMAIL", "邮箱"), 10 | URL("URL", "链接"), 11 | NUM("NUM", "数字"), 12 | IPV4("IPV4", "IPv4"), 13 | 14 | DEFAULTS("DEFAULTS", "默认"), 15 | ; 16 | 17 | private final String code; 18 | private final String desc; 19 | 20 | WordTypeEnum(String code, String desc) { 21 | this.code = code; 22 | this.desc = desc; 23 | } 24 | 25 | public String getCode() { 26 | return code; 27 | } 28 | 29 | public String getDesc() { 30 | return desc; 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/support/format/WordFormatNone.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.support.format; 2 | 3 | import com.github.houbb.heaven.annotation.ThreadSafe; 4 | import com.github.houbb.sensitive.word.api.IWordFormat; 5 | import com.github.houbb.sensitive.word.api.IWordContext; 6 | 7 | /** 8 | * 无处理 9 | * 10 | * @author binbin.hou 11 | * @since 0.0.5 12 | */ 13 | @ThreadSafe 14 | public class WordFormatNone implements IWordFormat { 15 | 16 | private static final IWordFormat INSTANCE = new WordFormatNone(); 17 | 18 | public static IWordFormat getInstance() { 19 | return INSTANCE; 20 | } 21 | 22 | @Override 23 | public char format(char original, IWordContext context) { 24 | return original; 25 | } 26 | 27 | } 28 | -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/api/combine/IWordAllowDenyCombine.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.api.combine; 2 | 3 | import 
com.github.houbb.sensitive.word.api.IWordContext; 4 | 5 | import java.util.Collection; 6 | import java.util.List; 7 | 8 | /** 9 | * @author d 10 | * @since 0.8.0 11 | */ 12 | public interface IWordAllowDenyCombine { 13 | 14 | /** 15 | * 获取最终的拒绝单词列表 16 | * @param allowList 允许 17 | * @param denyList 拒绝 18 | * @param context 上下文 19 | * @return 结果 20 | * @since 0.8.0 21 | */ 22 | Collection getActualDenyList(final List allowList, 23 | final List denyList, 24 | final IWordContext context); 25 | 26 | } 27 | -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/support/tag/WordTagMap.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.support.tag; 2 | 3 | import com.github.houbb.heaven.util.common.ArgUtil; 4 | 5 | import java.util.Map; 6 | import java.util.Set; 7 | 8 | /** 9 | * 根据 map 构建初始化 10 | * 11 | * key:单词 12 | * value: 标签 set 13 | * 14 | * @since 0.24.0 15 | */ 16 | public class WordTagMap extends AbstractWordTag { 17 | 18 | private final Map> wordTagMap; 19 | 20 | public WordTagMap(Map> wordTagMap) { 21 | ArgUtil.notNull(wordTagMap, "wordTagMap"); 22 | this.wordTagMap = wordTagMap; 23 | } 24 | 25 | @Override 26 | protected Set doGetTag(String word) { 27 | return wordTagMap.get(word); 28 | } 29 | 30 | } 31 | -------------------------------------------------------------------------------- /src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsChineseTest.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.bs; 2 | 3 | import org.junit.Assert; 4 | import org.junit.Test; 5 | 6 | import java.util.List; 7 | 8 | /** 9 | *

project: sensitive-word-SensitiveWordBsTest

10 | *

create on 2020/1/7 23:43

11 | * 12 | * @author Administrator 13 | * @since 0.0.6 14 | */ 15 | public class SensitiveWordBsChineseTest { 16 | 17 | /** 18 | * 忽略中文繁简体 19 | * @since 0.0.6 20 | */ 21 | @Test 22 | public void ignoreChineseStyleTest() { 23 | final String text = "我爱我的祖国和五星紅旗。"; 24 | 25 | List wordList = SensitiveWordBs.newInstance().init().findAll(text); 26 | Assert.assertEquals("[五星紅旗]", wordList.toString()); 27 | } 28 | 29 | } 30 | -------------------------------------------------------------------------------- /src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsEnglishTest.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.bs; 2 | 3 | import org.junit.Assert; 4 | import org.junit.Test; 5 | 6 | import java.util.List; 7 | 8 | /** 9 | *

project: sensitive-word-SensitiveWordBsTest

10 | *

create on 2020/1/7 23:43

11 | * 12 | * @author Administrator 13 | * @since 0.0.6 14 | */ 15 | public class SensitiveWordBsEnglishTest { 16 | 17 | /** 18 | * 忽略英文写法 19 | * @since 0.0.6 20 | */ 21 | @Test 22 | public void ignoreEnglishStyleTest() { 23 | final String text = "Ⓕⓤc⒦ the bad words"; 24 | 25 | List wordList = SensitiveWordBs.newInstance().init().findAll(text); 26 | Assert.assertEquals("[Ⓕⓤc⒦]", wordList.toString()); 27 | } 28 | 29 | } 30 | -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/support/tag/WordTagSystem.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.support.tag; 2 | 3 | import com.github.houbb.sensitive.word.api.IWordTag; 4 | import com.github.houbb.sensitive.word.utils.InnerStreamUtils; 5 | 6 | import java.util.List; 7 | import java.util.Set; 8 | 9 | /** 10 | * 系统内置策略,根据文件默认处理 11 | * 12 | * @since 0.24.0 13 | */ 14 | public class WordTagSystem extends AbstractWordTag { 15 | 16 | private final IWordTag wordTag; 17 | 18 | public WordTagSystem() { 19 | List lines = InnerStreamUtils.readAllLines("/sensitive_word_tags.txt"); 20 | this.wordTag = WordTags.lines(lines); 21 | } 22 | 23 | @Override 24 | protected Set doGetTag(String word) { 25 | return wordTag.getTag(word); 26 | } 27 | 28 | } 29 | -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/support/format/WordFormatIgnoreCase.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.support.format; 2 | 3 | import com.github.houbb.heaven.annotation.ThreadSafe; 4 | import com.github.houbb.sensitive.word.api.IWordFormat; 5 | import com.github.houbb.sensitive.word.api.IWordContext; 6 | 7 | /** 8 | * 忽略大小写 9 | * @author binbin.hou 10 | * @since 0.0.5 11 | */ 12 | @ThreadSafe 13 | public class WordFormatIgnoreCase 
implements IWordFormat { 14 | 15 | private static final IWordFormat INSTANCE = new WordFormatIgnoreCase(); 16 | 17 | public static IWordFormat getInstance() { 18 | return INSTANCE; 19 | } 20 | 21 | @Override 22 | public char format(char original, IWordContext context) { 23 | return Character.toLowerCase(original); 24 | } 25 | 26 | } 27 | -------------------------------------------------------------------------------- /src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsUserDefineTest.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.bs; 2 | 3 | import org.junit.Assert; 4 | import org.junit.Test; 5 | 6 | import java.util.List; 7 | 8 | /** 9 | *

project: sensitive-word-SensitiveWordBsTest

10 | *

create on 2020/1/7 23:43

11 | * 12 | * @author Administrator 13 | * @since 0.0.8 14 | */ 15 | public class SensitiveWordBsUserDefineTest { 16 | 17 | /** 18 | * 自定义允许和拒绝的文件 19 | * @since 0.0.8 20 | */ 21 | @Test 22 | public void allowAndDenyTest() { 23 | final String text = "gender 我们认为应该通过,自定义敏感词我们认为应该拒绝。"; 24 | 25 | List wordList = SensitiveWordBs.newInstance().init().findAll(text); 26 | Assert.assertEquals("[自定义敏感词]", wordList.toString()); 27 | } 28 | 29 | } 30 | -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/constant/WordConst.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.constant; 2 | 3 | /** 4 | *

project: sensitive-word-AppConst

5 | *

create on 2020/1/7 23:39

6 | * 7 | * @author Administrator 8 | * @since 0.0.1 9 | */ 10 | public final class WordConst { 11 | 12 | private WordConst(){} 13 | 14 | /** 15 | * 是否为结束标识 16 | * ps: 某种角度而言,我不是很喜欢这种风格。 17 | * (1)正常的 char 只會占用一個字符,这里直接给定两个字符即可,降低 Map 的容量。 18 | * @since 0.0.1 19 | */ 20 | public static final String IS_END = "ED"; 21 | 22 | /** 23 | * 最长的网址长度 24 | * @since 0.3.0 25 | */ 26 | public static final int MAX_WEB_SITE_LEN = 70; 27 | 28 | /** 29 | * 最大邮箱地址 30 | * @since 0.4.0 31 | */ 32 | public static final int MAX_EMAIL_LEN = 64; 33 | 34 | } 35 | -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/support/tag/AbstractWordTag.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.support.tag; 2 | 3 | import com.github.houbb.heaven.util.lang.StringUtil; 4 | import com.github.houbb.sensitive.word.api.IWordTag; 5 | 6 | import java.util.Collections; 7 | import java.util.Set; 8 | 9 | /** 10 | * 抽象的单词标签 11 | * 12 | * @since 0.10.0 13 | */ 14 | public abstract class AbstractWordTag implements IWordTag { 15 | 16 | 17 | /** 18 | * 获取标签 19 | * @param word 单词 20 | * @return 结果 21 | */ 22 | protected abstract Set doGetTag(String word); 23 | 24 | @Override 25 | public Set getTag(String word) { 26 | if(StringUtil.isEmpty(word)) { 27 | return Collections.emptySet(); 28 | } 29 | 30 | return doGetTag(word); 31 | } 32 | 33 | } 34 | -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/exception/SensitiveWordException.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.exception; 2 | 3 | /** 4 | * @author binbin.hou 5 | * @since 0.0.1 6 | */ 7 | public class SensitiveWordException extends RuntimeException { 8 | 9 | public SensitiveWordException() { 10 | } 11 | 12 | public 
SensitiveWordException(String message) { 13 | super(message); 14 | } 15 | 16 | public SensitiveWordException(String message, Throwable cause) { 17 | super(message, cause); 18 | } 19 | 20 | public SensitiveWordException(Throwable cause) { 21 | super(cause); 22 | } 23 | 24 | public SensitiveWordException(String message, Throwable cause, boolean enableSuppression, boolean writableStackTrace) { 25 | super(message, cause, enableSuppression, writableStackTrace); 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/support/check/WordCheckUrlNoPrefix.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.support.check; 2 | 3 | import com.github.houbb.heaven.util.util.regex.RegexUtil; 4 | import com.github.houbb.sensitive.word.api.IWordCheck; 5 | 6 | /** 7 | * (1)暂时先粗略的处理 web-site 8 | * (2)如果网址的最后为图片类型,则跳过。 9 | * (3)长度超过 70,直接结束。 10 | * 11 | * 不包含前缀的实现策略 12 | * 13 | * @author binbin.hou 14 | * @since 0.25.0 15 | */ 16 | public class WordCheckUrlNoPrefix extends WordCheckUrl { 17 | 18 | /** 19 | * @since 0.3.0 20 | */ 21 | private static final IWordCheck INSTANCE = new WordCheckUrlNoPrefix(); 22 | 23 | public static IWordCheck getInstance() { 24 | return INSTANCE; 25 | } 26 | 27 | @Override 28 | protected boolean isUrl(String text) { 29 | return RegexUtil.isWebSite(text); 30 | } 31 | 32 | } 33 | -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/support/format/WordFormatIgnoreWidth.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.support.format; 2 | 3 | import com.github.houbb.heaven.annotation.ThreadSafe; 4 | import com.github.houbb.sensitive.word.api.IWordContext; 5 | import com.github.houbb.sensitive.word.api.IWordFormat; 6 | import 
com.github.houbb.sensitive.word.utils.InnerCharUtils; 7 | 8 | /** 9 | * 格式化字宽度 10 | * @author binbin.hou 11 | * @since 0.0.5 12 | */ 13 | @ThreadSafe 14 | public class WordFormatIgnoreWidth implements IWordFormat { 15 | 16 | private static final IWordFormat INSTANCE = new WordFormatIgnoreWidth(); 17 | 18 | public static IWordFormat getInstance() { 19 | return INSTANCE; 20 | } 21 | 22 | @Override 23 | public char format(char original, IWordContext context) { 24 | return InnerCharUtils.toHalfWidth(original); 25 | } 26 | 27 | } 28 | -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/support/format/WordFormatIgnoreChineseStyle.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.support.format; 2 | 3 | import com.github.houbb.heaven.annotation.ThreadSafe; 4 | import com.github.houbb.opencc4j.util.ZhSlimUtil; 5 | import com.github.houbb.sensitive.word.api.IWordContext; 6 | import com.github.houbb.sensitive.word.api.IWordFormat; 7 | 8 | /** 9 | * 忽略中文样式 10 | * @author binbin.hou 11 | * @since 0.0.5 12 | */ 13 | @ThreadSafe 14 | public class WordFormatIgnoreChineseStyle implements IWordFormat { 15 | 16 | private static final IWordFormat INSTANCE = new WordFormatIgnoreChineseStyle(); 17 | 18 | public static IWordFormat getInstance() { 19 | return INSTANCE; 20 | } 21 | 22 | @Override 23 | public char format(char original, IWordContext context) { 24 | return ZhSlimUtil.toSimple(original); 25 | } 26 | 27 | } 28 | -------------------------------------------------------------------------------- /src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsRepeatTest.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.bs; 2 | 3 | import org.junit.Assert; 4 | import org.junit.Test; 5 | 6 | import java.util.List; 7 | 8 | /** 9 | *

project: sensitive-word-SensitiveWordBsTest

10 | *

create on 2020/1/7 23:43

11 | * 12 | * @author Administrator 13 | * @since 0.0.7 14 | */ 15 | public class SensitiveWordBsRepeatTest { 16 | 17 | /** 18 | * 忽略重复词 19 | * @since 0.0.7 20 | */ 21 | @Test 22 | public void ignoreChineseStyleTest() { 23 | final String text = "ⒻⒻⒻfⓤuⓤ⒰cⓒ⒦ the bad words"; 24 | 25 | List wordList = SensitiveWordBs.newInstance() 26 | .ignoreRepeat(true) 27 | .init() 28 | .findAll(text); 29 | Assert.assertEquals("[ⒻⒻⒻfⓤuⓤ⒰cⓒ⒦]", wordList.toString()); 30 | } 31 | 32 | } 33 | -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/support/allow/WordAllowSystem.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.support.allow; 2 | 3 | import com.github.houbb.heaven.annotation.ThreadSafe; 4 | import com.github.houbb.sensitive.word.api.IWordAllow; 5 | import com.github.houbb.sensitive.word.utils.InnerStreamUtils; 6 | 7 | import java.util.List; 8 | 9 | /** 10 | * 系统默认的信息 11 | * @author binbin.hou 12 | * @since 0.0.13 13 | */ 14 | @ThreadSafe 15 | public class WordAllowSystem implements IWordAllow { 16 | 17 | /** 18 | * @since 0.3.0 19 | */ 20 | private static final WordAllowSystem INSTANCE = new WordAllowSystem(); 21 | 22 | public static WordAllowSystem getInstance() { 23 | return INSTANCE; 24 | } 25 | 26 | @Override 27 | public List allow() { 28 | return InnerStreamUtils.readAllLines("/sensitive_word_allow.txt"); 29 | } 30 | 31 | } 32 | -------------------------------------------------------------------------------- /src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsDataTest.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.bs; 2 | 3 | import com.github.houbb.sensitive.word.support.data.WordDatas; 4 | import org.junit.Assert; 5 | import org.junit.Test; 6 | 7 | /** 8 | *

project: sensitive-word-SensitiveWordBsConfigTest

9 | *

create on 2020/1/7 23:43

10 | * 11 | * @author Administrator 12 | * @since 0.7.0 13 | */ 14 | public class SensitiveWordBsDataTest { 15 | 16 | @Test 17 | public void wordDataConfigTest() { 18 | SensitiveWordBs wordBs = SensitiveWordBs.newInstance() 19 | .wordData(WordDatas.tree()) 20 | .init(); 21 | 22 | final String text = "五星红旗迎风飘扬,毛主席的画像屹立在天安门前。"; 23 | Assert.assertTrue(wordBs.contains(text)); 24 | Assert.assertEquals("[五星红旗, 毛主席, 天安门]", wordBs.findAll(text).toString()); 25 | } 26 | 27 | } 28 | -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/support/combine/format/AbstractWordFormatCombine.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.support.combine.format; 2 | 3 | import com.github.houbb.sensitive.word.api.IWordFormat; 4 | import com.github.houbb.sensitive.word.api.IWordContext; 5 | import com.github.houbb.sensitive.word.api.combine.IWordFormatCombine; 6 | import com.github.houbb.sensitive.word.support.format.WordFormats; 7 | 8 | import java.util.List; 9 | 10 | /** 11 | * @author d 12 | * @since 0.8.0 13 | */ 14 | public abstract class AbstractWordFormatCombine implements IWordFormatCombine { 15 | 16 | protected abstract List getWordFormatList(IWordContext context); 17 | 18 | @Override 19 | public IWordFormat initWordFormat(IWordContext context) { 20 | List list = getWordFormatList(context); 21 | return WordFormats.chains(list); 22 | } 23 | 24 | } 25 | -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/support/combine/check/AbstractWordCheckCombine.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.support.combine.check; 2 | 3 | import com.github.houbb.sensitive.word.api.IWordCheck; 4 | import com.github.houbb.sensitive.word.api.IWordContext; 5 | import 
com.github.houbb.sensitive.word.api.combine.IWordCheckCombine; 6 | import com.github.houbb.sensitive.word.support.check.WordChecks; 7 | 8 | import java.util.List; 9 | 10 | /** 11 | * @author d 12 | * @since 0.8.0 13 | */ 14 | public abstract class AbstractWordCheckCombine implements IWordCheckCombine { 15 | 16 | protected abstract List getWordCheckList(IWordContext context); 17 | 18 | @Override 19 | public IWordCheck initWordCheck(IWordContext context) { 20 | List wordCheckList = getWordCheckList(context); 21 | 22 | return WordChecks.chains(wordCheckList); 23 | } 24 | 25 | } 26 | -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/support/ignore/SpecialCharSensitiveWordCharIgnore.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.support.ignore; 2 | 3 | import com.github.houbb.heaven.util.lang.StringUtil; 4 | import com.github.houbb.sensitive.word.api.context.InnerSensitiveWordContext; 5 | 6 | import java.util.Set; 7 | 8 | /** 9 | * 特殊字符忽略 10 | * @since 0.11.0 11 | */ 12 | public class SpecialCharSensitiveWordCharIgnore extends AbstractSensitiveWordCharIgnore { 13 | 14 | private static final String SPECIAL = "`-=~!@#$%^&*()_+[]{}\\|;:'\",./<>?"; 15 | 16 | private static final Set SET; 17 | 18 | static { 19 | SET = StringUtil.toCharSet(SPECIAL); 20 | } 21 | 22 | @Override 23 | protected boolean doIgnore(int ix, String text, InnerSensitiveWordContext innerContext) { 24 | char c = text.charAt(ix); 25 | return SET.contains(c); 26 | } 27 | 28 | } 29 | -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/support/result/AbstractWordResultHandler.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.support.result; 2 | 3 | import 
com.github.houbb.sensitive.word.api.IWordContext; 4 | import com.github.houbb.sensitive.word.api.IWordResult; 5 | import com.github.houbb.sensitive.word.api.IWordResultHandler; 6 | 7 | /** 8 | * 抽象的处理结果 9 | * 10 | * @since 0.12.0 11 | * @param 泛型 12 | */ 13 | public abstract class AbstractWordResultHandler implements IWordResultHandler { 14 | 15 | protected abstract R doHandle(IWordResult wordResult, IWordContext wordContext, String originalText); 16 | 17 | @Override 18 | public R handle(IWordResult wordResult, IWordContext wordContext, String originalText) { 19 | if(wordResult == null) { 20 | return null; 21 | } 22 | 23 | return doHandle(wordResult, wordContext, originalText); 24 | } 25 | 26 | } 27 | -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/support/result/WordTagsDto.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.support.result; 2 | 3 | import java.io.Serializable; 4 | import java.util.Set; 5 | 6 | /** 7 | * @since 0.12.0 8 | */ 9 | public class WordTagsDto implements Serializable { 10 | 11 | private String word; 12 | 13 | private Set tags; 14 | 15 | public String getWord() { 16 | return word; 17 | } 18 | 19 | public void setWord(String word) { 20 | this.word = word; 21 | } 22 | 23 | public Set getTags() { 24 | return tags; 25 | } 26 | 27 | public void setTags(Set tags) { 28 | this.tags = tags; 29 | } 30 | 31 | @Override 32 | public String toString() { 33 | return "WordTagsDto{" + 34 | "word='" + word + '\'' + 35 | ", tags=" + tags + 36 | '}'; 37 | } 38 | 39 | } 40 | -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/utils/InnerStreamUtils.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.utils; 2 | 3 | import 
com.github.houbb.heaven.util.io.StreamUtil; 4 | 5 | import java.io.IOException; 6 | import java.io.InputStream; 7 | import java.util.Collections; 8 | import java.util.List; 9 | 10 | /** 11 | * @since 0.27.1 12 | */ 13 | public class InnerStreamUtils { 14 | 15 | /** 16 | * 获取文件,兼容为空的场景 17 | * @param path 路径 18 | * @return 结果 19 | */ 20 | public static List readAllLines(String path) { 21 | try(InputStream inputStream = StreamUtil.class.getResourceAsStream(path);) { 22 | if(inputStream == null) { 23 | return Collections.emptyList(); 24 | } 25 | } catch (IOException e) { 26 | throw new RuntimeException(e); 27 | } 28 | 29 | return StreamUtil.readAllLines(path); 30 | } 31 | 32 | } 33 | 34 | -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/support/result/WordResultHandlerRaw.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.support.result; 2 | 3 | import com.github.houbb.heaven.annotation.ThreadSafe; 4 | import com.github.houbb.sensitive.word.api.IWordContext; 5 | import com.github.houbb.sensitive.word.api.IWordResult; 6 | 7 | /** 8 | * 不做任何处理 9 | * @author binbin.hou 10 | * @since 0.1.0 11 | */ 12 | @ThreadSafe 13 | public class WordResultHandlerRaw extends AbstractWordResultHandler { 14 | 15 | /** 16 | * @since 0.3.0 17 | */ 18 | private static final WordResultHandlerRaw INSTANCE = new WordResultHandlerRaw(); 19 | 20 | public static WordResultHandlerRaw getInstance() { 21 | return INSTANCE; 22 | } 23 | 24 | @Override 25 | protected IWordResult doHandle(IWordResult wordResult, IWordContext wordContext, String originalText) { 26 | return wordResult; 27 | } 28 | 29 | } 30 | -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/support/resultcondition/AbstractWordResultCondition.java: 
-------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.support.resultcondition; 2 | 3 | import com.github.houbb.sensitive.word.api.IWordContext; 4 | import com.github.houbb.sensitive.word.api.IWordResult; 5 | import com.github.houbb.sensitive.word.api.IWordResultCondition; 6 | import com.github.houbb.sensitive.word.constant.enums.WordValidModeEnum; 7 | 8 | /** 9 | * 抽象实现 10 | * 11 | * @since 0.13.0 12 | */ 13 | public abstract class AbstractWordResultCondition implements IWordResultCondition { 14 | 15 | protected abstract boolean doMatch(IWordResult wordResult, String text, WordValidModeEnum modeEnum, IWordContext context); 16 | 17 | @Override 18 | public boolean match(IWordResult wordResult, String text, WordValidModeEnum modeEnum, IWordContext context) { 19 | return doMatch(wordResult, text, modeEnum, context); 20 | } 21 | 22 | } 23 | -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/support/format/mapping/AbstractWordFormatText.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.support.format.mapping; 2 | 3 | import com.github.houbb.heaven.util.lang.StringUtil; 4 | import com.github.houbb.sensitive.word.api.IWordContext; 5 | import com.github.houbb.sensitive.word.api.IWordFormatText; 6 | 7 | import java.util.Collections; 8 | import java.util.HashMap; 9 | import java.util.Map; 10 | 11 | /** 12 | * 抽象实现 13 | * @author binbin.hou 14 | * @since 0.28.0 15 | */ 16 | public abstract class AbstractWordFormatText implements IWordFormatText { 17 | 18 | protected abstract Map doFormat(String text, IWordContext context); 19 | 20 | @Override 21 | public Map format(String text, IWordContext context) { 22 | if(StringUtil.isEmpty(text)) { 23 | return Collections.emptyMap(); 24 | } 25 | 26 | return doFormat(text, context); 27 | } 28 | 29 | } 30 | 
-------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/support/replace/WordReplaces.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.support.replace; 2 | 3 | import com.github.houbb.sensitive.word.api.IWordReplace; 4 | 5 | /** 6 | * 字符替换策略工具类 7 | * 8 | * @author binbin.hou 9 | * @since 0.3.0 10 | */ 11 | public final class WordReplaces { 12 | 13 | private WordReplaces(){} 14 | 15 | /** 16 | * 字符 17 | * @param c 字符 18 | * @return 结果 19 | * @since 0.3.0 20 | */ 21 | public static IWordReplace chars(final char c) { 22 | return new WordReplaceChar(c); 23 | } 24 | 25 | /** 26 | * 字符,默认为 * 27 | * @return 结果 28 | * @since 0.3.0 29 | */ 30 | public static IWordReplace chars() { 31 | return new WordReplaceChar(); 32 | } 33 | 34 | /** 35 | * 字符,默认为 * 36 | * @return 结果 37 | * @since 0.7.0 38 | */ 39 | public static IWordReplace defaults() { 40 | return chars(); 41 | } 42 | 43 | } 44 | -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/constant/enums/WordTagType.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.constant.enums; 2 | 3 | /** 4 | * 单词标签类别 5 | * 6 | * @since 0.24.0 7 | */ 8 | public enum WordTagType { 9 | ZHENGZHI("0", "政治"), 10 | DUPIN("1", "毒品"), 11 | SEQING("2", "色情"), 12 | DUBO("3", "赌博"), 13 | FANZUI("4", "违法犯罪"), 14 | ; 15 | 16 | private final String code; 17 | private final String desc; 18 | 19 | WordTagType(String code, String desc) { 20 | this.code = code; 21 | this.desc = desc; 22 | } 23 | 24 | public String getCode() { 25 | return code; 26 | } 27 | 28 | public String getDesc() { 29 | return desc; 30 | } 31 | 32 | public static String getDescByCode(final String code) { 33 | for(WordTagType tagType : WordTagType.values()) { 34 | 
if(tagType.code.equals(code)) { 35 | return tagType.desc; 36 | } 37 | } 38 | return code; 39 | } 40 | 41 | } 42 | -------------------------------------------------------------------------------- /src/test/java/com/github/houbb/sensitive/word/data/WordCountDto.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.data; 2 | 3 | /** 4 | * Test DTO pairing a code with a count; instances order by ascending count. 5 | */ 6 | public class WordCountDto implements Comparable<WordCountDto> { 7 | 8 | private String code; 9 | private int count; 10 | 11 | public WordCountDto(String code, int count) { 12 | this.code = code; 13 | this.count = count; 14 | } 15 | 16 | public String getCode() { 17 | return code; 18 | } 19 | 20 | public void setCode(String code) { 21 | this.code = code; 22 | } 23 | 24 | public int getCount() { 25 | return count; 26 | } 27 | 28 | public void setCount(int count) { 29 | this.count = count; 30 | } 31 | 32 | /** 33 | * Orders by ascending count. 34 | * Uses Integer.compare instead of subtraction, which overflows when the counts have opposite signs and large magnitude. 35 | * 36 | * @param o the entry to compare against 37 | * @return a negative value, zero or a positive value when this count is smaller than, equal to or greater than the other 38 | */ 39 | @Override 40 | public int compareTo(WordCountDto o) { 41 | return Integer.compare(this.count, o.count); 42 | } 43 | 44 | @Override 45 | public String toString() { 46 | return "{" + 47 | "n='" + code + '\'' + 48 | ", c=" + count + 49 | '}'; 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /src/test/java/com/github/houbb/sensitive/word/bugs/b32/MyWordDenyChineseTest.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.bugs.b32; 2 | 3 | import com.github.houbb.sensitive.word.api.IWordDeny; 4 | import com.github.houbb.sensitive.word.bs.SensitiveWordBs; 5 | import com.github.houbb.sensitive.word.support.deny.WordDenys; 6 | import org.junit.Assert; 7 | import org.junit.Test; 8 | 9 | public class MyWordDenyChineseTest { 10 | 11 | @Test 12 | public void test() { 13 | IWordDeny wordDeny = WordDenys.chains(WordDenys.defaults(), new MyWordDenyChineseNum()); 14 | SensitiveWordBs sensitiveWordBs = SensitiveWordBs.newInstance() 15 | .wordDeny(wordDeny)// 各种其他配置 16 | .init();// init()
初始化敏感词字典 17 | 18 | final String text = "和我练习三三九乘元功、一军两策"; 19 | 20 | //输出测试结果 21 | Assert.assertEquals("[三三九乘元功, 一军两策]", sensitiveWordBs.findAll(text).toString()); 22 | Assert.assertTrue(sensitiveWordBs.contains("三三九乘元功")); 23 | Assert.assertTrue(sensitiveWordBs.contains("一军两策")); 24 | } 25 | 26 | } 27 | -------------------------------------------------------------------------------- /src/test/java/com/github/houbb/sensitive/word/spring/annotation/Configuration.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.spring.annotation; 2 | 3 | /* 4 | * Copyright 2002-2017 the original author or authors. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
17 | */ 18 | 19 | import java.lang.annotation.*; 20 | 21 | 22 | @Target(ElementType.TYPE) 23 | @Retention(RetentionPolicy.RUNTIME) 24 | @Documented 25 | @Component 26 | public @interface Configuration { 27 | 28 | String value() default ""; 29 | 30 | } 31 | 32 | -------------------------------------------------------------------------------- /src/test/java/com/github/houbb/sensitive/word/support/resultcondition/MyWordTag.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.support.resultcondition; 2 | 3 | import com.github.houbb.sensitive.word.support.tag.AbstractWordTag; 4 | 5 | import java.util.HashMap; 6 | import java.util.HashSet; 7 | import java.util.Map; 8 | import java.util.Set; 9 | 10 | /** 11 | * 自定义单词标签 12 | * @since 0.23.0 13 | */ 14 | public class MyWordTag extends AbstractWordTag { 15 | 16 | private static Map> dataMap; 17 | 18 | static { 19 | dataMap = new HashMap<>(); 20 | dataMap.put("商品", buildSet("广告", "中文")); 21 | dataMap.put("AV", buildSet("色情", "单词", "英文")); 22 | } 23 | 24 | private static Set buildSet(String... 
tags) { 25 | Set set = new HashSet<>(); 26 | for(String tag : tags) { 27 | set.add(tag); 28 | } 29 | return set; 30 | } 31 | 32 | @Override 33 | protected Set doGetTag(String word) { 34 | return dataMap.get(word); 35 | } 36 | 37 | } 38 | -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/support/result/WordResultHandlerWord.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.support.result; 2 | 3 | import com.github.houbb.heaven.annotation.ThreadSafe; 4 | import com.github.houbb.sensitive.word.api.IWordContext; 5 | import com.github.houbb.sensitive.word.api.IWordResult; 6 | import com.github.houbb.sensitive.word.utils.InnerWordCharUtils; 7 | 8 | /** 9 | * 只保留单词 10 | * 11 | * @author binbin.hou 12 | * @since 0.1.0 13 | */ 14 | @ThreadSafe 15 | public class WordResultHandlerWord extends AbstractWordResultHandler { 16 | 17 | /** 18 | * @since 0.3.0 19 | */ 20 | private static final WordResultHandlerWord INSTANCE = new WordResultHandlerWord(); 21 | 22 | public static WordResultHandlerWord getInstance() { 23 | return INSTANCE; 24 | } 25 | 26 | @Override 27 | protected String doHandle(IWordResult wordResult, IWordContext wordContext, String originalText) { 28 | // 截取 29 | return InnerWordCharUtils.getString(originalText, wordResult); 30 | } 31 | 32 | } 33 | -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/api/IWordCheck.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.api; 2 | 3 | import com.github.houbb.sensitive.word.api.context.InnerSensitiveWordContext; 4 | import com.github.houbb.sensitive.word.support.check.WordCheckResult; 5 | 6 | /** 7 | * 敏感信息监测接口 8 | * (1)敏感词 9 | * (2)数字(连续8位及其以上) 10 | * (3)邮箱 11 | * (4)URL 12 | * 13 | * 可以使用责任链的模式,循环调用。 14 | * 
@author binbin.hou 15 | * @since 0.0.5 16 | */ 17 | public interface IWordCheck { 18 | 19 | /** 20 | * 检查敏感词数量 21 | *

22 | * (1)如果未命中敏感词,直接返回 0 23 | * (2)命中敏感词,则返回敏感词的长度。 24 | *

25 | * ps: 这里结果进行优化, 26 | * 1. 是否包含敏感词。 27 | * 2. 敏感词的长度 28 | * 3. 正常走过字段的长度(便于后期替换优化,避免不必要的循环重复) 29 | * 30 | * @param beginIndex 开始下标 31 | * @param context 执行上下文 32 | * @return 敏感信息对应的长度 33 | * @since 0.0.5 34 | * @since 0.24.2 为了黑白名单统一,调整了对应的返回值 35 | */ 36 | WordCheckResult sensitiveCheck(final int beginIndex, 37 | final InnerSensitiveWordContext context); 38 | 39 | } 40 | -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/support/deny/WordDenySystem.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.support.deny; 2 | 3 | import com.github.houbb.heaven.annotation.ThreadSafe; 4 | import com.github.houbb.sensitive.word.api.IWordDeny; 5 | import com.github.houbb.sensitive.word.utils.InnerStreamUtils; 6 | 7 | import java.util.ArrayList; 8 | import java.util.List; 9 | 10 | /** 11 | * Deny list backed by the dictionary resources read through InnerStreamUtils. 12 | * @author binbin.hou 13 | * @since 0.0.13 14 | */ 15 | @ThreadSafe 16 | public class WordDenySystem implements IWordDeny { 17 | 18 | /** 19 | * @since 0.3.0 20 | */ 21 | private static final IWordDeny INSTANCE = new WordDenySystem(); 22 | 23 | public static IWordDeny getInstance() { 24 | return INSTANCE; 25 | } 26 | 27 | /** 28 | * Reads the three dictionary resources and returns all of their lines in one list. 29 | * The first result is copied into a fresh ArrayList because InnerStreamUtils.readAllLines 30 | * returns the immutable Collections.emptyList() for a missing resource, on which addAll would throw. 31 | * 32 | * @return a new mutable list holding every deny word 33 | */ 34 | @Override 35 | public List<String> deny() { 36 | List<String> results = new ArrayList<>(InnerStreamUtils.readAllLines("/sensitive_word_dict.txt")); 37 | results.addAll(InnerStreamUtils.readAllLines("/sensitive_word_dict_en.txt")); 38 | results.addAll(InnerStreamUtils.readAllLines("/sensitive_word_deny.txt")); 39 | return results; 40 | } 41 | 42 | } 43 | -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/support/result/WordResultHandlers.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.support.result; 2 | 3 | import com.github.houbb.sensitive.word.api.IWordResult; 4 | import
com.github.houbb.sensitive.word.api.IWordResultHandler; 5 | 6 | /** 7 | * 敏感词的结果处理 8 | * @author binbin.hou 9 | * @since 0.1.0 10 | */ 11 | public final class WordResultHandlers { 12 | 13 | private WordResultHandlers(){} 14 | 15 | /** 16 | * 不做任何处理 17 | * @return 结果 18 | * @since 0.1.0 19 | */ 20 | public static IWordResultHandler raw() { 21 | return WordResultHandlerRaw.getInstance(); 22 | } 23 | 24 | /** 25 | * 只保留单词 26 | * @return 结果 27 | * @since 0.1.0 28 | */ 29 | public static IWordResultHandler word() { 30 | return WordResultHandlerWord.getInstance(); 31 | } 32 | 33 | /** 34 | * 单词+标签的处理结果 35 | * @return 单词+标签的处理结果 36 | * @since 0.12.0 37 | */ 38 | public static IWordResultHandler wordTags() { 39 | return new WordResultHandlerWordTags(); 40 | } 41 | 42 | } 43 | -------------------------------------------------------------------------------- /src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsSystemDictTest.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.bs; 2 | 3 | import com.github.houbb.sensitive.word.api.IWordAllow; 4 | import com.github.houbb.sensitive.word.api.IWordDeny; 5 | import com.github.houbb.sensitive.word.support.allow.WordAllows; 6 | import com.github.houbb.sensitive.word.support.deny.WordDenys; 7 | import com.github.houbb.sensitive.word.support.tag.WordTags; 8 | import org.junit.Assert; 9 | import org.junit.Test; 10 | 11 | import java.util.Arrays; 12 | import java.util.List; 13 | 14 | /** 15 | *

project: sensitive-word-SensitiveWordBsTest

16 | *

create on 2020/1/7 23:43

17 | * 18 | * @author Administrator 19 | * @since 0.27.0 20 | */ 21 | public class SensitiveWordBsSystemDictTest { 22 | 23 | @Test 24 | public void configTest() { 25 | SensitiveWordBs sensitiveWordBs = SensitiveWordBs.newInstance() 26 | .wordAllow(WordAllows.defaults()) 27 | .wordDeny(WordDenys.defaults()) 28 | .wordTag(WordTags.defaults()) 29 | .init(); 30 | } 31 | 32 | 33 | } 34 | -------------------------------------------------------------------------------- /src/test/java/com/github/houbb/sensitive/word/spring/service/SensitiveWordService.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.spring.service; 2 | 3 | import com.github.houbb.sensitive.word.bs.SensitiveWordBs; 4 | import com.github.houbb.sensitive.word.spring.annotation.Autowired; 5 | import com.github.houbb.sensitive.word.spring.annotation.Component; 6 | 7 | /** 8 | * @author binbin.hou 9 | * @since 1.0.0 10 | */ 11 | @Component 12 | public class SensitiveWordService { 13 | 14 | @Autowired 15 | private SensitiveWordBs sensitiveWordBs; 16 | 17 | /** 18 | * 更新词库 19 | * 20 | * 每次数据库的信息发生变化之后,首先调用更新数据库敏感词库的方法。 21 | * 如果需要生效,则调用这个方法。 22 | * 23 | * 说明:重新初始化不影响旧的方法使用。初始化完成后,会以新的为准。 24 | */ 25 | public void refresh() { 26 | // 每次数据库的信息发生变化之后,首先调用更新数据库敏感词库的方法,然后调用这个方法。 27 | sensitiveWordBs.init(); 28 | } 29 | 30 | /** 31 | * 是否包含 32 | * 33 | * 可以重新封装,也可以直接使用 sensitiveWordBs 34 | * @param word 单词 35 | * @return 结果 36 | */ 37 | public boolean contains(String word){ 38 | return sensitiveWordBs.contains(word); 39 | } 40 | 41 | } 42 | -------------------------------------------------------------------------------- /release.bat: -------------------------------------------------------------------------------- 1 | :: 用于 release 当前项目(windows) 2 | :: author: houbb 3 | :: LastUpdateTime: 2018-1-22 09:08:52 4 | :: 用法:双击运行,或者当前路径 cmd 直接输入 release.bat 5 | 6 | :: 关闭回显 7 | @echo OFF 8 | 9 | ECHO "============================= RELEASE START..." 
10 | 11 | :: 版本号信息(需要手动指定) 12 | :::: 旧版本名称 13 | SET version=0.28.0 14 | :::: 新版本名称 15 | SET newVersion=0.29.0 16 | :::: 组织名称 17 | SET groupName=com.github.houbb 18 | :::: 项目名称 19 | SET projectName=sensitive-word 20 | 21 | :: release 项目版本 22 | :::: snapshot 版本号 23 | SET snapshot_version=%version%"-SNAPSHOT" 24 | :::: 新的版本号 25 | SET release_version=%version% 26 | 27 | call mvn versions:set -DgroupId=%groupName% -DartifactId=%projectName% -DoldVersion=%snapshot_version% -DnewVersion=%release_version% 28 | call mvn -N versions:update-child-modules 29 | call mvn versions:commit 30 | call echo "1. RELEASE %snapshot_version% TO %release_version% DONE." 31 | 32 | 33 | :: 推送到 github 34 | git add . 35 | git commit -m "release branch %version%" 36 | git push 37 | git status 38 | 39 | ECHO "2. PUSH TO GITHUB DONE." 40 | 41 | :: 推送到 maven 中央仓库 42 | call mvn clean deploy -P release 43 | ECHO "3 PUSH TO MVN CENTER DONE." 44 | -------------------------------------------------------------------------------- /src/test/java/com/github/houbb/sensitive/word/bugs/b20211211/MySensitiveTest.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.bugs.b20211211; 2 | 3 | import com.github.houbb.sensitive.word.api.IWordAllow; 4 | import com.github.houbb.sensitive.word.api.IWordDeny; 5 | import com.github.houbb.sensitive.word.bs.SensitiveWordBs; 6 | import com.github.houbb.sensitive.word.support.allow.WordAllows; 7 | import com.github.houbb.sensitive.word.support.deny.WordDenys; 8 | import org.junit.Test; 9 | 10 | public class MySensitiveTest { 11 | 12 | 13 | @Test 14 | public void test() { 15 | IWordDeny wordDeny = WordDenys.chains(WordDenys.defaults(), new MyWordDeny()); 16 | IWordAllow wordAllow = WordAllows.chains(WordAllows.defaults(), new MyWordAllow()); 17 | SensitiveWordBs sensitiveWordBs = SensitiveWordBs.newInstance() 18 | .wordAllow(wordAllow) 19 | .wordDeny(wordDeny)// 各种其他配置 20 | .init();// init() 初始化敏感词字典 
21 | 22 | final String text = "五星红旗 我的自定义敏感词尼玛"; 23 | //输出测试结果 24 | System.out.println("敏感词:"+sensitiveWordBs.findAll(text).toString()); 25 | } 26 | 27 | } 28 | -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/support/combine/allowdeny/WordAllowDenyCombine.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.support.combine.allowdeny; 2 | 3 | import com.github.houbb.sensitive.word.api.IWordContext; 4 | 5 | import java.util.Collection; 6 | import java.util.HashSet; 7 | import java.util.List; 8 | import java.util.Set; 9 | 10 | /** 11 | * @author d 12 | * @since 0.8.0 13 | */ 14 | public class WordAllowDenyCombine extends AbstractWordAllowDenyCombine{ 15 | 16 | @Override 17 | protected Collection doGetActualDenyList(List allowList, 18 | List denyList, 19 | IWordContext context) { 20 | Set resultSet = new HashSet<>(denyList.size()); 21 | 22 | // O(1) 23 | Set allowSet = new HashSet<>(allowList); 24 | 25 | for(String deny : denyList) { 26 | if(allowSet.contains(deny)) { 27 | continue; 28 | } 29 | 30 | resultSet.add(deny); 31 | } 32 | return resultSet; 33 | } 34 | 35 | 36 | } 37 | -------------------------------------------------------------------------------- /src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsIpv4Test.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.bs; 2 | 3 | import org.junit.Assert; 4 | import org.junit.Test; 5 | 6 | import java.util.List; 7 | 8 | /** 9 | 10 | */ 11 | public class SensitiveWordBsIpv4Test { 12 | 13 | /** 14 | * ipv4 地址 15 | * @since 0.17.0 16 | */ 17 | @Test 18 | public void defaultTest() { 19 | final String text = "个人网站,如果网址打不开可以访问 127.0.0.1。"; 20 | final SensitiveWordBs sensitiveWordBs = SensitiveWordBs.newInstance().init(); 21 | 22 | List wordList = sensitiveWordBs.findAll(text); 23 
| Assert.assertEquals("[]", wordList.toString()); 24 | } 25 | 26 | /** 27 | * ipv4 地址 28 | * @since 0.17.0 29 | */ 30 | @Test 31 | public void ipv4CheckTest() { 32 | final String text = "个人网站,如果网址打不开可以访问 127.0.0.1。"; 33 | final SensitiveWordBs sensitiveWordBs = SensitiveWordBs.newInstance().enableIpv4Check(true).init(); 34 | List wordList = sensitiveWordBs.findAll(text); 35 | Assert.assertEquals("[127.0.0.1]", wordList.toString()); 36 | } 37 | 38 | } 39 | -------------------------------------------------------------------------------- /src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsNumLenTest.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.bs; 2 | 3 | import org.junit.Assert; 4 | import org.junit.Test; 5 | 6 | import java.util.List; 7 | 8 | /** 9 | *

project: sensitive-word-SensitiveWordBsTest

10 | *

create on 2020/1/7 23:43

11 | * 12 | * @author Administrator 13 | * @since 0.2.1 14 | */ 15 | public class SensitiveWordBsNumLenTest { 16 | 17 | /** 18 | * 返回所有敏感词 19 | * @since 0.2.1 20 | */ 21 | @Test 22 | public void findAllTest() { 23 | final String text = "你懂得:12345678"; 24 | 25 | // 默认检测 8 位 26 | List wordList = SensitiveWordBs.newInstance() 27 | .enableNumCheck(true) 28 | .init().findAll(text); 29 | Assert.assertEquals("[12345678]", wordList.toString()); 30 | 31 | // 指定数字的长度,避免误杀 32 | List wordList2 = SensitiveWordBs.newInstance() 33 | .enableNumCheck(true) 34 | .numCheckLen(9) 35 | .init() 36 | .findAll(text); 37 | Assert.assertEquals("[]", wordList2.toString()); 38 | } 39 | 40 | 41 | } 42 | -------------------------------------------------------------------------------- /src/test/java/com/github/houbb/sensitive/word/bugs/b118/Bug118Test.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.bugs.b118; 2 | 3 | import com.github.houbb.sensitive.word.bs.SensitiveWordBs; 4 | import com.github.houbb.sensitive.word.support.check.WordChecks; 5 | import com.github.houbb.sensitive.word.support.ignore.SensitiveWordCharIgnores; 6 | import org.junit.Assert; 7 | import org.junit.Test; 8 | 9 | public class Bug118Test { 10 | 11 | @Test 12 | public void test() { 13 | SensitiveWordBs sensitiveWordBs = SensitiveWordBs.newInstance() 14 | .charIgnore(SensitiveWordCharIgnores.specialChars()) 15 | .wordCheckNum(WordChecks.num()) 16 | .numCheckLen(8) 17 | .enableNumCheck(true) 18 | .init(); 19 | 20 | Assert.assertEquals(sensitiveWordBs.findFirst("1234567===0001哈哈哈"), "1234567===0001"); 21 | Assert.assertEquals(sensitiveWordBs.findFirst("12345670002 哈哈哈"), "12345670002"); 22 | Assert.assertEquals(sensitiveWordBs.findFirst("=====123456====70002 哈哈哈"), "=====123456====70002"); 23 | Assert.assertEquals(sensitiveWordBs.findFirst("=====123456====X70002 哈哈哈"), null); 24 | } 25 | 26 | } 27 | 
-------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/support/tag/FileWordTag.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.support.tag; 2 | 3 | import com.github.houbb.heaven.util.common.ArgUtil; 4 | import com.github.houbb.heaven.util.io.FileUtil; 5 | import com.github.houbb.sensitive.word.api.IWordTag; 6 | 7 | import java.util.List; 8 | import java.util.Set; 9 | 10 | /** 11 | * File-backed word tags. All lines of the file are read once in the constructor. 12 | * 13 | * Line format with the default separators: word tag1,tag2 14 | * @since 0.10.0 15 | */ 16 | public class FileWordTag extends AbstractWordTag { 17 | 18 | /** 19 | * Tag lookup built from the file lines by WordTags.lines; doGetTag delegates to it 20 | */ 21 | protected final IWordTag wordTag; 22 | 23 | /** Uses " " between the word and its tags and "," between tags. */ public FileWordTag(String filePath) { 24 | this(filePath, " ", ","); 25 | } 26 | 27 | /** All three arguments must be non-empty (checked with ArgUtil.notEmpty). */ public FileWordTag(String filePath, String wordSplit, String tagSplit) { 28 | ArgUtil.notEmpty(filePath, "filePath"); 29 | ArgUtil.notEmpty(wordSplit, "wordSplit"); 30 | ArgUtil.notEmpty(tagSplit, "tagSplit"); 31 | 32 | List lines = FileUtil.readAllLines(filePath); 33 | wordTag = WordTags.lines(lines, wordSplit, tagSplit); 34 | } 35 | 36 | @Override 37 | protected Set doGetTag(String word) { 38 | return wordTag.getTag(word); 39 | } 40 | 41 | } 42 | -------------------------------------------------------------------------------- /src/test/java/com/github/houbb/sensitive/word/data/DictRemoveSingleTest.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.data; 2 | 3 | import com.github.houbb.heaven.util.io.FileUtil; 4 | import org.junit.Ignore; 5 | import org.junit.Test; 6 | 7 | import java.util.List; 8 | 9 | /** 10 | * Data initialisation 11 | * @author binbin.hou 12 | * @since 0.9.0 13 | */ 14 | @Ignore 15 | public class DictRemoveSingleTest { 16 | 17 | /** 18 | * Normalise the format (NOTE(review): this list looks copied from a formatting test; the method body only drops entries of one character or less) 19 | * 20 | * 1. convert all upper-case letters to lower case 21 | * 2. convert all full-width characters to half-width 22 | * 3. remove all whitespace and symbols (i.e. filter out every kind of symbol) 23 | * 4. 
convert traditional Chinese characters to simplified ones 24 | * @since 0.0.3 25 | */ 26 | @Test 27 | @Ignore 28 | public void removeSingleWord() { 29 | final String sourceFile = "D:\\code\\github\\sensitive-word\\src\\test\\resources\\dict_20231117.txt"; 30 | final String targetFile = "D:\\code\\github\\sensitive-word\\src\\main\\resources\\sensitive_word_dict.txt"; 31 | 32 | List words = FileUtil.readAllLines(sourceFile); 33 | /* keep only entries that are longer than one character after trimming */ 34 | for(String word : words) { 35 | String wordTrim = word.trim(); 36 | if(wordTrim.length() > 1) { 37 | FileUtil.append(targetFile, wordTrim); 38 | } 39 | } 40 | } 41 | 42 | } 43 | -------------------------------------------------------------------------------- /src/test/java/com/github/houbb/sensitive/word/data/DictRemoveTwoEnglishTest.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.data; 2 | 3 | import com.github.houbb.heaven.util.io.FileUtil; 4 | import com.github.houbb.heaven.util.lang.CharUtil; 5 | import com.github.houbb.heaven.util.lang.StringUtil; 6 | import org.junit.Ignore; 7 | import org.junit.Test; 8 | 9 | import java.util.List; 10 | 11 | /** 12 | * Data initialisation: copies the dictionary to a new file, printing (instead of copying) the entries that are exactly two characters long and English 13 | * @author binbin.hou 14 | * @since 0.9.0 15 | */ 16 | @Ignore 17 | public class DictRemoveTwoEnglishTest { 18 | 19 | public static void main(String[] args) { 20 | final String sourceFile = "D:\\github\\sensitive-word\\src\\main\\resources\\sensitive_word_dict.txt"; 21 | final String targetFile = "D:\\github\\sensitive-word\\src\\test\\resources\\dict_v20240407.txt"; 22 | 23 | List words = FileUtil.readAllLines(sourceFile); 24 | 25 | for(String word : words) { 26 | String wordTrim = word.trim(); 27 | // two-character English entry: print it and leave it out of the target file 28 | if(wordTrim.length() == 2 && StringUtil.isEnglish(wordTrim)) { 29 | System.out.println(word); 30 | } else { 31 | FileUtil.append(targetFile, wordTrim); 32 | } 33 | } 34 | } 35 | 36 | } 37 | -------------------------------------------------------------------------------- 
/src/main/java/com/github/houbb/sensitive/word/support/format/WordFormatArray.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.support.format; 2 | 3 | import com.github.houbb.heaven.util.common.ArgUtil; 4 | import com.github.houbb.sensitive.word.api.IWordContext; 5 | import com.github.houbb.sensitive.word.api.IWordFormat; 6 | 7 | import java.util.List; 8 | 9 | /** 10 | * Applies a fixed sequence of formats directly: the list is copied into an array at construction and format() feeds the character through every element in order 11 | * @author binbin.hou 12 | * @since 0.30.0 13 | */ 14 | public class WordFormatArray implements IWordFormat { 15 | 16 | private final IWordFormat[] wordFormats; 17 | private final int size; 18 | /** The list must be non-empty (ArgUtil.notEmpty); its elements are copied, so later changes to the list do not affect this instance. */ public WordFormatArray(List wordFormats) { 19 | ArgUtil.notEmpty(wordFormats, "wordFormats"); 20 | 21 | this.size = wordFormats.size(); 22 | this.wordFormats = new IWordFormat[size]; 23 | for(int i = 0; i < size; i++) { 24 | this.wordFormats[i] = wordFormats.get(i); 25 | } 26 | } 27 | 28 | /** Each format receives the output of the previous one; the result of the last format is returned. */ @Override 29 | public char format(char original, IWordContext context) { 30 | char c = original; 31 | for(int i = 0; i < size; i++) { 32 | IWordFormat charFormat = wordFormats[i]; 33 | c = charFormat.format(c, context); 34 | } 35 | 36 | return c; 37 | } 38 | 39 | } 40 | -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/support/replace/WordReplaceChar.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.support.replace; 2 | 3 | import com.github.houbb.heaven.annotation.ThreadSafe; 4 | import com.github.houbb.heaven.constant.CharConst; 5 | import com.github.houbb.sensitive.word.api.IWordReplace; 6 | import com.github.houbb.sensitive.word.api.IWordContext; 7 | import com.github.houbb.sensitive.word.api.IWordResult; 8 | 9 | /** 10 | * Replacement strategy that masks a match with one fixed character 11 | * @author binbin.hou 12 | * @since 0.2.0 13 | */ 14 | @ThreadSafe 15 | public class WordReplaceChar implements IWordReplace { 16 | 17 
| /** 18 | * Character appended in place of every character of a matched word 19 | * @since 0.3.0 20 | */ 21 | private final char replaceChar; 22 | 23 | public WordReplaceChar(char replaceChar) { 24 | this.replaceChar = replaceChar; 25 | } 26 | 27 | /** Uses CharConst.STAR as the replacement character. */ public WordReplaceChar() { 28 | this(CharConst.STAR); 29 | } 30 | 31 | /** Appends replaceChar (endIndex - startIndex) times to stringBuilder; rawText and wordContext are not read. */ @Override 32 | public void replace(StringBuilder stringBuilder, final String rawText, IWordResult wordResult, IWordContext wordContext) { 33 | int wordLen = wordResult.endIndex() - wordResult.startIndex(); 34 | for(int i = 0; i < wordLen; i++) { 35 | stringBuilder.append(replaceChar); 36 | } 37 | } 38 | 39 | } 40 | -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/utils/InnerWordCharUtils.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.utils; 2 | 3 | import com.github.houbb.sensitive.word.api.IWordResult; 4 | 5 | /** 6 | *

project: sensitive-word-NumUtils

7 | *

create on 2020/1/8 22:18

8 | * 9 | * @author Administrator 10 | * @since 0.0.4 11 | */ 12 | public final class InnerWordCharUtils { 13 | 14 | private InnerWordCharUtils() { 15 | } 16 | 17 | /** 18 | * Extracts a slice of the text via String.substring 19 | * @param text source text 20 | * @param startIndex start position (inclusive) 21 | * @param endIndex end position (exclusive, as in String.substring) 22 | * @return the extracted text 23 | * @since 0.29.0 24 | */ 25 | public static String getString(final String text, 26 | final int startIndex, 27 | final int endIndex) { 28 | return text.substring(startIndex, endIndex); 29 | } 30 | /** 31 | * Extracts the slice of the text described by a word result 32 | * @param text source text 33 | * @param wordResult result whose startIndex() and endIndex() delimit the slice 34 | * @return the extracted text 35 | * @since 0.29.0 36 | */ 37 | public static String getString(final String text, 38 | final IWordResult wordResult) { 39 | return getString(text, wordResult.startIndex(), wordResult.endIndex()); 40 | } 41 | 42 | } 43 | -------------------------------------------------------------------------------- /src/test/java/com/github/houbb/sensitive/word/replace/MyWordReplace.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.replace; 2 | 3 | import com.github.houbb.sensitive.word.api.IWordReplace; 4 | import com.github.houbb.sensitive.word.api.IWordContext; 5 | import com.github.houbb.sensitive.word.api.IWordResult; 6 | import com.github.houbb.sensitive.word.utils.InnerWordCharUtils; 7 | 8 | /** 9 | * Custom sensitive-word replacement strategy: two specific words are replaced by fixed texts, every other match is masked with '*' 10 | * 11 | * @author binbin.hou 12 | * @since 0.2.0 13 | */ 14 | public class MyWordReplace implements IWordReplace { 15 | 16 | @Override 17 | public void replace(StringBuilder stringBuilder, final String rawText, IWordResult wordResult, IWordContext wordContext) { 18 | String sensitiveWord = InnerWordCharUtils.getString(rawText, wordResult); 19 | // custom per-word replacements; these could be read from a database or similar 20 | if("五星红旗".equals(sensitiveWord)) { 21 | stringBuilder.append("国家旗帜"); 22 | } else if("毛主席".equals(sensitiveWord)) { 23 | stringBuilder.append("教员"); 24 | } else { 25 | // everything else is replaced with * by default 26 | int wordLength = wordResult.endIndex() - 
wordResult.startIndex(); 27 | for(int i = 0; i < wordLength; i++) { 28 | stringBuilder.append('*'); 29 | } 30 | } 31 | } 32 | 33 | } 34 | -------------------------------------------------------------------------------- /src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsNumTest.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.bs; 2 | 3 | import org.junit.Assert; 4 | import org.junit.Test; 5 | 6 | import java.util.List; 7 | 8 | /** 9 | *

project: sensitive-word-SensitiveWordBsTest

10 | *

create on 2020/1/7 23:43

11 | * 12 | * @author Administrator 13 | * @since 0.0.5 14 | */ 15 | public class SensitiveWordBsNumTest { 16 | 17 | /** 18 | * With the number check enabled, the plain digit run in the text is returned as the only match 19 | * @since 0.0.5 20 | */ 21 | @Test 22 | public void findAllTest() { 23 | final String text = "这个是我的微信:9989123456"; 24 | 25 | List wordList = SensitiveWordBs.newInstance() 26 | .enableNumCheck(true) 27 | .init().findAll(text); 28 | Assert.assertEquals("[9989123456]", wordList.toString()); 29 | } 30 | 31 | /** 32 | * With the number check enabled, a run written in mixed numeral styles is returned as one match, in its original spelling 33 | * @since 0.0.5 34 | */ 35 | @Test 36 | public void ignoreNumStyleTest() { 37 | final String text = "这个是我的微信:9⓿二肆⁹₈③⑸⒋➃㈤㊄"; 38 | 39 | List wordList = SensitiveWordBs.newInstance() 40 | .enableNumCheck(true) 41 | .init().findAll(text); 42 | Assert.assertEquals("[9⓿二肆⁹₈③⑸⒋➃㈤㊄]", wordList.toString()); 43 | } 44 | 45 | } 46 | -------------------------------------------------------------------------------- /src/test/java/com/github/houbb/sensitive/word/data/DictRemoveCommonITUsageTest.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.data; 2 | 3 | import com.github.houbb.heaven.util.io.FileUtil; 4 | import com.github.houbb.sensitive.word.bs.SensitiveWordBs; 5 | import org.junit.Ignore; 6 | import org.junit.Test; 7 | 8 | import java.io.File; 9 | import java.util.HashSet; 10 | import java.util.List; 11 | import java.util.Set; 12 | 13 | /** 14 | * Common IT vocabulary 15 | * 16 | * @author binbin.hou 17 | * @since 0.14.1 18 | */ 19 | @Ignore 20 | public class DictRemoveCommonITUsageTest { 21 | 22 | /** 23 | * Scans the author's own articles and collects every word the default configuration flags, so that common IT terms can be removed from the dictionary to lower the false-positive rate. NOTE(review): File.listFiles() returns null when the directory does not exist, which the loop below does not guard against. 24 | */ 25 | @Test 26 | @Ignore 27 | public void removeSingleWord() { 28 | final String dir = "D:\\github\\houbb.github.io\\_posts"; 29 | 30 | File[] files = new File(dir).listFiles(); 31 | 32 | // default strategy 33 | SensitiveWordBs sensitiveWordBs = SensitiveWordBs.newInstance().init(); 34 | 35 | Set allWords = new HashSet<>(); 36 | for(File file : files) { 37 | String content = FileUtil.getFileContent(file); 38 | 39 | 
List words = sensitiveWordBs.findAll(content); 40 | allWords.addAll(words); 41 | } 42 | 43 | System.out.println(allWords); 44 | } 45 | 46 | } 47 | -------------------------------------------------------------------------------- /src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsUrlNoPrefixTest.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.bs; 2 | 3 | import com.github.houbb.sensitive.word.support.check.WordChecks; 4 | import org.junit.Assert; 5 | import org.junit.Test; 6 | 7 | import java.util.List; 8 | 9 | /** 10 | *

project: sensitive-word-SensitiveWordBsTest

11 | *

create on 2020/1/7 23:43

12 | * 13 | * @author Administrator 14 | * @since 0.25.0 15 | */ 16 | public class SensitiveWordBsUrlNoPrefixTest { 17 | 18 | /** 19 | * URL detection with WordChecks.urlNoPrefix(): the expected matches exclude the "https://" scheme, and bare domains are matched as well 20 | * 21 | * @since 0.25.0 22 | */ 23 | @Test 24 | public void urlNoPrefixTest() { 25 | final String text = "点击链接 https://www.baidu.com 查看答案,当然也可以是 baidu.com、www.baidu.com"; 26 | 27 | final SensitiveWordBs sensitiveWordBs = SensitiveWordBs.newInstance() 28 | .enableUrlCheck(true) // enable URL detection 29 | .wordCheckUrl(WordChecks.urlNoPrefix()) // choose the detection implementation 30 | .init(); 31 | List wordList = sensitiveWordBs.findAll(text); 32 | Assert.assertEquals("[www.baidu.com, baidu.com, www.baidu.com]", wordList.toString()); 33 | 34 | Assert.assertEquals("点击链接 https://************* 查看答案,当然也可以是 *********、*************", sensitiveWordBs.replace(text)); 35 | } 36 | 37 | } 38 | -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/support/combine/check/WordCheckCombine.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.support.combine.check; 2 | 3 | import com.github.houbb.sensitive.word.api.IWordCheck; 4 | import com.github.houbb.sensitive.word.api.IWordContext; 5 | import com.github.houbb.sensitive.word.support.check.WordChecks; 6 | 7 | import java.util.ArrayList; 8 | import java.util.List; 9 | 10 | /** 11 | * Collects the checks enabled on the context, in the fixed order word, number, email, URL, IPv4 12 | * @author d 13 | * @since 0.8.0 14 | */ 15 | public class WordCheckCombine extends AbstractWordCheckCombine { 16 | 17 | @Override 18 | protected List getWordCheckList(IWordContext context) { 19 | List wordCheckList = new ArrayList<>(); 20 | 21 | if(context.enableWordCheck()) { 22 | wordCheckList.add(context.wordCheckWord()); 23 | } 24 | if(context.enableNumCheck()) { 25 | wordCheckList.add(context.wordCheckNum()); 26 | } 27 | if(context.enableEmailCheck()) { 28 | wordCheckList.add(context.wordCheckEmail()); 29 | } 30 | if(context.enableUrlCheck()) { 31 | 
wordCheckList.add(context.wordCheckUrl()); 31 | } 32 | if(context.enableIpv4Check()) { 33 | wordCheckList.add(context.wordCheckIpv4()); 34 | } 35 | 36 | return wordCheckList; 37 | } 38 | 39 | } 40 | -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/utils/InnerWordTagUtils.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.utils; 2 | 3 | import com.github.houbb.heaven.util.lang.StringUtil; 4 | import com.github.houbb.heaven.util.util.CollectionUtil; 5 | import com.github.houbb.sensitive.word.api.IWordContext; 6 | import com.github.houbb.sensitive.word.api.IWordTag; 7 | 8 | import java.util.Collections; 9 | import java.util.Set; 10 | 11 | /** 12 | * Internal helper for looking up word tags 13 | * 14 | * @since 0.24.0 15 | */ 16 | public class InnerWordTagUtils { 17 | 18 | /** 19 | * Looks up the tags of a sensitive word: first with the word as given and, when that yields nothing, with the word formatted by InnerWordFormatUtils.format 20 | * 21 | * @param word sensitive word 22 | * @param wordContext context providing the IWordTag and the formatting 23 | * @return the tags; null when the word is empty (callers must handle null), otherwise whatever the IWordTag returns 24 | * @since 0.24.0 25 | */ 26 | public static Set tags(final String word, 27 | final IWordContext wordContext) { 28 | if(StringUtil.isEmpty(word)) { 29 | return null; 30 | } 31 | 32 | final IWordTag wordTag = wordContext.wordTag(); 33 | // direct lookup with the word as given 34 | Set actualSet = wordTag.getTag(word); 35 | if(CollectionUtil.isNotEmpty(actualSet)) { 36 | return actualSet; 37 | } 38 | 39 | // fall back to the formatted form of the word 40 | String formatWord = InnerWordFormatUtils.format(word, wordContext); 41 | return wordContext.wordTag().getTag(formatWord); 42 | } 43 | 44 | } 45 | -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/support/format/mapping/WordFormatTextDefault.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.support.format.mapping; 2 | 3 | import com.github.houbb.sensitive.word.api.IWordContext; 4 | import 
com.github.houbb.sensitive.word.api.IWordFormat; 5 | import com.github.houbb.sensitive.word.support.format.WordFormatNone; 6 | 7 | import java.util.Collections; 8 | import java.util.HashMap; 9 | import java.util.Map; 10 | 11 | /** 12 | * Default implementation: maps every character of the text that the context's format changes to its formatted character 13 | * 14 | * @author d 15 | * @since 0.28.0 16 | */ 17 | public class WordFormatTextDefault extends AbstractWordFormatText { 18 | 19 | @Override 20 | protected Map doFormat(String text, IWordContext context) { 21 | // the format applied to each single character 22 | final IWordFormat wordFormat = context.wordFormat(); 23 | // nothing to do when the format is exactly WordFormatNone (compared by class name) 24 | if(wordFormat.getClass().getName().equals(WordFormatNone.class.getName())) { 25 | return Collections.emptyMap(); 26 | } 27 | 28 | //v0.29.2 - only characters whose formatted value differs are stored 29 | Map map = new HashMap<>(); 30 | for(int i = 0; i < text.length(); i++) { 31 | char c = text.charAt(i); 32 | char mc = wordFormat.format(c, context); 33 | 34 | if(c != mc) { 35 | map.put(c, mc); 36 | } 37 | } 38 | return map; 39 | } 40 | 41 | } 42 | -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/support/warmup/WordWarmUpDefault.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.support.warmup; 2 | 3 | import com.github.houbb.heaven.util.util.CollectionUtil; 4 | import com.github.houbb.sensitive.word.api.IWordContext; 5 | import com.github.houbb.sensitive.word.api.IWordWarmUp; 6 | import com.github.houbb.sensitive.word.bs.SensitiveWordBs; 7 | 8 | import java.util.List; 9 | 10 | /** 11 | * Default warm-up strategy: calls the main SensitiveWordBs operations a few times on a short sample text 12 | * @since 1.0.0 13 | */ 14 | public class WordWarmUpDefault implements IWordWarmUp { 15 | 16 | @Override 17 | public void warmUp(SensitiveWordBs sensitiveWordBs, IWordContext wordContext, List wordAllowList, List wordDenyList) { 18 | String testInfo = "sensitive-word"; 19 | if(CollectionUtil.isNotEmpty(wordAllowList)) { 20 | testInfo = testInfo + " " + wordAllowList.get(0); 21 | } 22 | if(CollectionUtil.isNotEmpty(wordDenyList)) { 23 
| testInfo = testInfo + " " + wordDenyList.get(0); 24 | } 25 | 26 | // this only reduces the cost of the first real call; it cannot eliminate it 27 | for(int i = 0; i < 5; i++) { 28 | sensitiveWordBs.findAll(testInfo); 29 | sensitiveWordBs.findFirst(testInfo); 30 | sensitiveWordBs.contains(testInfo); 31 | sensitiveWordBs.replace(testInfo); 32 | sensitiveWordBs.tags(testInfo); 33 | } 34 | } 35 | 36 | } 37 | -------------------------------------------------------------------------------- /src/test/java/com/github/houbb/sensitive/word/spring/SpringSensitiveWordConfig.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.spring; 2 | 3 | import com.github.houbb.sensitive.word.bs.SensitiveWordBs; 4 | import com.github.houbb.sensitive.word.spring.annotation.Autowired; 5 | import com.github.houbb.sensitive.word.spring.annotation.Bean; 6 | import com.github.houbb.sensitive.word.spring.annotation.Configuration; 7 | import com.github.houbb.sensitive.word.spring.database.MyDdWordAllow; 8 | import com.github.houbb.sensitive.word.spring.database.MyDdWordDeny; 9 | import com.github.houbb.sensitive.word.support.allow.WordAllows; 10 | 11 | /** 12 | * Example configuration class; the annotations are imported from this project's own test package (sensitive.word.spring.annotation), not from Spring 13 | * @author binbin.hou 14 | * @since 1.0.0 15 | */ 16 | @Configuration 17 | public class SpringSensitiveWordConfig { 18 | 19 | @Autowired 20 | private MyDdWordAllow myDdWordAllow; 21 | 22 | @Autowired 23 | private MyDdWordDeny myDdWordDeny; 24 | 25 | /** 26 | * Builds the bootstrap instance: the allow list chains the defaults with myDdWordAllow, the deny list is myDdWordDeny only 27 | * @return the initialised bootstrap instance 28 | * @since 1.0.0 29 | */ 30 | @Bean 31 | public SensitiveWordBs sensitiveWordBs() { 32 | SensitiveWordBs sensitiveWordBs = SensitiveWordBs.newInstance() 33 | .wordAllow(WordAllows.chains(WordAllows.defaults(), myDdWordAllow)) 34 | .wordDeny(myDdWordDeny) 35 | // any other configuration goes here 36 | .init(); 37 | 38 | return sensitiveWordBs; 39 | } 40 | 41 | } 42 | -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/support/combine/format/WordFormatCombine.java: 
-------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.support.combine.format; 2 | 3 | import com.github.houbb.heaven.util.guava.Guavas; 4 | import com.github.houbb.sensitive.word.api.IWordFormat; 5 | import com.github.houbb.sensitive.word.api.IWordContext; 6 | import com.github.houbb.sensitive.word.support.format.WordFormats; 7 | 8 | import java.util.List; 9 | 10 | /** 11 | * Collects the formats enabled on the context, in the fixed order English style, case, width, number style, Chinese style 12 | * @author d 13 | * @since 0.8.0 14 | */ 15 | public class WordFormatCombine extends AbstractWordFormatCombine { 16 | 17 | @Override 18 | protected List getWordFormatList(IWordContext context) { 19 | List charFormats = Guavas.newArrayList(); 20 | if(context.ignoreEnglishStyle()) { 21 | charFormats.add(WordFormats.ignoreEnglishStyle()); 22 | } 23 | if(context.ignoreCase()) { 24 | charFormats.add(WordFormats.ignoreCase()); 25 | } 26 | if(context.ignoreWidth()) { 27 | charFormats.add(WordFormats.ignoreWidth()); 28 | } 29 | if(context.ignoreNumStyle()) { 30 | charFormats.add(WordFormats.ignoreNumStyle()); 31 | } 32 | if(context.ignoreChineseStyle()) { 33 | charFormats.add(WordFormats.ignoreChineseStyle()); 34 | } 35 | 36 | return charFormats; 37 | } 38 | 39 | } 40 | -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/utils/InnerCharUtils.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.utils; 2 | 3 | /** 4 | * @since 0.17.0 5 | */ 6 | public class InnerCharUtils { 7 | 8 | /** 9 | * Converts a full-width character to its half-width form 10 | * @param original original character 11 | * @return the half-width character, or the original when it has no half-width form handled here 12 | * @since 0.29.2 13 | */ 14 | public static char toHalfWidth(char original) { 15 | // full-width (ideographic) space 16 | if (original == '\u3000') return ' '; 17 | // the other convertible full-width characters: U+FF01..U+FF5E map to U+0021..U+007E by subtracting 0xFEE0 18 | if (original >= '\uFF01' && original <= '\uFF5E') { 19 | return (char) (original - 0xFEE0); 20 | } 21 | // every other character is returned unchanged 22 | return original; 23 | } 24 | 25 | 26 | /** 27 | * Converts a string of decimal digits to an int 28 | * @param 
text digits to convert; not validated - no sign handling, no non-digit check and no overflow check; an empty string gives 0 29 | * @return the int value 30 | * @since 1.18.0 31 | */ 32 | public static int parseInt(String text) { 33 | int len = text.length(); 34 | 35 | int sum = 0; 36 | 37 | int weight = 1; 38 | for(int i = len-1; i >= 0; i--) { 39 | int val = getCharInt(text.charAt(i)); 40 | 41 | sum += weight * val; 42 | 43 | weight *= 10; 44 | } 45 | return sum; 46 | } 47 | 48 | /** 49 | * Returns the numeric value of a digit character, computed as c - '0' 50 | * @param c character; the result is only a digit value for '0'..'9' 51 | * @return c - '0' 52 | * @since 1.18.0 53 | */ 54 | public static int getCharInt(final char c) { 55 | return c - '0'; 56 | } 57 | 58 | } 59 | -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/api/IWordData.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.api; 2 | 3 | import com.github.houbb.sensitive.word.api.context.InnerSensitiveWordContext; 4 | import com.github.houbb.sensitive.word.constant.enums.WordValidModeEnum; 5 | import com.github.houbb.sensitive.word.constant.enums.WordContainsTypeEnum; 6 | 7 | import java.util.Collection; 8 | 9 | /** 10 | * Sensitive-word data store 11 | * @author binbin.hou 12 | * @since 0.0.1 13 | */ 14 | public interface IWordData extends ISensitiveWordDestroy { 15 | 16 | /** 17 | * Initialises the store with the given words 18 | * @param collection words to load 19 | * @since 0.0.1 20 | */ 21 | void initWordData(Collection collection); 22 | 23 | /** 24 | * Removes sensitive words 25 | * @param collection words to remove 26 | * @since 0.19.0 27 | */ 28 | void removeWord(Collection collection); 29 | 30 | /** 31 | * Adds sensitive words 32 | * @param collection words to add 33 | * @since 0.19.0 34 | */ 35 | void addWord(Collection collection); 36 | 37 | /** 38 | * Checks whether the buffered text is a sensitive word 39 | * @param stringBuilder buffer holding the text to check 40 | * @param innerContext context 41 | * @return the kind of match 42 | * @since 0.5.0 43 | * @see WordValidModeEnum#FAIL_FAST fail-fast mode is recommended 44 | */ 45 | WordContainsTypeEnum contains(final StringBuilder stringBuilder, 46 | final InnerSensitiveWordContext innerContext); 47 | 48 | } 49 | 
-------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/support/result/WordResultHandlerWordTags.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.support.result; 2 | 3 | import com.github.houbb.heaven.util.util.CollectionUtil; 4 | import com.github.houbb.sensitive.word.api.IWordContext; 5 | import com.github.houbb.sensitive.word.api.IWordResult; 6 | import com.github.houbb.sensitive.word.utils.InnerWordCharUtils; 7 | import com.github.houbb.sensitive.word.utils.InnerWordTagUtils; 8 | 9 | import java.util.Set; 10 | 11 | /** 12 | * Result handler that returns the matched word together with its tags 13 | * 14 | * @author binbin.hou 15 | * @since 0.12.0 16 | */ 17 | public class WordResultHandlerWordTags extends AbstractWordResultHandler { 18 | 19 | @Override 20 | protected WordTagsDto doHandle(IWordResult wordResult, IWordContext wordContext, String originalText) { 21 | WordTagsDto dto = new WordTagsDto(); 22 | 23 | // slice the matched span out of the original text 24 | String word = InnerWordCharUtils.getString(originalText, wordResult); 25 | 26 | // look the tags up with the sliced word 27 | Set wordTags = InnerWordTagUtils.tags(word, wordContext); 28 | 29 | // when nothing is found, retry with the word reported by the result (wordResult.word()) - v0.25.1 bug105 30 | if(CollectionUtil.isEmpty(wordTags)) { 31 | wordTags = InnerWordTagUtils.tags(wordResult.word(), wordContext); 32 | } 33 | 34 | dto.setWord(word); 35 | dto.setTags(wordTags); 36 | 37 | return dto; 38 | } 39 | 40 | } 41 | -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/support/tag/AbstractWordTagInit.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.support.tag; 2 | 3 | import com.github.houbb.heaven.support.pipeline.Pipeline; 4 | import com.github.houbb.heaven.support.pipeline.impl.DefaultPipeline; 5 | import com.github.houbb.heaven.util.util.CollectionUtil; 6 | import 
com.github.houbb.sensitive.word.api.IWordTag; 7 | 8 | import java.util.HashSet; 9 | import java.util.List; 10 | import java.util.Set; 11 | 12 | /** 13 | * Abstract word tag that combines several IWordTag implementations registered through a pipeline 14 | * 15 | * @since 0.24.0 16 | */ 17 | public abstract class AbstractWordTagInit extends AbstractWordTag { 18 | 19 | /** 20 | * Registers the IWordTag implementations to combine 21 | * 22 | * @param pipeline pipeline to add the implementations to 23 | * @since 0.24.0 24 | */ 25 | protected abstract void init(final Pipeline pipeline); 26 | 27 | /** Creates a new pipeline and calls init on every invocation, then returns the union of the non-empty tag sets of all registered implementations (an empty set when none has tags). */ @Override 28 | public Set doGetTag(String word) { 29 | Pipeline pipeline = new DefaultPipeline<>(); 30 | this.init(pipeline); 31 | 32 | Set resultSet = new HashSet<>(); 33 | List wordTagList = pipeline.list(); 34 | for (IWordTag wordTag : wordTagList) { 35 | Set tempTagSet = wordTag.getTag(word); 36 | if(CollectionUtil.isNotEmpty(tempTagSet)) { 37 | resultSet.addAll(tempTagSet); 38 | } 39 | } 40 | 41 | return resultSet; 42 | } 43 | 44 | } 45 | -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/support/allow/WordAllowInit.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.support.allow; 2 | 3 | import com.github.houbb.heaven.annotation.ThreadSafe; 4 | import com.github.houbb.heaven.support.pipeline.Pipeline; 5 | import com.github.houbb.heaven.support.pipeline.impl.DefaultPipeline; 6 | import com.github.houbb.sensitive.word.api.IWordAllow; 7 | 8 | import java.util.ArrayList; 9 | import java.util.List; 10 | 11 | /** 12 | * Allow list that concatenates the lists of several IWordAllow implementations registered through a pipeline 13 | * 14 | * @author binbin.hou 15 | * @since 0.0.13 16 | */ 17 | @ThreadSafe 18 | public abstract class WordAllowInit implements IWordAllow { 19 | 20 | /** 21 | * Registers the IWordAllow implementations to combine 22 | * 23 | * @param pipeline pipeline to add the implementations to 24 | * @since 0.0.13 25 | */ 26 | protected abstract void init(final Pipeline pipeline); 27 | 28 | @Override 29 | public List allow() { 30 | Pipeline pipeline = new DefaultPipeline<>(); 31 | this.init(pipeline); 32 | 33 | List results = new 
ArrayList<>(); 34 | List wordAllows = pipeline.list(); 35 | for (IWordAllow wordAllow : wordAllows) { 36 | List allowList = wordAllow.allow(); 37 | if (allowList == null) { 38 | allowList = new ArrayList<>(); 39 | } 40 | results.addAll(allowList); 41 | } 42 | 43 | return results; 44 | } 45 | 46 | } 47 | -------------------------------------------------------------------------------- /src/test/java/com/github/houbb/sensitive/word/data/DictNumTest.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.data; 2 | 3 | import com.github.houbb.heaven.util.io.FileUtil; 4 | import org.junit.Ignore; 5 | import org.junit.Test; 6 | 7 | import java.util.List; 8 | 9 | /** 10 | * Normalises the format of the dictionary data 11 | * @author binbin.hou 12 | * @since 0.0.5 13 | */ 14 | @Ignore 15 | public class DictNumTest { 16 | 17 | /** 18 | * Normalise the format. NOTE: source and target are the same file, so the dictionary is rewritten in place; the active code only de-duplicates and sorts (the number mapping is commented out). 19 | * 20 | * 1. convert all upper-case letters to lower case 21 | * 2. convert all full-width characters to half-width 22 | * 3. remove all whitespace and symbols (i.e. filter out every kind of symbol) 23 | * 4. convert traditional Chinese characters to simplified ones 24 | * @since 0.0.3 25 | */ 26 | @Test 27 | @Ignore 28 | public void formatTest() { 29 | final String sourceFile = "D:\\_github\\sensitive-word\\src\\main\\resources\\sensitive_word_dict.txt"; 30 | final String targetFile = "D:\\_github\\sensitive-word\\src\\main\\resources\\sensitive_word_dict.txt"; 31 | 32 | List words = FileUtil.readAllLines(sourceFile); 33 | // List formats = CollectionUtil.toList(words, new IHandler() { 34 | // @Override 35 | // public String handle(String string) { 36 | // // unified formatting of numbers 37 | // return NumUtils.getMappingString(string); 38 | // } 39 | // }); 40 | 41 | List resultList = DataUtil.disctinctAndSort(words); 42 | FileUtil.write(targetFile, resultList); 43 | } 44 | 45 | } 46 | -------------------------------------------------------------------------------- /release_rm.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | echo "============================= RELEASE START..." 
3 | 4 | ## Version information (must be set manually). oldVersion is the release whose branch is removed and whose version the pom is expected to carry; newVersion is the release branch to create. NOTE(review): projectName is passed to mvn as -DartifactId; "sisyphus" may be a leftover from another project - confirm 5 | oldVersion="1.0.0" 6 | newVersion="1.0.0" 7 | projectName="sisyphus" 8 | 9 | # delete the old release branch, locally and on origin 10 | oldBranchName="release_"${oldVersion} 11 | git branch -d ${oldBranchName} 12 | git push origin --delete ${oldBranchName} 13 | 14 | echo "1. Branch remove success..." 15 | 16 | # create the new release branch and push it 17 | newBranchName="release_"${newVersion} 18 | git branch ${newBranchName} 19 | git checkout ${newBranchName} 20 | git push --set-upstream origin ${newBranchName} 21 | 22 | echo "2. NEW BRANCH DONE." 23 | 24 | # change the version on the new branch; the previous text referenced an undefined release_version variable, oldVersion is the value defined in this script 25 | ## snapshot version 26 | snapshot_new_version=${newVersion}"-SNAPSHOT" 27 | mvn versions:set -DgroupId=com.github.houbb -DartifactId=${projectName} -DoldVersion=${oldVersion} -DnewVersion=${snapshot_new_version} 28 | mvn -N versions:update-child-modules 29 | mvn versions:commit 30 | 31 | git add . 32 | git commit -m "modify branch ${oldVersion} TO ${snapshot_new_version}" 33 | git push 34 | git status 35 | echo "3. MODIFY ${oldVersion} TO ${snapshot_new_version} DONE." 36 | 37 | echo "============================= BRANCH RE-CREATE END..." 38 | 39 | echo "============================= BRANCH LIST =============================" 40 | git branch -a 41 | 42 | # Usage: 43 | # WARNING: this script deletes a branch, use with care! 44 | # 1. Make it executable: chmod +x ./release_rm.sh 45 | # 2. 
Run: ./release_rm.sh 46 | # Last Update Time: 2018-06-21 11:10:42 47 | # Author: houbb -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/support/check/WordCheckNone.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.support.check; 2 | 3 | import com.github.houbb.heaven.annotation.ThreadSafe; 4 | import com.github.houbb.sensitive.word.api.IWordCheck; 5 | import com.github.houbb.sensitive.word.api.context.InnerSensitiveWordContext; 6 | import com.github.houbb.sensitive.word.constant.enums.WordTypeEnum; 7 | import com.github.houbb.sensitive.word.support.result.WordLengthResult; 8 | 9 | /** 10 | * Check that never matches: sensitiveCheck always returns the shared NONE_RESULT 11 | * 12 | * @author binbin.hou 13 | * @since 0.3.0 14 | */ 15 | @ThreadSafe 16 | public class WordCheckNone implements IWordCheck { 17 | 18 | /** 19 | * Shared instance returned by getInstance() 20 | * @since 0.3.0 21 | */ 22 | private static final IWordCheck INSTANCE = new WordCheckNone(); 23 | 24 | public static IWordCheck getInstance() { 25 | return INSTANCE; 26 | } 27 | 28 | /** 29 | * The single shared "no match" result: type DEFAULTS, a fresh WordLengthResult and this class as the check class 30 | */ 31 | private static final WordCheckResult NONE_RESULT = WordCheckResult.newInstance() 32 | .type(WordTypeEnum.DEFAULTS.getCode()) 33 | .wordLengthResult(WordLengthResult.newInstance()) 34 | .checkClass(WordCheckNone.class); 35 | 36 | public static WordCheckResult getNoneResult() { 37 | return NONE_RESULT; 38 | } 39 | 40 | @Override 41 | public WordCheckResult sensitiveCheck(int beginIndex, InnerSensitiveWordContext context) { 42 | return NONE_RESULT; 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /src/test/java/com/github/houbb/sensitive/word/data/StopWordTest.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.data; 2 | 3 | import com.github.houbb.heaven.support.condition.ICondition; 4 | import 
com.github.houbb.heaven.support.filter.IFilter; 5 | import com.github.houbb.heaven.util.io.FileUtil; 6 | import com.github.houbb.heaven.util.lang.StringUtil; 7 | import com.github.houbb.heaven.util.util.CharsetUtil; 8 | import com.github.houbb.heaven.util.util.CollectionUtil; 9 | import org.junit.Ignore; 10 | import org.junit.Test; 11 | 12 | import java.util.Collections; 13 | import java.util.List; 14 | 15 | /** 16 | * 停止词数据初始化 17 | * @author binbin.hou 18 | * @since 0.0.3 19 | */ 20 | @Ignore 21 | public class StopWordTest { 22 | 23 | /** 24 | * 中文测试 25 | * @since 0.0.3 26 | */ 27 | @Test 28 | @Ignore 29 | public void zhTest() { 30 | final String sourceFile = "stopword.txt"; 31 | final String targetFile = "D:\\github\\sensitive-word\\src\\main\\resources\\stopword_zh.txt"; 32 | 33 | List allLines = DataUtil.distinctLines(sourceFile); 34 | 35 | List zhLines = CollectionUtil.conditionList(allLines, new ICondition() { 36 | @Override 37 | public boolean condition(String s) { 38 | return CharsetUtil.isAllChinese(s); 39 | } 40 | }); 41 | 42 | FileUtil.write(targetFile, zhLines); 43 | } 44 | 45 | } 46 | -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/api/ISensitiveWord.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.api; 2 | 3 | import com.github.houbb.sensitive.word.constant.enums.WordValidModeEnum; 4 | 5 | import java.util.List; 6 | 7 | /** 8 | * 核心方法 9 | * @since 0.3.2 10 | */ 11 | public interface ISensitiveWord { 12 | 13 | /** 14 | * 返回所有对应的敏感词 15 | * @param string 原始字符串 16 | * @param context 上下文 17 | * @return 结果 18 | * @since 0.0.1 19 | * @see WordValidModeEnum#FAIL_OVER 建议使用全部检测返回模式 20 | */ 21 | List findAll(final String string, 22 | final IWordContext context); 23 | 24 | /** 25 | * 返回第一个对应的敏感词 26 | * @param string 原始字符串 27 | * @param context 上下文 28 | * @return 结果 29 | * @since 0.3.2 30 | */ 31 | 
IWordResult findFirst(final String string, 32 | final IWordContext context); 33 | 34 | /** 35 | * 替换所有敏感词内容 36 | * 37 | * ps: 这里可以添加优化。 38 | * 39 | * @param target 目标字符串 40 | * @param context 上下文 41 | * @return 替换后结果 42 | * @since 0.3.2 43 | */ 44 | String replace(final String target, 45 | final IWordContext context); 46 | 47 | /** 48 | * 包含 49 | * @param string 字符串 50 | * @param context 上下文 51 | * @return 结果 52 | * @since 0.3.2 53 | */ 54 | boolean contains(final String string, 55 | final IWordContext context); 56 | 57 | } 58 | -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/support/deny/WordDenyInit.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.support.deny; 2 | 3 | import com.github.houbb.heaven.annotation.ThreadSafe; 4 | import com.github.houbb.heaven.support.pipeline.Pipeline; 5 | import com.github.houbb.heaven.support.pipeline.impl.DefaultPipeline; 6 | import com.github.houbb.heaven.util.io.StreamUtil; 7 | import com.github.houbb.sensitive.word.api.IWordDeny; 8 | 9 | import java.util.ArrayList; 10 | import java.util.List; 11 | 12 | /** 13 | * 初始化类 14 | * 15 | * @author binbin.hou 16 | * @since 0.0.13 17 | */ 18 | @ThreadSafe 19 | public abstract class WordDenyInit implements IWordDeny { 20 | 21 | /** 22 | * 初始化列表 23 | * 24 | * @param pipeline 当前列表泳道 25 | * @since 0.0.13 26 | */ 27 | protected abstract void init(final Pipeline pipeline); 28 | 29 | @Override 30 | public List deny() { 31 | Pipeline pipeline = new DefaultPipeline<>(); 32 | this.init(pipeline); 33 | 34 | List results = new ArrayList<>(); 35 | List wordDenies = pipeline.list(); 36 | for (IWordDeny wordDeny : wordDenies) { 37 | List denyList = wordDeny.deny(); 38 | if (denyList == null) { 39 | denyList = new ArrayList<>(); 40 | } 41 | results.addAll(denyList); 42 | } 43 | 44 | return results; 45 | } 46 | 47 | } 48 | 
-------------------------------------------------------------------------------- /src/test/java/com/github/houbb/sensitive/word/memory/DataMemoryTest.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.memory; 2 | 3 | import com.github.houbb.heaven.util.io.StreamUtil; 4 | import com.github.houbb.sensitive.word.api.IWordData; 5 | import com.github.houbb.sensitive.word.support.data.WordDatas; 6 | import org.apache.lucene.util.RamUsageEstimator; 7 | import org.junit.Ignore; 8 | import org.junit.Test; 9 | 10 | import java.util.List; 11 | 12 | /** 13 | * 数据内存测试 14 | * 15 | * @since 0.7.0 16 | */ 17 | @Ignore 18 | public class DataMemoryTest { 19 | 20 | /** 21 | * 35.5 MB 22 | */ 23 | @Test 24 | public void hashMapTest() { 25 | List allLines = StreamUtil.readAllLines("/sensitive_word_dict.txt"); 26 | IWordData wordData = WordDatas.defaults(); 27 | 28 | wordData.initWordData(allLines); 29 | 30 | //计算指定对象及其引用树上的所有对象的综合大小,返回可读的结果,如:2KB 31 | String humanSize = RamUsageEstimator.humanSizeOf(wordData); 32 | System.out.println(humanSize); 33 | } 34 | 35 | 36 | //33.4 MB 37 | @Test 38 | public void treeTest() { 39 | List allLines = StreamUtil.readAllLines("/sensitive_word_dict.txt"); 40 | IWordData wordData = WordDatas.tree(); 41 | 42 | wordData.initWordData(allLines); 43 | 44 | //计算指定对象及其引用树上的所有对象的综合大小,返回可读的结果,如:2KB 45 | String humanSize = RamUsageEstimator.humanSizeOf(wordData); 46 | System.out.println(humanSize); 47 | } 48 | 49 | } 50 | -------------------------------------------------------------------------------- /src/test/java/com/github/houbb/sensitive/word/benchmark/CharUtilPerfTest.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.benchmark; 2 | 3 | import com.github.houbb.heaven.util.lang.CharUtil; 4 | import com.github.houbb.sensitive.word.utils.InnerCharUtils; 5 | 6 | public class CharUtilPerfTest { 7 | 8 | 9 
| private static final int COUNT = 10_00_000; 10 | 11 | public static void main(String[] args) { 12 | char[] testData = new char[COUNT]; 13 | for (int i = 0; i < COUNT; i++) { 14 | testData[i] = (char) ('A' + (i % 52)); // A-Z a-z 15 | } 16 | 17 | // 测试新小写 18 | // 测试原始半角 19 | char[] fullWidthData = new char[COUNT]; 20 | for (int i = 0; i < COUNT; i++) { 21 | fullWidthData[i] = (char) ('\uFF01' + (i % 94)); // 常见全角字符 22 | } 23 | 24 | long t5 = System.currentTimeMillis(); 25 | char sum3 = 0; 26 | for (char c : fullWidthData) { 27 | sum3 += CharUtil.toHalfWidth(c); 28 | } 29 | long t6 = System.currentTimeMillis(); 30 | System.out.println("原始 toHalfWidth 耗时: " + (t6 - t5) + "ms, sum=" + sum3); 31 | 32 | // 测试新半角 33 | long t7 = System.currentTimeMillis(); 34 | char sum4 = 0; 35 | for (char c : fullWidthData) { 36 | sum4 += InnerCharUtils.toHalfWidth(c); 37 | } 38 | long t8 = System.currentTimeMillis(); 39 | System.out.println("优化 toHalfWidth 耗时: " + (t8 - t7) + "ms, sum=" + sum4); 40 | } 41 | 42 | 43 | } 44 | -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/support/check/WordCheckNum.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.support.check; 2 | 3 | import com.github.houbb.heaven.annotation.ThreadSafe; 4 | import com.github.houbb.sensitive.word.api.IWordCheck; 5 | import com.github.houbb.sensitive.word.api.context.InnerSensitiveWordContext; 6 | import com.github.houbb.sensitive.word.constant.enums.WordTypeEnum; 7 | 8 | /** 9 | * 敏感词监测实现 10 | * 11 | * 这里可以提供一个公共的父类。 12 | * @author binbin.hou 13 | * @since 0.0.5 14 | */ 15 | @ThreadSafe 16 | public class WordCheckNum extends AbstractConditionWordCheck { 17 | 18 | /** 19 | * @since 0.3.0 20 | */ 21 | private static final IWordCheck INSTANCE = new WordCheckNum(); 22 | 23 | public static IWordCheck getInstance() { 24 | return INSTANCE; 25 | } 26 | 27 | @Override 
28 | protected Class getSensitiveCheckClass() { 29 | return WordCheckNum.class; 30 | } 31 | 32 | @Override 33 | protected String getType() { 34 | return WordTypeEnum.NUM.getCode(); 35 | } 36 | 37 | @Override 38 | protected boolean isCharCondition(char mappingChar, int index, InnerSensitiveWordContext checkContext) { 39 | return Character.isDigit(mappingChar); 40 | } 41 | 42 | @Override 43 | protected boolean isStringCondition(int index, StringBuilder stringBuilder, InnerSensitiveWordContext checkContext) { 44 | int bufferLen = stringBuilder.length(); 45 | return bufferLen >= checkContext.wordContext().sensitiveCheckNumLen(); 46 | } 47 | 48 | } 49 | -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/support/resultcondition/WordResultConditionInit.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.support.resultcondition; 2 | 3 | import com.github.houbb.heaven.support.pipeline.Pipeline; 4 | import com.github.houbb.heaven.support.pipeline.impl.DefaultPipeline; 5 | import com.github.houbb.sensitive.word.api.IWordContext; 6 | import com.github.houbb.sensitive.word.api.IWordResult; 7 | import com.github.houbb.sensitive.word.api.IWordResultCondition; 8 | import com.github.houbb.sensitive.word.constant.enums.WordValidModeEnum; 9 | 10 | import java.util.List; 11 | 12 | /** 13 | * 结果条件的的初始化类 14 | * 15 | * @since 0.23.0 16 | */ 17 | public abstract class WordResultConditionInit extends AbstractWordResultCondition { 18 | 19 | /** 20 | * 初始化列表 21 | * 22 | * @param pipeline 当前列表泳道 23 | * @since 0.0.13 24 | */ 25 | protected abstract void init(final Pipeline pipeline); 26 | 27 | @Override 28 | protected boolean doMatch(IWordResult wordResult, String text, WordValidModeEnum modeEnum, IWordContext context) { 29 | Pipeline pipeline = new DefaultPipeline<>(); 30 | this.init(pipeline); 31 | List conditionList = pipeline.list(); 32 
| 33 | // 必须满足所有 34 | for(IWordResultCondition wordResultCondition : conditionList) { 35 | if(!wordResultCondition.match(wordResult, text, modeEnum, context)) { 36 | return false; 37 | } 38 | } 39 | 40 | return true; 41 | } 42 | 43 | } 44 | -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/support/deny/WordDenys.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.support.deny; 2 | 3 | import com.github.houbb.heaven.support.pipeline.Pipeline; 4 | import com.github.houbb.heaven.util.util.ArrayUtil; 5 | import com.github.houbb.sensitive.word.api.IWordDeny; 6 | 7 | /** 8 | * 所有拒绝的结果 9 | * @author binbin.hou 10 | * @since 0.0.13 11 | */ 12 | public final class WordDenys { 13 | 14 | private WordDenys(){} 15 | 16 | /** 17 | * 责任链 18 | * @param wordDeny 拒绝 19 | * @param others 其他 20 | * @return 结果 21 | * @since 0.0.13 22 | */ 23 | public static IWordDeny chains(final IWordDeny wordDeny, 24 | final IWordDeny... 
others) { 25 | return new WordDenyInit() { 26 | @Override 27 | protected void init(Pipeline pipeline) { 28 | pipeline.addLast(wordDeny); 29 | 30 | if(ArrayUtil.isNotEmpty(others)) { 31 | for(IWordDeny other : others) { 32 | pipeline.addLast(other); 33 | } 34 | } 35 | } 36 | }; 37 | } 38 | 39 | /** 40 | * 系统实现 41 | * @return 结果 42 | * @since 0.0.13 43 | */ 44 | public static IWordDeny defaults() { 45 | return WordDenySystem.getInstance(); 46 | } 47 | 48 | /** 49 | * 空实现 50 | * @return 结果 51 | * @since 0.19.13 52 | */ 53 | public static IWordDeny empty() { 54 | return new WordDenyEmpty(); 55 | } 56 | 57 | 58 | } 59 | -------------------------------------------------------------------------------- /src/test/java/com/github/houbb/sensitive/word/define/SensitiveWordBsDefineTest.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.define; 2 | 3 | import com.github.houbb.sensitive.word.api.IWordAllow; 4 | import com.github.houbb.sensitive.word.api.IWordDeny; 5 | import com.github.houbb.sensitive.word.bs.SensitiveWordBs; 6 | import com.github.houbb.sensitive.word.support.allow.WordAllows; 7 | import com.github.houbb.sensitive.word.support.deny.WordDenys; 8 | import org.junit.Assert; 9 | import org.junit.Test; 10 | 11 | /** 12 | * @author binbin.hou 13 | * @since 1.0.0 14 | */ 15 | public class SensitiveWordBsDefineTest { 16 | 17 | @Test 18 | public void defineDenyTest() { 19 | String text = "这是一个测试,我的自定义敏感词。"; 20 | 21 | SensitiveWordBs wordBs = SensitiveWordBs.newInstance() 22 | .wordDeny(new MyWordDeny()) 23 | .wordAllow(new MyWordAllow()) 24 | .init(); 25 | 26 | Assert.assertEquals("[我的自定义敏感词]", wordBs.findAll(text).toString()); 27 | } 28 | 29 | @Test 30 | public void defineChainsTest() { 31 | String text = "这是一个测试。我的自定义敏感词。"; 32 | 33 | IWordDeny wordDeny = WordDenys.chains(WordDenys.defaults(), new MyWordDeny()); 34 | IWordAllow wordAllow = WordAllows.chains(WordAllows.defaults(), new 
MyWordAllow()); 35 | 36 | SensitiveWordBs wordBs = SensitiveWordBs.newInstance() 37 | .wordDeny(wordDeny) 38 | .wordAllow(wordAllow) 39 | .init(); 40 | 41 | Assert.assertEquals("[我的自定义敏感词]", wordBs.findAll(text).toString()); 42 | } 43 | 44 | } 45 | -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/support/allow/WordAllows.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.support.allow; 2 | 3 | import com.github.houbb.heaven.support.pipeline.Pipeline; 4 | import com.github.houbb.heaven.util.util.ArrayUtil; 5 | import com.github.houbb.sensitive.word.api.IWordAllow; 6 | 7 | /** 8 | * 所有允许的结果 9 | * @author binbin.hou 10 | * @since 0.0.13 11 | */ 12 | public final class WordAllows { 13 | 14 | private WordAllows(){} 15 | 16 | /** 17 | * 责任链 18 | * @param wordAllow 允许 19 | * @param others 其他 20 | * @return 结果 21 | * @since 0.0.13 22 | */ 23 | public static IWordAllow chains(final IWordAllow wordAllow, 24 | final IWordAllow... 
others) { 25 | return new WordAllowInit() { 26 | @Override 27 | protected void init(Pipeline pipeline) { 28 | pipeline.addLast(wordAllow); 29 | 30 | if(ArrayUtil.isNotEmpty(others)) { 31 | for(IWordAllow other : others) { 32 | pipeline.addLast(other); 33 | } 34 | } 35 | } 36 | }; 37 | } 38 | 39 | /** 40 | * 系统实现 41 | * @return 结果 42 | * @since 0.0.13 43 | */ 44 | public static IWordAllow defaults() { 45 | return WordAllowSystem.getInstance(); 46 | } 47 | 48 | 49 | /** 50 | * 空实现,可测试用 51 | * @return 结果 52 | * @since 0.19.0 53 | */ 54 | public static IWordAllow empty() { 55 | return new WordAllowEmpty(); 56 | } 57 | 58 | } 59 | -------------------------------------------------------------------------------- /src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsEmailTest.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.bs; 2 | 3 | import org.junit.Assert; 4 | import org.junit.Test; 5 | 6 | import java.util.List; 7 | 8 | /** 9 | *

project: sensitive-word-SensitiveWordBsTest

10 | *

create on 2020/1/7 23:43

11 | * 12 | * @author Administrator 13 | * @since 0.0.9 14 | */ 15 | public class SensitiveWordBsEmailTest { 16 | 17 | /** 18 | * 邮箱测试 19 | * @since 0.0.9 20 | */ 21 | @Test 22 | public void emailEnglishTest() { 23 | final String text = "楼主好人,邮箱 sensitiveword@xx.com"; 24 | 25 | List wordList = SensitiveWordBs.newInstance() 26 | .enableEmailCheck(true) 27 | .init() 28 | .findAll(text); 29 | Assert.assertEquals("[sensitiveword@xx.com]", wordList.toString()); 30 | } 31 | 32 | /** 33 | * 邮箱测试 34 | * @since 0.0.9 35 | */ 36 | @Test 37 | public void emailNumberTest() { 38 | final String text = "楼主好人,邮箱 123456789@xx.com"; 39 | 40 | List wordList = SensitiveWordBs.newInstance() 41 | .enableEmailCheck(true) 42 | .init() 43 | .findAll(text); 44 | Assert.assertEquals("[123456789@xx.com]", wordList.toString()); 45 | } 46 | 47 | @Test 48 | public void emailTest() { 49 | final String text = "你我.他你"; 50 | List wordList = SensitiveWordBs.newInstance().init().findAll(text); 51 | Assert.assertEquals("[]", wordList.toString()); 52 | } 53 | 54 | } 55 | -------------------------------------------------------------------------------- /src/test/java/com/github/houbb/sensitive/word/data/DataUtil.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.data; 2 | 3 | import com.github.houbb.heaven.util.io.FileUtil; 4 | import com.github.houbb.heaven.util.util.CollectionUtil; 5 | import org.junit.Ignore; 6 | import org.junit.Test; 7 | 8 | import java.util.Collection; 9 | import java.util.Collections; 10 | import java.util.List; 11 | 12 | /** 13 | * @author binbin.hou 14 | * @since 0.0.3 15 | */ 16 | public class DataUtil { 17 | 18 | /** 19 | * 获取对应文件的独一无二内容 20 | * @param name 名称 21 | * @return 结果 22 | * @since 0.0.1 23 | */ 24 | public static List distinctLines(final String name) { 25 | final String dir = "D:\\github\\sensitive-word\\src\\main\\resources\\"; 26 | final String path = dir + name; 27 | List lines = 
FileUtil.readAllLines(path); 28 | return CollectionUtil.distinct(lines); 29 | } 30 | 31 | public static List disctinctAndSort(final Collection collection) { 32 | List stringList = CollectionUtil.distinct(collection); 33 | Collections.sort(stringList); 34 | 35 | return stringList; 36 | } 37 | 38 | @Test 39 | @Ignore 40 | public void singleCharTest() { 41 | final String path = "D:\\github\\sensitive-word\\src\\main\\resources\\sensitive_word_dict.txt"; 42 | 43 | List stringList = FileUtil.readAllLines(path); 44 | for(String s : stringList) { 45 | if(s.length() == 1) { 46 | System.out.println(s); 47 | } 48 | } 49 | } 50 | 51 | } 52 | -------------------------------------------------------------------------------- /src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsUrlTest.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.bs; 2 | 3 | import org.junit.Assert; 4 | import org.junit.Test; 5 | 6 | import java.util.List; 7 | 8 | /** 9 | *

project: sensitive-word-SensitiveWordBsTest

10 | *

create on 2020/1/7 23:43

11 | * 12 | * @author Administrator 13 | * @since 0.0.12 14 | */ 15 | public class SensitiveWordBsUrlTest { 16 | 17 | /** 18 | * 忽略中文繁简体 19 | * @since 0.0.12 20 | */ 21 | @Test 22 | public void commonUrlTest() { 23 | final String text = "点击链接 https://www.baidu.com 查看答案"; 24 | 25 | final SensitiveWordBs sensitiveWordBs = SensitiveWordBs.newInstance().enableUrlCheck(true).init(); 26 | List wordList = sensitiveWordBs.findAll(text); 27 | Assert.assertEquals("[https://www.baidu.com]", wordList.toString()); 28 | 29 | Assert.assertEquals("点击链接 ********************* 查看答案", sensitiveWordBs.replace(text)); 30 | } 31 | 32 | /** 33 | * 图片测试 34 | * 35 | * (1)可以检测 36 | * (2)默认不替换 37 | * 38 | * @since 0.0.12 39 | */ 40 | @Test 41 | public void imageUrlTest() { 42 | final String text = "双击查看大图 http://www.big-image.png 查看"; 43 | 44 | final SensitiveWordBs sensitiveWordBs = SensitiveWordBs.newInstance() 45 | .enableUrlCheck(true) 46 | .init(); 47 | List wordList = sensitiveWordBs.findAll(text); 48 | Assert.assertEquals("[http://www.big-image.png]", wordList.toString()); 49 | 50 | Assert.assertEquals("双击查看大图 ************************ 查看", sensitiveWordBs.replace(text)); 51 | } 52 | 53 | } 54 | -------------------------------------------------------------------------------- /src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsIgnoreCharTest.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.bs; 2 | 3 | import com.github.houbb.sensitive.word.support.ignore.SensitiveWordCharIgnores; 4 | import org.junit.Assert; 5 | import org.junit.Test; 6 | 7 | import java.util.List; 8 | 9 | /** 10 | *

project: sensitive-word-SensitiveWordBsTest

11 | *

create on 2020/1/7 23:43

12 | * 13 | * @author Administrator 14 | * @since 0.11.0 15 | */ 16 | public class SensitiveWordBsIgnoreCharTest { 17 | 18 | /** 19 | * 忽略中文繁简体 20 | * @since 0.0.6 21 | */ 22 | @Test 23 | public void ignoreChineseStyleTest() { 24 | final String text = "傻@冒,狗+东西"; 25 | 26 | //默认因为有特殊字符分割,无法识别 27 | List wordList = SensitiveWordBs.newInstance().init().findAll(text); 28 | Assert.assertEquals("[]", wordList.toString()); 29 | 30 | // 指定忽略的字符策略,可自行实现。 31 | List wordList2 = SensitiveWordBs.newInstance() 32 | .charIgnore(SensitiveWordCharIgnores.specialChars()) 33 | .init() 34 | .findAll(text); 35 | 36 | Assert.assertEquals("[傻@冒, 狗+东西]", wordList2.toString()); 37 | } 38 | 39 | //https://github.com/houbb/sensitive-word/issues/68 40 | @Test 41 | public void ignoreChineseStyleTest2() { 42 | final String text = "

傻逼

"; 43 | 44 | // 指定忽略的字符策略,可自行实现。 45 | List wordList2 = SensitiveWordBs.newInstance() 46 | .charIgnore(SensitiveWordCharIgnores.specialChars()) 47 | .init() 48 | .findAll(text); 49 | 50 | Assert.assertEquals("[傻逼]", wordList2.toString()); 51 | } 52 | 53 | } 54 | -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/support/resultcondition/WordResultConditionWordTagsMatch.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.support.resultcondition; 2 | 3 | import com.github.houbb.heaven.util.common.ArgUtil; 4 | import com.github.houbb.heaven.util.util.CollectionUtil; 5 | import com.github.houbb.sensitive.word.api.IWordContext; 6 | import com.github.houbb.sensitive.word.api.IWordResult; 7 | import com.github.houbb.sensitive.word.api.IWordTag; 8 | import com.github.houbb.sensitive.word.constant.enums.WordValidModeEnum; 9 | 10 | import java.util.Collection; 11 | import java.util.Set; 12 | 13 | /** 14 | * 结果标签匹配的条件 15 | * 16 | * @since 0.23.0 17 | */ 18 | public class WordResultConditionWordTagsMatch extends AbstractWordResultCondition { 19 | 20 | /** 21 | * 指定标签的集合 22 | */ 23 | private final Collection tags; 24 | 25 | public WordResultConditionWordTagsMatch(Collection tags) { 26 | ArgUtil.notEmpty(tags, "tags"); 27 | 28 | this.tags = tags; 29 | } 30 | 31 | @Override 32 | protected boolean doMatch(IWordResult wordResult, String text, WordValidModeEnum modeEnum, IWordContext context) { 33 | // 判断对应的标签 34 | String word = text.substring(wordResult.startIndex(), wordResult.endIndex()); 35 | final IWordTag wordTag = context.wordTag(); 36 | Set wordTags = wordTag.getTag(word); 37 | 38 | // 在指定的 tag 中 39 | if(CollectionUtil.isEmpty(wordTags)) { 40 | return false; 41 | } 42 | 43 | for(String tag : tags) { 44 | if(wordTags.contains(tag)) { 45 | return true; 46 | } 47 | } 48 | 49 | return false; 50 | } 51 | 52 | } 53 | 
-------------------------------------------------------------------------------- /src/test/java/com/github/houbb/sensitive/word/data/NumUtilTest.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.data; 2 | 3 | import org.junit.Ignore; 4 | import org.junit.Test; 5 | 6 | import java.util.Arrays; 7 | import java.util.List; 8 | 9 | /** 10 | * @author binbin.hou 11 | * @since 0.0.11 12 | */ 13 | @Ignore 14 | public class NumUtilTest { 15 | 16 | @Test 17 | public void groupNumTest() { 18 | String nums = "123456789" + 19 | "一二三四五六七八九" + 20 | "壹贰叁肆伍陆柒捌玖" + 21 | "¹²³⁴⁵⁶⁷⁸⁹" + 22 | "₁₂₃₄₅₆₇₈₉" + 23 | "①②③④⑤⑥⑦⑧⑨" + 24 | "⑴⑵⑶⑷⑸⑹⑺⑻⑼" + 25 | "⒈⒉⒊⒋⒌⒍⒎⒏⒐" + 26 | "❶❷❸❹❺❻❼❽❾" + 27 | "➀➁➂➃➄➅➆➇➈" + 28 | "➊➋➌➍➎➏➐➑➒" + 29 | "㈠㈡㈢㈣㈤㈥㈦㈧㈨" + 30 | "⓵⓶⓷⓸⓹⓺⓻⓼⓽" + 31 | "㊀㊁㊂㊃㊄㊅㊆㊇㊈" + 32 | "ⅰⅱⅲⅳⅴⅵⅶⅷⅸ" + 33 | "ⅠⅡⅢⅣⅤⅥⅦⅧⅨ"; 34 | 35 | for(int l = 0; l < 9; l++) { 36 | for(int i = 0; i < 16; i++) { 37 | System.out.print(nums.charAt(i*9+l)+" "); 38 | } 39 | System.out.println(); 40 | } 41 | 42 | } 43 | 44 | 45 | @Test 46 | public void groupEnglishTest() { 47 | List lines = Arrays.asList("ⒶⒷⒸⒹⒺⒻⒼⒽⒾⒿⓀⓁⓂⓃⓄⓅⓆⓇⓈⓉⓊⓋⓌⓍⓎⓏ", 48 | "ⓐⓑⓒⓓⓔⓕⓖⓗⓘⓙⓚⓛⓜⓝⓞⓟⓠⓡⓢⓣⓤⓥⓦⓧⓨⓩ", 49 | "⒜⒝⒞⒟⒠⒡⒢⒣⒤⒥⒦⒧⒨⒩⒪⒫⒬⒭⒮⒯⒰⒱⒲⒳⒴⒵"); 50 | for(int i = 0; i < 26; i++) { 51 | System.out.print(lines.get(0).charAt(i)+" "); 52 | System.out.print(lines.get(1).charAt(i)+" "); 53 | System.out.print(lines.get(2).charAt(i)); 54 | System.out.println(); 55 | } 56 | 57 | } 58 | 59 | } 60 | -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/support/check/WordCheckArray.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.support.check; 2 | 3 | import com.github.houbb.heaven.util.common.ArgUtil; 4 | import com.github.houbb.sensitive.word.api.IWordCheck; 5 | import com.github.houbb.sensitive.word.api.context.InnerSensitiveWordContext; 6 
| import com.github.houbb.sensitive.word.support.result.WordLengthResult; 7 | 8 | import java.util.List; 9 | 10 | /** 11 | * 集合 12 | * @author binbin.hou 13 | * @since 0.30.0 14 | */ 15 | public class WordCheckArray implements IWordCheck { 16 | 17 | private final IWordCheck[] sensitiveChecks; 18 | private final int size; 19 | public WordCheckArray(List sensitiveChecks) { 20 | ArgUtil.notEmpty(sensitiveChecks, "sensitiveChecks"); 21 | 22 | this.size = sensitiveChecks.size(); 23 | this.sensitiveChecks = new IWordCheck[size]; 24 | for(int i = 0; i < size; i++) { 25 | this.sensitiveChecks[i] = sensitiveChecks.get(i); 26 | } 27 | } 28 | 29 | @Override 30 | public WordCheckResult sensitiveCheck(int beginIndex, InnerSensitiveWordContext checkContext) { 31 | // 循环调用 32 | for(int i = 0; i < size; i++) { 33 | IWordCheck sensitiveCheck = sensitiveChecks[i]; 34 | WordCheckResult result = sensitiveCheck.sensitiveCheck(beginIndex, checkContext); 35 | 36 | WordLengthResult wordLengthResult = result.wordLengthResult(); 37 | if(wordLengthResult.wordAllowLen() > 0 || wordLengthResult.wordDenyLen()> 0) { 38 | return result; 39 | } 40 | } 41 | 42 | // 这里直接进行正则表达式相关的调用。 43 | // 默认返回 0 44 | return WordCheckNone.getNoneResult(); 45 | } 46 | 47 | } 48 | -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/support/check/WordCheckInit.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.support.check; 2 | 3 | import com.github.houbb.heaven.support.pipeline.Pipeline; 4 | import com.github.houbb.heaven.support.pipeline.impl.DefaultPipeline; 5 | import com.github.houbb.sensitive.word.api.IWordCheck; 6 | import com.github.houbb.sensitive.word.api.context.InnerSensitiveWordContext; 7 | import com.github.houbb.sensitive.word.support.result.WordLengthResult; 8 | 9 | import java.util.List; 10 | 11 | /** 12 | * 检测初始化类 13 | * @since 0.3.0 14 | */ 15 
| @Deprecated 16 | public abstract class WordCheckInit implements IWordCheck { 17 | 18 | /** 19 | * 初始化列表 20 | * 21 | * @param pipeline 当前列表泳道 22 | * @since 0.0.13 23 | */ 24 | protected abstract void init(final Pipeline pipeline); 25 | 26 | 27 | @Override 28 | public WordCheckResult sensitiveCheck(final int beginIndex, 29 | final InnerSensitiveWordContext checkContext) { 30 | 31 | Pipeline pipeline = new DefaultPipeline<>(); 32 | this.init(pipeline); 33 | List sensitiveChecks = pipeline.list(); 34 | 35 | // 循环调用 36 | for(IWordCheck sensitiveCheck : sensitiveChecks) { 37 | WordCheckResult result = sensitiveCheck.sensitiveCheck(beginIndex, checkContext); 38 | 39 | WordLengthResult wordLengthResult = result.wordLengthResult(); 40 | if(wordLengthResult.wordAllowLen() > 0 || wordLengthResult.wordDenyLen()> 0) { 41 | return result; 42 | } 43 | } 44 | 45 | // 这里直接进行正则表达式相关的调用。 46 | // 默认返回 0 47 | return WordCheckNone.getNoneResult(); 48 | } 49 | 50 | } 51 | -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/support/resultcondition/WordResultConditionEnglishWordNumMatch.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.support.resultcondition; 2 | 3 | import com.github.houbb.heaven.util.lang.CharUtil; 4 | import com.github.houbb.sensitive.word.api.IWordContext; 5 | import com.github.houbb.sensitive.word.api.IWordResult; 6 | import com.github.houbb.sensitive.word.constant.enums.WordValidModeEnum; 7 | 8 | /** 9 | * 英文单词和数字必须要全词匹配 10 | * 11 | * https://github.com/houbb/sensitive-word/issues/77 12 | * 13 | * @since 0.20.0 14 | */ 15 | public class WordResultConditionEnglishWordNumMatch extends AbstractWordResultCondition { 16 | 17 | @Override 18 | protected boolean doMatch(IWordResult wordResult, String text, WordValidModeEnum modeEnum, IWordContext context) { 19 | final int startIndex = wordResult.startIndex(); 20 | 
final int endIndex = wordResult.endIndex(); 21 | // 判断处理,判断前一个字符是否为英文。如果是,则不满足 22 | if(startIndex > 0) { 23 | char preC = text.charAt(startIndex-1); 24 | if(CharUtil.isDigitOrLetter(preC)) { 25 | return false; 26 | } 27 | } 28 | 29 | // 判断后一个字符是否为英文 30 | // v0.19.1 修正 cp cpm 单个字符错误命中问题 31 | if(endIndex < text.length()) { 32 | char afterC = text.charAt(endIndex); 33 | if(CharUtil.isDigitOrLetter(afterC)) { 34 | return false; 35 | } 36 | } 37 | 38 | // 判断当前是否为英文单词 39 | for(int i = startIndex; i < endIndex; i++) { 40 | char c = text.charAt(i); 41 | if(!CharUtil.isDigitOrLetter(c)) { 42 | return true; 43 | } 44 | } 45 | 46 | return true; 47 | } 48 | 49 | 50 | } 51 | -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/support/resultcondition/WordResultConditionEnglishWordMatch.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.support.resultcondition; 2 | 3 | import com.github.houbb.heaven.util.lang.CharUtil; 4 | import com.github.houbb.heaven.util.util.CharsetUtil; 5 | import com.github.houbb.sensitive.word.api.IWordContext; 6 | import com.github.houbb.sensitive.word.api.IWordResult; 7 | import com.github.houbb.sensitive.word.constant.enums.WordValidModeEnum; 8 | 9 | /** 10 | * 英文单词必须要全词匹配 11 | * 12 | * https://github.com/houbb/sensitive-word/issues/45 13 | * 14 | * @since 0.13.0 15 | */ 16 | public class WordResultConditionEnglishWordMatch extends AbstractWordResultCondition { 17 | 18 | @Override 19 | protected boolean doMatch(IWordResult wordResult, String text, WordValidModeEnum modeEnum, IWordContext context) { 20 | final int startIndex = wordResult.startIndex(); 21 | final int endIndex = wordResult.endIndex(); 22 | // 判断处理,判断前一个字符是否为英文。如果是,则不满足 23 | if(startIndex > 0) { 24 | char preC = text.charAt(startIndex-1); 25 | if(CharUtil.isEnglish(preC)) { 26 | return false; 27 | } 28 | } 29 | 30 | // 判断后一个字符是否为英文 31 | 
// v0.19.1 修正 cp cpm 单个字符错误命中问题 32 | if(endIndex < text.length()) { 33 | char afterC = text.charAt(endIndex); 34 | if(CharUtil.isEnglish(afterC)) { 35 | return false; 36 | } 37 | } 38 | 39 | // 判断当前是否为英文单词 40 | for(int i = startIndex; i < endIndex; i++) { 41 | char c = text.charAt(i); 42 | if(!CharUtil.isEnglish(c)) { 43 | return true; 44 | } 45 | } 46 | 47 | return true; 48 | } 49 | 50 | 51 | } 52 | -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/support/combine/allowdeny/AbstractWordAllowDenyCombine.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.support.combine.allowdeny; 2 | 3 | import com.github.houbb.heaven.util.util.CollectionUtil; 4 | import com.github.houbb.sensitive.word.api.IWordAllow; 5 | import com.github.houbb.sensitive.word.api.IWordContext; 6 | import com.github.houbb.sensitive.word.api.IWordDeny; 7 | import com.github.houbb.sensitive.word.api.combine.IWordAllowDenyCombine; 8 | import com.github.houbb.sensitive.word.utils.InnerWordFormatUtils; 9 | 10 | import java.util.Collection; 11 | import java.util.Collections; 12 | import java.util.List; 13 | 14 | /** 15 | * @author d 16 | * @since 0.8.0 17 | */ 18 | public abstract class AbstractWordAllowDenyCombine implements IWordAllowDenyCombine { 19 | 20 | protected abstract Collection doGetActualDenyList(List allowList, 21 | List denyList, 22 | IWordContext context); 23 | 24 | @Override 25 | public Collection getActualDenyList(final List allowList, 26 | final List denyList, 27 | IWordContext context) { 28 | List formatAllowList = InnerWordFormatUtils.formatWordList(allowList, context); 29 | List formatDenyList = InnerWordFormatUtils.formatWordList(denyList, context); 30 | 31 | if (CollectionUtil.isEmpty(formatDenyList)) { 32 | return Collections.emptyList(); 33 | } 34 | if (CollectionUtil.isEmpty(formatAllowList)) { 35 | return formatDenyList; 
36 | } 37 | 38 | return doGetActualDenyList(formatAllowList, formatDenyList, context); 39 | } 40 | 41 | } 42 | -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/support/data/WordDataTreeNode.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.support.data; 2 | 3 | import com.github.houbb.sensitive.word.api.ISensitiveWordDestroy; 4 | 5 | import java.util.HashMap; 6 | import java.util.Map; 7 | 8 | /** 9 | * 树节点 10 | * 11 | * @since 0.7.0 12 | */ 13 | public class WordDataTreeNode implements ISensitiveWordDestroy { 14 | 15 | /** 16 | * 关键词结束标识 17 | */ 18 | private boolean end; 19 | 20 | /** 21 | * 子节点(key是下级字符,value是下级节点) 22 | */ 23 | private Map subNodeMap; 24 | 25 | public boolean end() { 26 | return end; 27 | } 28 | 29 | public WordDataTreeNode end(boolean end) { 30 | this.end = end; 31 | return this; 32 | } 33 | 34 | public WordDataTreeNode getSubNode(final Character c) { 35 | if(subNodeMap == null) { 36 | return null; 37 | } 38 | 39 | return subNodeMap.get(c); 40 | } 41 | public int getNodeSize() { 42 | if (subNodeMap == null) { 43 | return 0; 44 | } 45 | return subNodeMap.size(); 46 | } 47 | 48 | public void clearNode() { 49 | if (subNodeMap == null) { 50 | return; 51 | } 52 | subNodeMap=null; 53 | } 54 | 55 | public void removeNode(final Character c) { 56 | if (subNodeMap == null) { 57 | return; 58 | } 59 | subNodeMap.remove(c); 60 | } 61 | 62 | public WordDataTreeNode addSubNode(Character c, WordDataTreeNode subNode) { 63 | if(this.subNodeMap == null) { 64 | subNodeMap = new HashMap<>(); 65 | } 66 | 67 | subNodeMap.put(c, subNode); 68 | return this; 69 | } 70 | 71 | @Override 72 | public void destroy() { 73 | if(subNodeMap != null) { 74 | subNodeMap.clear(); 75 | } 76 | } 77 | 78 | } 79 | -------------------------------------------------------------------------------- 
/src/main/java/com/github/houbb/sensitive/word/support/result/WordResult.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.support.result; 2 | 3 | import com.github.houbb.sensitive.word.api.IWordResult; 4 | 5 | /** 6 | * @author binbin.hou 7 | * @since 0.1.0 8 | */ 9 | public class WordResult implements IWordResult { 10 | 11 | private int startIndex; 12 | 13 | private int endIndex; 14 | 15 | /** 16 | * 词类别 17 | * @since 0.14.0 18 | */ 19 | private String type; 20 | 21 | /** 22 | * 单词匹配 23 | * @since 0.25.0 24 | */ 25 | private String word; 26 | 27 | private WordResult(){} 28 | 29 | public static WordResult newInstance() { 30 | return new WordResult(); 31 | } 32 | 33 | @Override 34 | public int startIndex() { 35 | return startIndex; 36 | } 37 | 38 | public WordResult startIndex(int startIndex) { 39 | this.startIndex = startIndex; 40 | return this; 41 | } 42 | 43 | @Override 44 | public int endIndex() { 45 | return endIndex; 46 | } 47 | 48 | public WordResult endIndex(int endIndex) { 49 | this.endIndex = endIndex; 50 | return this; 51 | } 52 | 53 | @Override 54 | public String type() { 55 | return type; 56 | } 57 | 58 | public WordResult type(String type) { 59 | this.type = type; 60 | return this; 61 | } 62 | 63 | @Override 64 | public String word() { 65 | return word; 66 | } 67 | 68 | public WordResult word(String word) { 69 | this.word = word; 70 | return this; 71 | } 72 | 73 | @Override 74 | public String toString() { 75 | return "WordResult{" + 76 | "startIndex=" + startIndex + 77 | ", endIndex=" + endIndex + 78 | ", type='" + type + '\'' + 79 | ", word='" + word + '\'' + 80 | '}'; 81 | } 82 | 83 | } 84 | -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/support/check/WordCheckResult.java: -------------------------------------------------------------------------------- 1 | package 
com.github.houbb.sensitive.word.support.check; 2 | 3 | import com.github.houbb.sensitive.word.api.IWordCheck; 4 | import com.github.houbb.sensitive.word.support.result.WordLengthResult; 5 | 6 | /** 7 | * 敏感信息监测接口结果 8 | * 9 | * 可以使用责任链的模式,循环调用。 10 | * @author binbin.hou 11 | * @since 0.0.12 12 | */ 13 | public class WordCheckResult { 14 | 15 | /** 16 | * 命中的黑白名单的长度对象 17 | */ 18 | private WordLengthResult wordLengthResult; 19 | 20 | /** 21 | * 检测类 22 | * @since 0.0.12 23 | */ 24 | private Class checkClass; 25 | 26 | /** 27 | * 单词类别 28 | * @since 0.14.0 29 | */ 30 | private String type; 31 | 32 | private WordCheckResult(){} 33 | 34 | public static WordCheckResult newInstance() { 35 | return new WordCheckResult(); 36 | } 37 | 38 | public WordLengthResult wordLengthResult() { 39 | return wordLengthResult; 40 | } 41 | 42 | public WordCheckResult wordLengthResult(WordLengthResult wordLengthResult) { 43 | this.wordLengthResult = wordLengthResult; 44 | return this; 45 | } 46 | 47 | public Class checkClass() { 48 | return checkClass; 49 | } 50 | 51 | public WordCheckResult checkClass(Class checkClass) { 52 | this.checkClass = checkClass; 53 | return this; 54 | } 55 | 56 | public String type() { 57 | return type; 58 | } 59 | 60 | public WordCheckResult type(String type) { 61 | this.type = type; 62 | return this; 63 | } 64 | 65 | @Override 66 | public String toString() { 67 | return "WordCheckResult{" + 68 | "wordLengthResult=" + wordLengthResult + 69 | ", checkClass=" + checkClass + 70 | ", type='" + type + '\'' + 71 | '}'; 72 | } 73 | 74 | } 75 | -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/api/context/InnerSensitiveWordContext.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.api.context; 2 | 3 | import com.github.houbb.sensitive.word.api.IWordContext; 4 | import 
com.github.houbb.sensitive.word.constant.enums.WordValidModeEnum; 5 | 6 | import java.util.Map; 7 | 8 | /** 9 | * 内部信息上下文 10 | * 11 | * @author binbin.hou 12 | * @since 0.6.0 13 | */ 14 | public class InnerSensitiveWordContext { 15 | 16 | /** 17 | * 原始文本 18 | */ 19 | private String originalText; 20 | /** 21 | * 格式化后的字符 22 | */ 23 | private Map formatCharMapping; 24 | /** 25 | * 校验模式 26 | */ 27 | private WordValidModeEnum modeEnum; 28 | /** 29 | * 原始上下文 30 | */ 31 | private IWordContext wordContext; 32 | 33 | public static InnerSensitiveWordContext newInstance() { 34 | return new InnerSensitiveWordContext(); 35 | } 36 | 37 | public String originalText() { 38 | return originalText; 39 | } 40 | 41 | public InnerSensitiveWordContext originalText(String text) { 42 | this.originalText = text; 43 | return this; 44 | } 45 | 46 | public Map formatCharMapping() { 47 | return formatCharMapping; 48 | } 49 | 50 | public InnerSensitiveWordContext formatCharMapping(Map formatCharMapping) { 51 | this.formatCharMapping = formatCharMapping; 52 | return this; 53 | } 54 | 55 | public WordValidModeEnum modeEnum() { 56 | return modeEnum; 57 | } 58 | 59 | public InnerSensitiveWordContext modeEnum(WordValidModeEnum modeEnum) { 60 | this.modeEnum = modeEnum; 61 | return this; 62 | } 63 | 64 | public IWordContext wordContext() { 65 | return wordContext; 66 | } 67 | 68 | public InnerSensitiveWordContext wordContext(IWordContext context) { 69 | this.wordContext = context; 70 | return this; 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/support/tag/WordTagLines.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.support.tag; 2 | 3 | import com.github.houbb.heaven.util.common.ArgUtil; 4 | import com.github.houbb.heaven.util.lang.StringUtil; 5 | import com.github.houbb.sensitive.word.api.IWordTag; 6 | 7 | import 
java.util.*; 8 | 9 | /** 10 | * 根据标准的行来处理 11 | * 12 | * 行规范: 13 | * 14 | * 单词 标签1,标签2 15 | * 16 | * @since 0.24.0 17 | */ 18 | public class WordTagLines extends AbstractWordTag { 19 | 20 | private final IWordTag wordTag; 21 | 22 | /** 23 | * 词和标签的分隔符 24 | */ 25 | private final String wordSplit; 26 | /** 27 | * 标签的分隔符 28 | */ 29 | private final String tagSplit; 30 | 31 | public WordTagLines(Collection lines, 32 | final String wordSplit, 33 | final String tagSplit) { 34 | ArgUtil.notNull(lines, "lines"); 35 | ArgUtil.notEmpty(wordSplit, "wordSplit"); 36 | ArgUtil.notEmpty(tagSplit, "tagSplit"); 37 | 38 | this.wordSplit = wordSplit; 39 | this.tagSplit = tagSplit; 40 | 41 | Map> wordTagMap = buildWordTagMap(lines); 42 | wordTag = WordTags.map(wordTagMap); 43 | } 44 | 45 | public WordTagLines(Collection lines) { 46 | this(lines, " ", ","); 47 | } 48 | 49 | private Map> buildWordTagMap(final Collection lines) { 50 | Map> wordTagMap = new HashMap<>(); 51 | 52 | for(String line : lines) { 53 | String[] strings = line.split(wordSplit); 54 | String key = strings[0]; 55 | Set tags = new HashSet<>(StringUtil.splitToList(strings[1], tagSplit)); 56 | wordTagMap.put(key, tags); 57 | } 58 | return wordTagMap; 59 | } 60 | 61 | @Override 62 | protected Set doGetTag(String word) { 63 | return wordTag.getTag(word); 64 | } 65 | 66 | } 67 | -------------------------------------------------------------------------------- /src/test/java/com/github/houbb/sensitive/word/support/resultcondition/WordTagsTest.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.support.resultcondition; 2 | 3 | import com.github.houbb.sensitive.word.api.IWordDeny; 4 | import com.github.houbb.sensitive.word.bs.SensitiveWordBs; 5 | import com.github.houbb.sensitive.word.support.allow.WordAllows; 6 | import org.junit.Assert; 7 | import org.junit.Test; 8 | 9 | import java.util.Arrays; 10 | import java.util.List; 11 | 12 | public class 
/**
 * Lengths and matched values found for the allow list and the deny list.
 *
 * <p>Both lists are evaluated in a single traversal (a performance optimisation),
 * so the two outcomes travel together in this holder. All setters are fluent and
 * return the same instance.
 *
 * @since 0.24.2
 */
public class WordLengthResult {

    /**
     * Allow-list match length.
     */
    private int wordAllowLen;

    /**
     * Allow-list value that actually matched.
     * @since 0.25.1
     */
    private String wordAllow;

    /**
     * Deny-list match length.
     */
    private int wordDenyLen;

    /**
     * Deny-list word that matched.
     * @since 0.25.1
     */
    private String wordDeny;

    /**
     * @return a fresh result with zero lengths and no matched values
     */
    public static WordLengthResult newInstance() {
        return new WordLengthResult();
    }

    /**
     * @return allow-list match length
     */
    public int wordAllowLen() {
        return wordAllowLen;
    }

    /**
     * @param wordAllowLen allow-list match length
     * @return this result, for chaining
     */
    public WordLengthResult wordAllowLen(int wordAllowLen) {
        this.wordAllowLen = wordAllowLen;
        return this;
    }

    /**
     * @return deny-list match length
     */
    public int wordDenyLen() {
        return wordDenyLen;
    }

    /**
     * @param wordDenyLen deny-list match length
     * @return this result, for chaining
     */
    public WordLengthResult wordDenyLen(int wordDenyLen) {
        this.wordDenyLen = wordDenyLen;
        return this;
    }

    /**
     * @return deny-list word that matched, may be {@code null}
     */
    public String wordDeny() {
        return this.wordDeny;
    }

    /**
     * @param wordDeny deny-list word that matched
     * @return this result, for chaining
     */
    public WordLengthResult wordDeny(String wordDeny) {
        this.wordDeny = wordDeny;
        return this;
    }

    /**
     * @return allow-list value that matched, may be {@code null}
     */
    public String wordAllow() {
        return this.wordAllow;
    }

    /**
     * @param wordAllow allow-list value that matched
     * @return this result, for chaining
     */
    public WordLengthResult wordAllow(String wordAllow) {
        this.wordAllow = wordAllow;
        return this;
    }

    @Override
    public String toString() {
        final StringBuilder text = new StringBuilder("WordLengthResult{");
        text.append("wordAllowLen=").append(wordAllowLen);
        text.append(", wordDenyLen=").append(wordDenyLen);
        text.append(", wordDeny='").append(wordDeny).append('\'');
        text.append(", wordAllow='").append(wordAllow).append('\'');
        text.append('}');
        return text.toString();
    }

}
PUSH TO MAVEN CENTER DONE." 34 | 35 | # 合并到 master 分支 36 | branchName="release_"${version} # 分支名称 37 | git checkout master 38 | git pull 39 | git checkout ${branchName} 40 | git rebase master 41 | git checkout master 42 | git merge ${branchName} 43 | git push 44 | 45 | echo "4. MERGE TO MASTER DONE." 46 | 47 | 48 | # 拉取新的分支 49 | newBranchName="release_"${newVersion} 50 | git branch ${newBranchName} 51 | git checkout ${newBranchName} 52 | git push --set-upstream origin ${newBranchName} 53 | 54 | echo "5. NEW BRANCH DONE." 55 | 56 | # 修改新分支的版本号 57 | ## snapshot 版本号 58 | snapshot_new_version=${newVersion}"-SNAPSHOT" 59 | mvn versions:set -DgroupId=com.github.houbb -DartifactId=${projectName} -DoldVersion=${release_version} -DnewVersion=${snapshot_new_version} 60 | mvn -N versions:update-child-modules 61 | mvn versions:commit 62 | 63 | git add . 64 | git commit -m "modify branch ${release_version} TO ${snapshot_new_version}" 65 | git push 66 | git status 67 | echo "6. MODIFY ${release_version} TO ${snapshot_new_version} DONE." 68 | 69 | echo "============================= RELEASE END..." 70 | 71 | 72 | # 使用方式: 73 | # 1. 赋值权限: chmod +x ./release.sh 74 | # 2. 
执行: ./release.sh 75 | # Last Update Time: 2018-01-20 13:17:06 76 | # Author: houbb 77 | 78 | 79 | -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/support/format/WordFormatIgnoreEnglishStyleC2C.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.support.format; 2 | 3 | import com.github.houbb.heaven.annotation.ThreadSafe; 4 | import com.github.houbb.sensitive.word.api.IWordContext; 5 | import com.github.houbb.sensitive.word.api.IWordFormat; 6 | import com.github.houbb.sensitive.word.collection.Char2CharMap; 7 | 8 | /** 9 | * 忽略英文的各种格式 10 | * @author binbin.hou 11 | * @since 0.0.6 12 | */ 13 | @ThreadSafe 14 | public class WordFormatIgnoreEnglishStyleC2C implements IWordFormat { 15 | 16 | private static final IWordFormat INSTANCE = new WordFormatIgnoreEnglishStyleC2C(); 17 | 18 | public static IWordFormat getInstance() { 19 | return INSTANCE; 20 | } 21 | 22 | /** 23 | * 英文字母1 24 | * @since 0.0.4 25 | */ 26 | private static final String LETTERS_ONE = 27 | "ⒶⒷⒸⒹⒺⒻⒼⒽⒾⒿⓀⓁⓂⓃⓄⓅⓆⓇⓈⓉⓊⓋⓌⓍⓎⓏ" + 28 | "ⓐⓑⓒⓓⓔⓕⓖⓗⓘⓙⓚⓛⓜⓝⓞⓟⓠⓡⓢⓣⓤⓥⓦⓧⓨⓩ" + 29 | "⒜⒝⒞⒟⒠⒡⒢⒣⒤⒥⒦⒧⒨⒩⒪⒫⒬⒭⒮⒯⒰⒱⒲⒳⒴⒵"; 30 | 31 | /** 32 | * 英文字母2 33 | * @since 0.0.4 34 | */ 35 | private static final String LETTERS_TWO = 36 | "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + 37 | "abcdefghijklmnopqrstuvwxyz" + 38 | "abcdefghijklmnopqrstuvwxyz"; 39 | 40 | 41 | /** 42 | * 字母映射表 43 | */ 44 | private static final Char2CharMap LETTER_MAP = new Char2CharMap(LETTERS_ONE.length()); 45 | 46 | static { 47 | final int size = LETTERS_ONE.length(); 48 | for(int i = 0; i < size; i++) { 49 | LETTER_MAP.put(LETTERS_ONE.charAt(i), LETTERS_TWO.charAt(i)); 50 | } 51 | } 52 | 53 | /** 54 | * 映射后的 char 55 | * @param c 待转换的 char 56 | * @return 转换结果 57 | * @since 0.29.x 58 | */ 59 | private char getMappingChar(final char c) { 60 | char mc = LETTER_MAP.get(c); 61 | return mc == 0 ? 
c : mc; 62 | } 63 | 64 | @Override 65 | public char format(char original, IWordContext context) { 66 | return getMappingChar(original); 67 | } 68 | 69 | } 70 | -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/support/format/WordFormatIgnoreEnglishStyle.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.support.format; 2 | 3 | import com.github.houbb.heaven.annotation.ThreadSafe; 4 | import com.github.houbb.sensitive.word.api.IWordContext; 5 | import com.github.houbb.sensitive.word.api.IWordFormat; 6 | 7 | import java.util.HashMap; 8 | import java.util.Map; 9 | 10 | /** 11 | * 忽略英文的各种格式 12 | * @author binbin.hou 13 | * @since 0.0.6 14 | */ 15 | @Deprecated 16 | @ThreadSafe 17 | public class WordFormatIgnoreEnglishStyle implements IWordFormat { 18 | 19 | private static final IWordFormat INSTANCE = new WordFormatIgnoreEnglishStyle(); 20 | 21 | public static IWordFormat getInstance() { 22 | return INSTANCE; 23 | } 24 | 25 | /** 26 | * 英文字母1 27 | * @since 0.0.4 28 | */ 29 | private static final String LETTERS_ONE = 30 | "ⒶⒷⒸⒹⒺⒻⒼⒽⒾⒿⓀⓁⓂⓃⓄⓅⓆⓇⓈⓉⓊⓋⓌⓍⓎⓏ" + 31 | "ⓐⓑⓒⓓⓔⓕⓖⓗⓘⓙⓚⓛⓜⓝⓞⓟⓠⓡⓢⓣⓤⓥⓦⓧⓨⓩ" + 32 | "⒜⒝⒞⒟⒠⒡⒢⒣⒤⒥⒦⒧⒨⒩⒪⒫⒬⒭⒮⒯⒰⒱⒲⒳⒴⒵"; 33 | 34 | /** 35 | * 英文字母2 36 | * @since 0.0.4 37 | */ 38 | private static final String LETTERS_TWO = 39 | "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + 40 | "abcdefghijklmnopqrstuvwxyz" + 41 | "abcdefghijklmnopqrstuvwxyz"; 42 | 43 | 44 | /** 45 | * 字母映射表 46 | */ 47 | private static final Map LETTER_MAP = new HashMap<>(LETTERS_ONE.length()); 48 | 49 | static { 50 | final int size = LETTERS_ONE.length(); 51 | for(int i = 0; i < size; i++) { 52 | LETTER_MAP.put(LETTERS_ONE.charAt(i), LETTERS_TWO.charAt(i)); 53 | } 54 | } 55 | 56 | /** 57 | * 映射后的 char 58 | * @param c 待转换的 char 59 | * @return 转换结果 60 | * @since 0.29.x 61 | */ 62 | private char getMappingChar(final char c) { 63 | Character mapChar = 
LETTER_MAP.get(c); 64 | return mapChar == null ? c : mapChar; 65 | } 66 | 67 | 68 | @Override 69 | public char format(char original, IWordContext context) { 70 | return getMappingChar(original); 71 | } 72 | 73 | } 74 | -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/support/format/WordFormats.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.support.format; 2 | 3 | import com.github.houbb.heaven.util.util.ArrayUtil; 4 | import com.github.houbb.heaven.util.util.CollectionUtil; 5 | import com.github.houbb.sensitive.word.api.IWordFormat; 6 | 7 | import java.util.ArrayList; 8 | import java.util.List; 9 | 10 | /** 11 | * 格式化工具类 12 | * @author binbin.hou 13 | * @since 0.3.5 14 | */ 15 | public final class WordFormats { 16 | 17 | private WordFormats(){} 18 | 19 | /** 20 | * 链式 21 | * @param charFormats 列表 22 | * @return 结果 23 | */ 24 | public static IWordFormat chains(final IWordFormat... 
charFormats) { 25 | if(ArrayUtil.isEmpty(charFormats)) { 26 | return none(); 27 | } 28 | 29 | List wordFormats = new ArrayList<>(charFormats.length); 30 | return array(wordFormats); 31 | } 32 | 33 | /** 34 | * 链式 35 | * @param charFormats 列表 36 | * @return 结果 37 | */ 38 | public static IWordFormat chains(final List charFormats) { 39 | if(CollectionUtil.isEmpty(charFormats)) { 40 | return none(); 41 | } 42 | 43 | return array(charFormats); 44 | } 45 | 46 | public static IWordFormat none() { 47 | return WordFormatNone.getInstance(); 48 | } 49 | public static IWordFormat ignoreCase() { 50 | return WordFormatIgnoreCase.getInstance(); 51 | } 52 | 53 | public static IWordFormat ignoreEnglishStyle() { 54 | return WordFormatIgnoreEnglishStyleC2C.getInstance(); 55 | } 56 | 57 | public static IWordFormat ignoreChineseStyle() { 58 | return WordFormatIgnoreChineseStyle.getInstance(); 59 | } 60 | 61 | public static IWordFormat ignoreNumStyle() { 62 | return WordFormatIgnoreNumStyleC2C.getInstance(); 63 | } 64 | 65 | public static IWordFormat ignoreWidth() { 66 | return WordFormatIgnoreWidth.getInstance(); 67 | } 68 | 69 | public static IWordFormat array(final List wordFormats) { 70 | return new WordFormatArray(wordFormats); 71 | } 72 | 73 | } 74 | -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/support/check/WordCheckEmail.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.support.check; 2 | 3 | import com.github.houbb.heaven.annotation.ThreadSafe; 4 | import com.github.houbb.heaven.util.lang.CharUtil; 5 | import com.github.houbb.heaven.util.util.regex.RegexUtil; 6 | import com.github.houbb.sensitive.word.api.IWordCheck; 7 | import com.github.houbb.sensitive.word.api.context.InnerSensitiveWordContext; 8 | import com.github.houbb.sensitive.word.constant.WordConst; 9 | import 
com.github.houbb.sensitive.word.constant.enums.WordTypeEnum; 10 | 11 | /** 12 | * email 正则表达式检测实现。 13 | * 14 | * TODO: 这里暂时不实现邮箱后缀的实现。 15 | * 16 | * (1)命中结果应该有标记,属于哪一个验证模式命中 17 | * (2)后期优化方案可以是: 18 | * 如果数字后面紧跟的是邮箱后缀命中,则直接连接起来 num+email-suffix; 19 | * (3)邮箱后缀的去重 20 | * 邮箱后缀可以只处理为和 Num 构建,如果没有直接丢弃的模式。 21 | * 22 | * 也可以严格的保留下来。 23 | * @author binbin.hou 24 | * @since 0.0.9 25 | */ 26 | @ThreadSafe 27 | public class WordCheckEmail extends AbstractConditionWordCheck { 28 | 29 | /** 30 | * @since 0.3.0 31 | */ 32 | private static final IWordCheck INSTANCE = new WordCheckEmail(); 33 | 34 | public static IWordCheck getInstance() { 35 | return INSTANCE; 36 | } 37 | 38 | @Override 39 | protected Class getSensitiveCheckClass() { 40 | return WordCheckEmail.class; 41 | } 42 | 43 | @Override 44 | protected String getType() { 45 | return WordTypeEnum.EMAIL.getCode(); 46 | } 47 | 48 | @Override 49 | protected boolean isCharCondition(char mappingChar, int index, InnerSensitiveWordContext checkContext) { 50 | return CharUtil.isEmilChar(mappingChar); 51 | } 52 | 53 | @Override 54 | protected boolean isStringCondition(int index, StringBuilder stringBuilder, InnerSensitiveWordContext checkContext) { 55 | int bufferLen = stringBuilder.length(); 56 | 57 | //x@a.cn 58 | if(bufferLen < 6) { 59 | return false; 60 | } 61 | if(bufferLen > WordConst.MAX_EMAIL_LEN) { 62 | return false; 63 | } 64 | 65 | String string = stringBuilder.toString(); 66 | return RegexUtil.isEmail(string); 67 | } 68 | 69 | } 70 | -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/support/check/AbstractWordCheck.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.support.check; 2 | 3 | import com.github.houbb.heaven.annotation.ThreadSafe; 4 | import com.github.houbb.heaven.util.lang.StringUtil; 5 | import com.github.houbb.sensitive.word.api.IWordCheck; 6 | import 
com.github.houbb.sensitive.word.api.context.InnerSensitiveWordContext; 7 | import com.github.houbb.sensitive.word.support.result.WordLengthResult; 8 | 9 | /** 10 | * 抽象实现策略 11 | * 12 | * @author binbin.hou 13 | * @since 0.4.0 14 | */ 15 | @ThreadSafe 16 | public abstract class AbstractWordCheck implements IWordCheck { 17 | 18 | /** 19 | * 获取校验类 20 | * @return 类 21 | * @since 0.3.2 22 | */ 23 | protected abstract Class getSensitiveCheckClass(); 24 | 25 | /** 26 | * 获取确切的长度 27 | * @param beginIndex 开始 28 | * @param checkContext 上下文 29 | * @return 长度 30 | * @since 0.4.0 31 | */ 32 | protected abstract WordLengthResult getActualLength(int beginIndex, final InnerSensitiveWordContext checkContext); 33 | 34 | /** 35 | * 获取类别 36 | * @return 类别 37 | * @since 0.14.0 38 | */ 39 | protected abstract String getType(); 40 | 41 | @Override 42 | public WordCheckResult sensitiveCheck(int beginIndex, 43 | final InnerSensitiveWordContext checkContext) { 44 | Class clazz = getSensitiveCheckClass(); 45 | final String txt = checkContext.originalText(); 46 | WordLengthResult wordLengthResult = WordLengthResult.newInstance() 47 | .wordAllowLen(0) 48 | .wordDenyLen(0); 49 | 50 | if(StringUtil.isEmpty(txt)) { 51 | return WordCheckResult.newInstance() 52 | .wordLengthResult(wordLengthResult) 53 | .type(getType()) 54 | .checkClass(clazz); 55 | } 56 | 57 | wordLengthResult = getActualLength(beginIndex, checkContext); 58 | 59 | return WordCheckResult.newInstance() 60 | .wordLengthResult(wordLengthResult) 61 | .type(getType()) 62 | .checkClass(clazz) 63 | ; 64 | } 65 | 66 | } 67 | -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/support/check/WordChecks.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.support.check; 2 | 3 | import com.github.houbb.heaven.util.util.ArrayUtil; 4 | import com.github.houbb.heaven.util.util.CollectionUtil; 5 
| import com.github.houbb.sensitive.word.api.IWordCheck; 6 | 7 | import java.util.ArrayList; 8 | import java.util.List; 9 | 10 | /** 11 | * 敏感词检测工具 12 | * @since 0.3.0 13 | */ 14 | public final class WordChecks { 15 | 16 | private WordChecks(){} 17 | 18 | public static IWordCheck chains(final IWordCheck... sensitiveChecks) { 19 | if (ArrayUtil.isEmpty(sensitiveChecks)){ 20 | return none(); 21 | } 22 | 23 | List wordChecks = new ArrayList<>(sensitiveChecks.length); 24 | return array(wordChecks); 25 | } 26 | 27 | public static IWordCheck chains(final List sensitiveChecks) { 28 | if (CollectionUtil.isEmpty(sensitiveChecks)){ 29 | return none(); 30 | } 31 | 32 | return array(sensitiveChecks); 33 | } 34 | 35 | public static IWordCheck email() { 36 | return WordCheckEmail.getInstance(); 37 | } 38 | 39 | public static IWordCheck num() { 40 | return WordCheckNum.getInstance(); 41 | } 42 | 43 | public static IWordCheck url() { 44 | return WordCheckUrl.getInstance(); 45 | } 46 | 47 | public static IWordCheck word() { 48 | return WordCheckWord.getInstance(); 49 | } 50 | 51 | public static IWordCheck none() { 52 | return WordCheckNone.getInstance(); 53 | } 54 | 55 | /** 56 | * ipv4 校验 57 | * @since 0.17.0 58 | * @return 实现 59 | */ 60 | public static IWordCheck ipv4() { 61 | return WordCheckIPV4.getInstance(); 62 | } 63 | 64 | /** 65 | * 不需要前缀的 urlPrefix 66 | * 注意:这种检测方法可能会和代码中的包名称冲突 67 | * 68 | * @return 实现 69 | * @since 0.25.0 70 | */ 71 | public static IWordCheck urlNoPrefix() { 72 | return WordCheckUrlNoPrefix.getInstance(); 73 | } 74 | 75 | /** 76 | * 集合 77 | * 78 | * @return 实现 79 | * @since 0.30.0 80 | */ 81 | public static IWordCheck array(final List wordChecks) { 82 | return new WordCheckArray(wordChecks); 83 | } 84 | 85 | } 86 | -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/support/data/AbstractWordData.java: 
-------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.support.data; 2 | 3 | import com.github.houbb.heaven.util.util.CollectionUtil; 4 | import com.github.houbb.sensitive.word.api.IWordData; 5 | import com.github.houbb.sensitive.word.api.context.InnerSensitiveWordContext; 6 | import com.github.houbb.sensitive.word.constant.enums.WordContainsTypeEnum; 7 | 8 | import java.util.Collection; 9 | 10 | /** 11 | * 抽象数据 12 | * 13 | * @since 0.7.0 14 | */ 15 | public abstract class AbstractWordData implements IWordData { 16 | 17 | /** 18 | * 是否包含 19 | * @param stringBuilder 字符 20 | * @param innerContext 上下文 21 | * @return 结果 22 | */ 23 | protected abstract WordContainsTypeEnum doContains(StringBuilder stringBuilder, InnerSensitiveWordContext innerContext); 24 | 25 | /** 26 | * 初始化 27 | * @param collection 数据 28 | */ 29 | protected abstract void doInitWordData(Collection collection); 30 | 31 | /** 32 | * 删除敏感词 33 | * @param collection 集合 34 | */ 35 | protected abstract void doRemoveWord(Collection collection); 36 | 37 | /** 38 | * 新增敏感词 39 | * @param collection 敏感词 40 | */ 41 | protected abstract void doAddWord(Collection collection); 42 | 43 | @Override 44 | public void initWordData(Collection collection) { 45 | //1. 
预留 46 | 47 | this.doInitWordData(collection); 48 | } 49 | 50 | @Override 51 | public void removeWord(Collection collection) { 52 | if(CollectionUtil.isEmpty(collection)) { 53 | return; 54 | } 55 | 56 | doRemoveWord(collection); 57 | } 58 | 59 | @Override 60 | public void addWord(Collection collection) { 61 | if(CollectionUtil.isEmpty(collection)) { 62 | return; 63 | } 64 | 65 | doAddWord(collection); 66 | } 67 | 68 | @Override 69 | public WordContainsTypeEnum contains(StringBuilder stringBuilder, InnerSensitiveWordContext innerContext) { 70 | if(stringBuilder == null 71 | || stringBuilder.length() <= 0) { 72 | return WordContainsTypeEnum.NOT_FOUND; 73 | } 74 | 75 | return doContains(stringBuilder, innerContext); 76 | } 77 | 78 | } 79 | -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/support/resultcondition/WordResultConditions.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.support.resultcondition; 2 | 3 | import com.github.houbb.heaven.support.pipeline.Pipeline; 4 | import com.github.houbb.heaven.util.common.ArgUtil; 5 | import com.github.houbb.heaven.util.util.ArrayUtil; 6 | import com.github.houbb.sensitive.word.api.IWordResultCondition; 7 | 8 | import java.util.List; 9 | 10 | /** 11 | * 匹配结果工具类 12 | * 13 | * @since 0.13.0 14 | */ 15 | public final class WordResultConditions { 16 | 17 | /** 18 | * 恒为真 19 | * @return 结果 20 | */ 21 | public static IWordResultCondition alwaysTrue() { 22 | return new WordResultConditionAlwaysTrue(); 23 | } 24 | 25 | /** 26 | * 如果是英文,则必须全词匹匹配 27 | * @return 结果 28 | * @since 0.13.0 29 | */ 30 | public static IWordResultCondition englishWordMatch() { 31 | return new WordResultConditionEnglishWordMatch(); 32 | } 33 | 34 | /** 35 | * 如果是英文或者数字,则必须全词匹匹配 36 | * @return 结果 37 | * @since 0.20.0 38 | */ 39 | public static IWordResultCondition englishWordNumMatch() { 40 | return new 
WordResultConditionEnglishWordNumMatch(); 41 | } 42 | 43 | /** 44 | * 单词标签 45 | * @param tags 标签列表 46 | * @return 结果 47 | * @since 0.23.0 48 | */ 49 | public static IWordResultCondition wordTags(List tags) { 50 | ArgUtil.notEmpty(tags, "tags"); 51 | 52 | return new WordResultConditionWordTagsMatch(tags); 53 | } 54 | 55 | /** 56 | * 链式调用,支持同时满足多个条件 57 | * 58 | * @since 0.23.0 59 | * @param condition 条件 60 | * @param others 其他条件 61 | * @return 结果 62 | */ 63 | public static IWordResultCondition chains(final IWordResultCondition condition, final IWordResultCondition ... others) { 64 | return new WordResultConditionInit() { 65 | @Override 66 | protected void init(Pipeline pipeline) { 67 | pipeline.addLast(condition); 68 | if(ArrayUtil.isNotEmpty(others)) { 69 | for(IWordResultCondition other : others) { 70 | pipeline.addLast(other); 71 | } 72 | } 73 | } 74 | }; 75 | } 76 | 77 | 78 | } 79 | -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/support/check/WordCheckUrl.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.support.check; 2 | 3 | import com.github.houbb.heaven.annotation.ThreadSafe; 4 | import com.github.houbb.heaven.util.lang.CharUtil; 5 | import com.github.houbb.heaven.util.util.regex.RegexUtil; 6 | import com.github.houbb.sensitive.word.api.IWordCheck; 7 | import com.github.houbb.sensitive.word.api.context.InnerSensitiveWordContext; 8 | import com.github.houbb.sensitive.word.constant.WordConst; 9 | import com.github.houbb.sensitive.word.constant.enums.WordTypeEnum; 10 | 11 | /** 12 | * URL 正则表达式检测实现。 13 | * 14 | * 也可以严格的保留下来。 15 | * 16 | * (1)暂时先粗略的处理 web-site 17 | * (2)如果网址的最后为图片类型,则跳过。 18 | * (3)长度超过 70,直接结束。 19 | * 20 | * @author binbin.hou 21 | * @since 0.0.9 22 | */ 23 | @ThreadSafe 24 | public class WordCheckUrl extends AbstractConditionWordCheck { 25 | 26 | /** 27 | * @since 0.3.0 28 | */ 29 | 
private static final IWordCheck INSTANCE = new WordCheckUrl(); 30 | 31 | public static IWordCheck getInstance() { 32 | return INSTANCE; 33 | } 34 | 35 | @Override 36 | protected Class getSensitiveCheckClass() { 37 | return WordCheckUrl.class; 38 | } 39 | 40 | @Override 41 | protected String getType() { 42 | return WordTypeEnum.URL.getCode(); 43 | } 44 | 45 | @Override 46 | protected boolean isCharCondition(char mappingChar, int index, InnerSensitiveWordContext checkContext) { 47 | return CharUtil.isWebSiteChar(mappingChar) || mappingChar == ':' || mappingChar == '/'; 48 | } 49 | 50 | @Override 51 | protected boolean isStringCondition(int index, StringBuilder stringBuilder, InnerSensitiveWordContext checkContext) { 52 | int bufferLen = stringBuilder.length(); 53 | //a.cn 54 | if(bufferLen < 4) { 55 | return false; 56 | } 57 | if(bufferLen > WordConst.MAX_WEB_SITE_LEN) { 58 | return false; 59 | } 60 | 61 | // 改为 http:// 或者 https:// 开头 62 | String string = stringBuilder.toString(); 63 | return isUrl(string); 64 | } 65 | 66 | /** 67 | * 是否为 URL 68 | * @param text 原始文本 69 | * @return 结果 70 | * @since 0.25.0 71 | */ 72 | protected boolean isUrl(final String text) { 73 | return RegexUtil.isUrl(text); 74 | } 75 | 76 | } 77 | -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/support/check/WordCheckIPV4.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.support.check; 2 | 3 | import com.github.houbb.heaven.annotation.ThreadSafe; 4 | import com.github.houbb.heaven.util.lang.CharUtil; 5 | import com.github.houbb.heaven.util.lang.StringUtil; 6 | import com.github.houbb.heaven.util.util.regex.RegexUtil; 7 | import com.github.houbb.sensitive.word.api.IWordCheck; 8 | import com.github.houbb.sensitive.word.api.context.InnerSensitiveWordContext; 9 | import com.github.houbb.sensitive.word.constant.WordConst; 10 | import 
com.github.houbb.sensitive.word.constant.enums.WordTypeEnum; 11 | import com.github.houbb.sensitive.word.utils.InnerCharUtils; 12 | 13 | import java.util.List; 14 | 15 | /** 16 | * IPV4 检测 17 | * 18 | * @author binbin.hou 19 | * @since 0.17.0 20 | */ 21 | @ThreadSafe 22 | public class WordCheckIPV4 extends AbstractConditionWordCheck { 23 | 24 | private static final IWordCheck INSTANCE = new WordCheckIPV4(); 25 | 26 | public static IWordCheck getInstance() { 27 | return INSTANCE; 28 | } 29 | 30 | @Override 31 | protected Class getSensitiveCheckClass() { 32 | return WordCheckIPV4.class; 33 | } 34 | 35 | @Override 36 | protected String getType() { 37 | return WordTypeEnum.IPV4.getCode(); 38 | } 39 | 40 | @Override 41 | protected boolean isCharCondition(char mappingChar, int index, InnerSensitiveWordContext checkContext) { 42 | return CharUtil.isNumber(mappingChar) || '.' == mappingChar; 43 | } 44 | 45 | @Override 46 | protected boolean isStringCondition(int index, StringBuilder stringBuilder, InnerSensitiveWordContext checkContext) { 47 | int bufferLen = stringBuilder.length(); 48 | //0.0.0.0 49 | //255.255.255.255 50 | if(bufferLen < 7 51 | || bufferLen > 15) { 52 | return false; 53 | } 54 | 55 | // 尽可能减少对象的创建 56 | String string = stringBuilder.toString(); 57 | List stringList = StringUtil.splitToList(string, '.'); 58 | if(stringList.size() != 4) { 59 | return false; 60 | } 61 | 62 | for(String numStr : stringList) { 63 | int integer = InnerCharUtils.parseInt(numStr); 64 | 65 | if(integer < 0 || integer > 256) { 66 | return false; 67 | } 68 | } 69 | 70 | // 额外处理 71 | return true; 72 | } 73 | 74 | } 75 | -------------------------------------------------------------------------------- /src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsAllowTest.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.bs; 2 | 3 | import com.github.houbb.sensitive.word.api.IWordAllow; 4 | import 
com.github.houbb.sensitive.word.api.IWordDeny; 5 | import com.github.houbb.sensitive.word.support.allow.WordAllows; 6 | import com.github.houbb.sensitive.word.support.deny.WordDenys; 7 | import com.github.houbb.sensitive.word.support.replace.WordReplaces; 8 | import org.junit.Assert; 9 | import org.junit.Test; 10 | 11 | import java.util.Arrays; 12 | import java.util.List; 13 | 14 | /** 15 | *

project: sensitive-word-SensitiveWordBsTest

16 | *

create on 2020/1/7 23:43

17 | * 18 | * @author Administrator 19 | * @since 0.21.0 20 | */ 21 | public class SensitiveWordBsAllowTest { 22 | 23 | /** 24 | * 是否包含 25 | * 26 | * https://github.com/houbb/sensitive-word/issues/76 27 | * 28 | * @since 0.0.1 29 | */ 30 | @Test 31 | public void findAllowTest() { 32 | final String text = "三黄片黄片"; 33 | 34 | SensitiveWordBs sensitiveWordBs = SensitiveWordBs.newInstance() 35 | .wordAllow(new IWordAllow() { 36 | @Override 37 | public List allow() { 38 | return Arrays.asList("三黄片"); 39 | } 40 | }) 41 | .init(); 42 | 43 | Assert.assertEquals("[黄片]", sensitiveWordBs.findAll(text).toString()); 44 | } 45 | 46 | /** 47 | * https://github.com/houbb/sensitive-word/issues/19 48 | * 49 | * @since 0.21.0 50 | */ 51 | @Test 52 | public void bug19FixTest() { 53 | final String text = "共产党是白名单不会被检测"; 54 | final String text2 = "共产党是白名单不会被检测,但是共产是黑名单"; 55 | 56 | SensitiveWordBs sensitiveWordBs = SensitiveWordBs.newInstance() 57 | .wordAllow(new IWordAllow() { 58 | @Override 59 | public List allow() { 60 | return Arrays.asList("共产党"); 61 | } 62 | }) 63 | .wordDeny(new IWordDeny() { 64 | @Override 65 | public List deny() { 66 | return Arrays.asList("政府", "国家", "共产"); 67 | } 68 | }) 69 | .init(); 70 | 71 | Assert.assertEquals("[]", sensitiveWordBs.findAll(text).toString()); 72 | Assert.assertEquals("[共产]", sensitiveWordBs.findAll(text2).toString()); 73 | } 74 | 75 | } 76 | -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/support/format/WordFormatIgnoreNumStyle.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.support.format; 2 | 3 | import com.github.houbb.heaven.annotation.ThreadSafe; 4 | import com.github.houbb.sensitive.word.api.IWordContext; 5 | import com.github.houbb.sensitive.word.api.IWordFormat; 6 | 7 | import java.util.HashMap; 8 | import java.util.Map; 9 | 10 | /** 11 | * 忽略数字的样式 12 | * @author 
binbin.hou 13 | * @since 0.0.5 14 | */ 15 | @Deprecated 16 | @ThreadSafe 17 | public class WordFormatIgnoreNumStyle implements IWordFormat { 18 | 19 | private static final IWordFormat INSTANCE = new WordFormatIgnoreNumStyle(); 20 | 21 | public static IWordFormat getInstance() { 22 | return INSTANCE; 23 | } 24 | 25 | private static final String NUM_ONE = "⓪0零º₀⓿○" + 26 | "123456789" + 27 | "一二三四五六七八九" + 28 | "壹贰叁肆伍陆柒捌玖" + 29 | "¹²³⁴⁵⁶⁷⁸⁹" + 30 | "₁₂₃₄₅₆₇₈₉" + 31 | "①②③④⑤⑥⑦⑧⑨" + 32 | "⑴⑵⑶⑷⑸⑹⑺⑻⑼" + 33 | "⒈⒉⒊⒋⒌⒍⒎⒏⒐" + 34 | "❶❷❸❹❺❻❼❽❾" + 35 | "➀➁➂➃➄➅➆➇➈" + 36 | "➊➋➌➍➎➏➐➑➒" + 37 | "㈠㈡㈢㈣㈤㈥㈦㈧㈨" + 38 | "⓵⓶⓷⓸⓹⓺⓻⓼⓽" + 39 | "㊀㊁㊂㊃㊄㊅㊆㊇㊈" + 40 | "ⅰⅱⅲⅳⅴⅵⅶⅷⅸ" + 41 | "ⅠⅡⅢⅣⅤⅥⅦⅧⅨ"; 42 | 43 | private static final String NUM_TWO = "0000000"+ 44 | "123456789" + 45 | "123456789" + 46 | "123456789" + 47 | "123456789" + 48 | "123456789" + 49 | "123456789" + 50 | "123456789" + 51 | "123456789" + 52 | "123456789" + 53 | "123456789" + 54 | "123456789" + 55 | "123456789" + 56 | "123456789" + 57 | "123456789" + 58 | "123456789" + 59 | "123456789"; 60 | 61 | private static final Map NUMBER_MAP = new HashMap<>(NUM_ONE.length()); 62 | 63 | static { 64 | final int size = NUM_ONE.length(); 65 | for(int i = 0; i < size; i++) { 66 | NUMBER_MAP.put(NUM_ONE.charAt(i), NUM_TWO.charAt(i)); 67 | } 68 | } 69 | 70 | /** 71 | * 映射后的 char 72 | * @param c 待转换的 char 73 | * @return 结果 74 | * @since 0.0.4 75 | */ 76 | private char getMappingChar(final char c) { 77 | Character mapChar = NUMBER_MAP.get(c); 78 | return mapChar == null ? 
c : mapChar; 79 | } 80 | 81 | @Override 82 | public char format(char original, IWordContext context) { 83 | return getMappingChar(original); 84 | } 85 | 86 | } 87 | -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/support/format/WordFormatIgnoreNumStyleC2C.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.support.format; 2 | 3 | import com.github.houbb.heaven.annotation.ThreadSafe; 4 | import com.github.houbb.sensitive.word.api.IWordContext; 5 | import com.github.houbb.sensitive.word.api.IWordFormat; 6 | import com.github.houbb.sensitive.word.collection.Char2CharMap; 7 | 8 | import java.util.HashMap; 9 | import java.util.Map; 10 | 11 | /** 12 | * 忽略数字的样式 13 | * @author binbin.hou 14 | * @since 0.0.5 15 | */ 16 | @ThreadSafe 17 | public class WordFormatIgnoreNumStyleC2C implements IWordFormat { 18 | 19 | private static final IWordFormat INSTANCE = new WordFormatIgnoreNumStyleC2C(); 20 | 21 | public static IWordFormat getInstance() { 22 | return INSTANCE; 23 | } 24 | 25 | private static final String NUM_ONE = "⓪0零º₀⓿○" + 26 | "123456789" + 27 | "一二三四五六七八九" + 28 | "壹贰叁肆伍陆柒捌玖" + 29 | "¹²³⁴⁵⁶⁷⁸⁹" + 30 | "₁₂₃₄₅₆₇₈₉" + 31 | "①②③④⑤⑥⑦⑧⑨" + 32 | "⑴⑵⑶⑷⑸⑹⑺⑻⑼" + 33 | "⒈⒉⒊⒋⒌⒍⒎⒏⒐" + 34 | "❶❷❸❹❺❻❼❽❾" + 35 | "➀➁➂➃➄➅➆➇➈" + 36 | "➊➋➌➍➎➏➐➑➒" + 37 | "㈠㈡㈢㈣㈤㈥㈦㈧㈨" + 38 | "⓵⓶⓷⓸⓹⓺⓻⓼⓽" + 39 | "㊀㊁㊂㊃㊄㊅㊆㊇㊈" + 40 | "ⅰⅱⅲⅳⅴⅵⅶⅷⅸ" + 41 | "ⅠⅡⅢⅣⅤⅥⅦⅧⅨ"; 42 | 43 | private static final String NUM_TWO = "0000000"+ 44 | "123456789" + 45 | "123456789" + 46 | "123456789" + 47 | "123456789" + 48 | "123456789" + 49 | "123456789" + 50 | "123456789" + 51 | "123456789" + 52 | "123456789" + 53 | "123456789" + 54 | "123456789" + 55 | "123456789" + 56 | "123456789" + 57 | "123456789" + 58 | "123456789" + 59 | "123456789"; 60 | 61 | private static final Char2CharMap NUMBER_MAP = new Char2CharMap(NUM_ONE.length()); 62 | 63 | static { 64 | final int size = 
NUM_ONE.length(); 65 | for(int i = 0; i < size; i++) { 66 | NUMBER_MAP.put(NUM_ONE.charAt(i), NUM_TWO.charAt(i)); 67 | } 68 | } 69 | 70 | /** 71 | * 映射后的 char 72 | * @param c 待转换的 char 73 | * @return 结果 74 | * @since 0.0.4 75 | */ 76 | private char getMappingChar(final char c) { 77 | char mc = NUMBER_MAP.get(c); 78 | return mc == 0 ? c : mc; 79 | } 80 | 81 | @Override 82 | public char format(char original, IWordContext context) { 83 | return getMappingChar(original); 84 | } 85 | 86 | } 87 | -------------------------------------------------------------------------------- /src/main/java/com/github/houbb/sensitive/word/utils/InnerWordFormatUtils.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.utils; 2 | 3 | import com.github.houbb.heaven.util.lang.StringUtil; 4 | import com.github.houbb.heaven.util.util.CollectionUtil; 5 | import com.github.houbb.sensitive.word.api.IWordFormat; 6 | import com.github.houbb.sensitive.word.api.IWordContext; 7 | 8 | import java.util.*; 9 | 10 | /** 11 | * 内部格式化工具类 12 | * @since 0.1.1 13 | */ 14 | public final class InnerWordFormatUtils { 15 | 16 | private InnerWordFormatUtils(){} 17 | 18 | /** 19 | * 空字符数组 20 | * @since 0.6.0 21 | */ 22 | private static final char[] EMPTY_CHARS = new char[0]; 23 | 24 | /** 25 | * 格式化 26 | * @param original 原始 27 | * @param context 上下文 28 | * @return 结果 29 | * @since 0.1.1 30 | */ 31 | public static String format(final String original, final IWordContext context) { 32 | if(StringUtil.isEmpty(original)) { 33 | return original; 34 | } 35 | 36 | StringBuilder stringBuilder = new StringBuilder(); 37 | IWordFormat charFormat = context.wordFormat(); 38 | int len = original.length(); 39 | for(int i = 0; i < len; i++) { 40 | char c = original.charAt(i); 41 | char cf = charFormat.format(c, context); 42 | stringBuilder.append(cf); 43 | } 44 | 45 | return stringBuilder.toString(); 46 | } 47 | 48 | /** 49 | * 字符串统一的格式化处理 50 | * 51 | * 
注意:这个需要 map 的实现是 {@link it.unimi.dsi.fastutil.chars.Char2CharOpenHashMap} 52 | * @param map 映射集合 53 | * @param c 原始 54 | * @return 结果 55 | * @since 0.28.0 56 | */ 57 | public static char getMappingChar(final Map map, char c) { 58 | //Char2CharOpenHashMap 不存在映射也是返回 null 59 | Object mc = map.get(c); 60 | if(mc == null) { 61 | return c; 62 | } 63 | return (char) mc; 64 | } 65 | 66 | /** 67 | * 格式化列表 68 | * @param list 列表 69 | * @param context 上下文 70 | * @return 结果 71 | * @since 0。3.0 72 | */ 73 | public static List formatWordList(Collection list, 74 | final IWordContext context) { 75 | if(CollectionUtil.isEmpty(list)) { 76 | return new ArrayList<>(); 77 | } 78 | 79 | List resultList = new ArrayList<>(list.size()); 80 | for(String word : list) { 81 | String formatWord = InnerWordFormatUtils.format(word, context); 82 | resultList.add(formatWord); 83 | } 84 | 85 | return resultList; 86 | } 87 | 88 | } 89 | -------------------------------------------------------------------------------- /src/test/java/com/github/houbb/sensitive/word/benchmark/BenchmarkTimesTest.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.benchmark; 2 | 3 | import com.github.houbb.heaven.util.util.RandomUtil; 4 | import com.github.houbb.sensitive.word.bs.SensitiveWordBs; 5 | import com.github.houbb.sensitive.word.core.SensitiveWordHelper; 6 | import org.junit.Ignore; 7 | import org.junit.Test; 8 | 9 | @Ignore 10 | public class BenchmarkTimesTest { 11 | 12 | /** 13 | * 测试基准:100+字符串 * 10W次 14 | * 15 | * V0.6.0: 1470ms,接近 7.2W QPS 16 | * V0.7.0: 1380ms 17 | * v0.29.2: 781ms,接近 14W QPS 18 | */ 19 | @Test 20 | public void onlyWordAndNoReplaceTest() { 21 | // 1W 次 22 | SensitiveWordBs sensitiveWordBs = SensitiveWordBs.newInstance() 23 | .enableWordCheck(true) 24 | .enableNumCheck(false) 25 | .enableUrlCheck(false) 26 | .enableEmailCheck(false) 27 | .ignoreRepeat(false) 28 | .ignoreCase(false) 29 | .ignoreNumStyle(false) 30 | 
.ignoreChineseStyle(false) 31 | .ignoreEnglishStyle(false) 32 | .ignoreWidth(false) 33 | .init(); 34 | 35 | String randomText = "你他妈的不要说脏话"+ RandomUtil.randomString("1234567890bcdefghiJKLMNOPQRSTUVWXYZ", 100) 36 | + "我们他妈的从来不说脏说"; 37 | 38 | long start = System.currentTimeMillis(); 39 | for(int i = 0; i < 100_000; i++) { 40 | sensitiveWordBs.findAll(randomText); 41 | } 42 | long end = System.currentTimeMillis(); 43 | System.out.println("------------------ COST: " + (end-start)); 44 | } 45 | 46 | /** 47 | * 测试基准:100+字符串 * 10W次 48 | * 49 | * V0.6.0: 2744ms, 约 3.7W QPS 50 | * V0.7.0: 2723ms 51 | * V0.29.2: 1588ms,约 6.29W QPS 52 | */ 53 | @Test 54 | public void onlyWordAndWithReplaceTest() { 55 | // 1W 次 56 | SensitiveWordBs sensitiveWordBs = SensitiveWordBs.newInstance() 57 | .enableWordCheck(true) 58 | .enableNumCheck(false) 59 | .enableUrlCheck(false) 60 | .enableEmailCheck(false) 61 | .ignoreRepeat(true) 62 | .ignoreCase(true) 63 | .ignoreNumStyle(true) 64 | .ignoreChineseStyle(true) 65 | .ignoreEnglishStyle(true) 66 | .ignoreWidth(true) 67 | .init(); 68 | 69 | String randomText = "你他妈的不要说脏话"+ RandomUtil.randomString("1234567890bcdefghiJKLMNOPQRSTUVWXYZ", 100) 70 | + "我们他妈的从来不说脏说"; 71 | 72 | long start = System.currentTimeMillis(); 73 | for(int i = 0; i < 100_000; i++) { 74 | sensitiveWordBs.findAll(randomText); 75 | } 76 | long end = System.currentTimeMillis(); 77 | System.out.println("------------------ COST: " + (end-start)); 78 | } 79 | 80 | } 81 | -------------------------------------------------------------------------------- /src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsTagTest.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.bs; 2 | 3 | import com.github.houbb.sensitive.word.api.IWordDeny; 4 | import com.github.houbb.sensitive.word.api.IWordTag; 5 | import com.github.houbb.sensitive.word.support.result.WordResultHandlers; 6 | import 
com.github.houbb.sensitive.word.support.result.WordTagsDto; 7 | import com.github.houbb.sensitive.word.support.tag.WordTags; 8 | import org.junit.Assert; 9 | import org.junit.Test; 10 | 11 | import java.util.Arrays; 12 | import java.util.List; 13 | 14 | /** 15 | *

project: sensitive-word-SensitiveWordBsTest

16 | *

create on 2020/1/7 23:43

17 | * 18 | * @author Administrator 19 | * @since 0.10.0 20 | */ 21 | public class SensitiveWordBsTagTest { 22 | 23 | @Test 24 | public void wordResultHandlerWordTagsTest() { 25 | // 自定义测试标签类 26 | IWordTag wordTag = WordTags.lines(Arrays.asList("0售 广告")); 27 | 28 | // 指定初始化 29 | SensitiveWordBs sensitiveWordBs = SensitiveWordBs.newInstance() 30 | .wordDeny(new IWordDeny() { 31 | @Override 32 | public List deny() { 33 | return Arrays.asList("0售"); 34 | } 35 | }) 36 | .wordTag(wordTag) 37 | .init() 38 | ; 39 | List wordTagsDtoList1 = sensitiveWordBs.findAll("零售", WordResultHandlers.wordTags()); 40 | Assert.assertEquals("[WordTagsDto{word='零售', tags=[广告]}]", wordTagsDtoList1.toString()); 41 | 42 | List wordTagsDtoList2 = sensitiveWordBs.findAll("0售", WordResultHandlers.wordTags()); 43 | Assert.assertEquals("[WordTagsDto{word='0售', tags=[广告]}]", wordTagsDtoList2.toString()); 44 | } 45 | 46 | @Test 47 | public void wordResultHandlerWordTags2Test() { 48 | // 自定义测试标签类 49 | IWordTag wordTag = WordTags.lines(Arrays.asList("天安门 政治,国家,地址")); 50 | 51 | // 指定初始化 52 | SensitiveWordBs sensitiveWordBs = SensitiveWordBs.newInstance() 53 | .wordTag(wordTag) 54 | .init() 55 | ; 56 | List wordTagsDtoList1 = sensitiveWordBs.findAll("天安门", WordResultHandlers.wordTags()); 57 | Assert.assertEquals("[WordTagsDto{word='天安门', tags=[政治, 国家, 地址]}]", wordTagsDtoList1.toString()); 58 | } 59 | 60 | @Test 61 | public void wordTagsTest() { 62 | // 自定义测试标签类 63 | IWordTag wordTag = WordTags.lines(Arrays.asList("0售 广告", "天安门 政治,国家,地址")); 64 | // 指定初始化 65 | SensitiveWordBs sensitiveWordBs = SensitiveWordBs.newInstance() 66 | .wordTag(wordTag) 67 | .init() 68 | ; 69 | 70 | Assert.assertEquals("[政治, 国家, 地址]", sensitiveWordBs.tags("天安门").toString()); 71 | Assert.assertEquals("[广告]", sensitiveWordBs.tags("零售").toString()); 72 | Assert.assertEquals("[广告]", sensitiveWordBs.tags("0售").toString()); 73 | } 74 | 75 | } 76 | -------------------------------------------------------------------------------- 
/src/test/java/com/github/houbb/sensitive/word/support/handler/WordResultHandlerTest.java: -------------------------------------------------------------------------------- 1 | package com.github.houbb.sensitive.word.support.handler; 2 | 3 | import com.github.houbb.sensitive.word.api.IWordResult; 4 | import com.github.houbb.sensitive.word.bs.SensitiveWordBs; 5 | import com.github.houbb.sensitive.word.core.SensitiveWordHelper; 6 | import com.github.houbb.sensitive.word.support.result.WordResultHandlers; 7 | import com.github.houbb.sensitive.word.support.result.WordTagsDto; 8 | import com.github.houbb.sensitive.word.support.tag.WordTags; 9 | import org.junit.Assert; 10 | import org.junit.Ignore; 11 | import org.junit.Test; 12 | 13 | import java.util.*; 14 | 15 | /** 16 | * @since 0.12.0 17 | */ 18 | public class WordResultHandlerTest { 19 | 20 | @Test 21 | public void findAllWordTest() { 22 | final String text = "五星红旗迎风飘扬,毛主席的画像屹立在天安门前。"; 23 | 24 | List wordList = SensitiveWordHelper.findAll(text); 25 | Assert.assertEquals("[五星红旗, 毛主席, 天安门]", wordList.toString()); 26 | List wordList2 = SensitiveWordHelper.findAll(text, WordResultHandlers.word()); 27 | Assert.assertEquals("[五星红旗, 毛主席, 天安门]", wordList2.toString()); 28 | 29 | List wordList3 = SensitiveWordHelper.findAll(text, WordResultHandlers.raw()); 30 | Assert.assertEquals("[WordResult{startIndex=0, endIndex=4, type='WORD', word='5星红旗'}, WordResult{startIndex=9, endIndex=12, type='WORD', word='毛主席'}, WordResult{startIndex=18, endIndex=21, type='WORD', word='天安门'}]", wordList3.toString()); 31 | } 32 | 33 | @Test 34 | public void findAllWordTest2() { 35 | final String text = "骂人:你他妈; 邮箱:123@qq.com; mobile: 13088889999; 网址:https://www.baidu.com"; 36 | List wordList3 = SensitiveWordHelper 37 | .findAll(text, WordResultHandlers.raw()); 38 | Assert.assertEquals("[WordResult{startIndex=3, endIndex=6, type='WORD', word='你他妈'}]", wordList3.toString()); 39 | } 40 | 41 | @Test 42 | public void wordTagsTest() { 43 | final String 
/**
 * Primitive {@code char -> char} hash map with no boxing or unboxing.
 *
 * <p>Open addressing with linear probing over two parallel arrays. The key
 * {@code '\0'} marks an empty slot and therefore cannot be stored. The table
 * length is always a power of two and is doubled once half the slots are used.
 * No synchronization is performed.</p>
 *
 * @since 0.29.2
 */
public final class Char2CharMap {

    /** Marker for an empty slot; this key cannot be stored. */
    private static final char EMPTY_KEY = '\0';

    /** The table is doubled once this fraction of the slots is occupied. */
    private static final float LOAD_FACTOR = 0.5f;

    /** A char key has at most this many distinct storable values ('\0' is reserved). */
    private static final int MAX_KEYS = Character.MAX_VALUE;

    private char[] keys;
    private char[] values;
    private int size;
    /** capacity - 1, used as a fast modulo. */
    private int mask;
    /** Occupancy at which the table is resized. */
    private int maxSize;

    /**
     * Creates a map sized for the expected number of entries.
     *
     * @param expectedSize expected entry count; negative values are treated as 0
     *                     and values above 65535 as 65535, since no more distinct
     *                     keys can exist. This also keeps the capacity computation
     *                     from overflowing into a 2^30-slot allocation.
     */
    public Char2CharMap(int expectedSize) {
        final int bounded = Math.max(0, Math.min(expectedSize, MAX_KEYS));
        final int capacity = tableSizeFor((int) (bounded / LOAD_FACTOR) + 1);
        this.keys = new char[capacity];
        this.values = new char[capacity];
        this.mask = capacity - 1;
        this.maxSize = (int) (capacity * LOAD_FACTOR);
        this.size = 0;
    }

    /** Rounds up to a power of two, within [2, 2^30]. */
    private static int tableSizeFor(int cap) {
        int n = cap - 1;
        n |= n >>> 1;
        n |= n >>> 2;
        n |= n >>> 4;
        n |= n >>> 8;
        n |= n >>> 16;
        return (n < 2) ? 2 : (n >= (1 << 30) ? (1 << 30) : n + 1);
    }

    /** Multiplicative hash reduced to a slot index by the mask. */
    private int hash(char k) {
        return (k * 0x9E3779B9) & mask;
    }

    /**
     * Inserts a mapping or overwrites the value of an existing key.
     *
     * @param key   key to store; must not be {@code '\0'}
     * @param value value to associate with {@code key}
     * @throws IllegalArgumentException if {@code key} is {@code '\0'}
     */
    public void put(char key, char value) {
        if (key == EMPTY_KEY) {
            // "\\0" prints the two characters \0; a bare \0 escape would embed a NUL.
            throw new IllegalArgumentException("Key '\\0' is reserved as EMPTY_KEY.");
        }
        int idx = hash(key);
        while (true) {
            if (keys[idx] == EMPTY_KEY) {
                keys[idx] = key;
                values[idx] = value;
                if (++size >= maxSize) {
                    resize();
                }
                return;
            } else if (keys[idx] == key) {
                values[idx] = value;
                return;
            }
            idx = (idx + 1) & mask;
        }
    }

    /**
     * Looks up a key.
     *
     * @param key          key to look up
     * @param defaultValue value returned when the key is absent or is {@code '\0'}
     * @return the stored value, or {@code defaultValue} when there is none
     */
    public char get(char key, char defaultValue) {
        if (key == EMPTY_KEY) {
            return defaultValue;
        }
        int idx = hash(key);
        while (true) {
            char k = keys[idx];
            if (k == EMPTY_KEY) {
                return defaultValue;
            }
            if (k == key) {
                return values[idx];
            }
            idx = (idx + 1) & mask;
        }
    }

    /**
     * Looks up a key, returning {@code '\0'} when it is absent.
     *
     * <p>A stored value of {@code '\0'} looks the same as a missing key here;
     * use {@link #containsKey(char)} to tell them apart.</p>
     *
     * @param key key to look up
     * @return the stored value, or {@code '\0'} when there is none
     */
    public char get(char key) {
        return get(key, EMPTY_KEY);
    }

    /**
     * Tells whether a key is present.
     *
     * @param key key to look up
     * @return {@code true} when a mapping for {@code key} is stored
     */
    public boolean containsKey(char key) {
        if (key == EMPTY_KEY) {
            return false;
        }
        int idx = hash(key);
        while (true) {
            char k = keys[idx];
            if (k == EMPTY_KEY) {
                return false;
            }
            if (k == key) {
                return true;
            }
            idx = (idx + 1) & mask;
        }
    }

    /** Doubles the table and re-inserts every stored entry. */
    private void resize() {
        int newCap = keys.length << 1;
        char[] oldKeys = keys;
        char[] oldVals = values;

        keys = new char[newCap];
        values = new char[newCap];
        mask = newCap - 1;
        maxSize = (int) (newCap * LOAD_FACTOR);
        size = 0;

        for (int i = 0; i < oldKeys.length; i++) {
            char k = oldKeys[i];
            if (k != EMPTY_KEY) {
                put(k, oldVals[i]);
            }
        }
    }

    /**
     * @return the number of stored mappings
     */
    public int size() {
        return size;
    }
}