├── D1.npy
├── D2.npy
├── D3.npy
├── MinRt.py
├── README.MD
├── TextFilter.py
├── Tokenizer.py
├── main.py
└── tokenize.model


/D1.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hqu-little-boy/Filter4J-python/d27820f0d671d14182ad00650629363f06e13acf/D1.npy


--------------------------------------------------------------------------------
/D2.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hqu-little-boy/Filter4J-python/d27820f0d671d14182ad00650629363f06e13acf/D2.npy


--------------------------------------------------------------------------------
/D3.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hqu-little-boy/Filter4J-python/d27820f0d671d14182ad00650629363f06e13acf/D3.npy


--------------------------------------------------------------------------------
/MinRt.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | import numpy as np
 4 | 
 5 | 
 6 | class MinRt:
 7 |     def __init__(self):
 8 |         raise NotImplementedError("This is a static class")
 9 | 
10 |     @staticmethod
11 |     def do_ai(input_array: np.ndarray) -> int:
12 |         current = np.array(input_array)
13 |         # current = input_array.copy()
14 |         # Dense layer
15 |         if not os.path.exists('D1.npy'):
16 |             raise RuntimeError("D1.npy No exist")
17 | 
18 |         weights = np.load('D1.npy')
19 |         current = np.dot(current.reshape(1, -1), weights).flatten()
20 | 
21 |         # LeakyRelu layer
22 |         n = 20
23 |         if current.shape[0] != n:
24 |             raise RuntimeError(f"Wrong input size for LeakyRelu layer (expected {n}, got {current.shape[0]})")
25 |         # Apply LeakyReLU function
26 |         current = np.where(current > 0, current, current * 0.01)
27 | 
28 |         # Dense layer
29 |         if not os.path.exists('D2.npy'):
30 |             raise RuntimeError("D2.npy No exist")
31 |         weights = np.load('D2.npy')
32 |         current = np.dot(current.reshape(1, -1), weights).flatten()
33 | 
34 |         # LeakyRelu layer
35 |         n = 100
36 |         if current.shape[0] != n:
37 |             raise RuntimeError(f"Wrong input size for LeakyRelu layer (expected {n}, got {current.shape[0]})")
38 |         # Apply LeakyReLU function
39 |         current = np.where(current > 0, current, current * 0.01)
40 | 
41 | 
42 |         # Dense layer
43 |         if not os.path.exists('D3.npy'):
44 |             raise RuntimeError("D3.npy No exist")
45 |         weights = np.load('D3.npy')
46 |         current = np.dot(current.reshape(1, -1), weights).flatten()
47 | 
48 |         # Judge layer
49 |         m = 2
50 |         if current.shape[0] != m:
51 |             raise RuntimeError(f"Wrong input size for Judge layer (expected {m}, got {current.shape[0]})")
52 |         # Find the index of the maximum value
53 |         idx = np.argmax(current)
54 |         return idx
55 | 


--------------------------------------------------------------------------------
/README.MD:
--------------------------------------------------------------------------------
 1 | # Filter4J
 2 | Python 实现的文本内容违规检测（文本内容安全、文本审核）库
 3 | 
 4 | Filter4J是一个极小化的，基于深度学习的，文本内容违规检测（文本内容安全、文本审核）库。
 5 | 参考[https://github.com/LL4J/Filter4J](https://github.com/LL4J/Filter4J)
 6 | 
 7 | ## 以下是原项目介绍
 8 | 
 9 | ### 优点：
10 | 
11 | - 基于深度学习，能够有效地对抗拆字、影射、混淆等规避手段
12 | - 代码无第三方依赖，仅3个文件，可以内嵌到任何项目中使用*
13 | - 具有一定的上下文理解能力，能够识别一些包含某些关键词但并不违规的句子
14 | 
15 | ### 缺点：
16 | 
17 | - 模型较大，速度较慢
18 | - 模型具有一定的不可解释性，无法直接得知为什么某个句子被判定为违规
19 | - 受到上游数据集与预训练精度限制，可能会出现一些误判、漏判
20 | 
21 | ### 警告：
22 | 
23 | 基于机器的文本审核系统，无法完全替代人工审核。请在使用本库时，仍然保持对用户输入的警惕。
24 | 作者在此明示，本模型一定存在缺陷且会存在错误判断，其输出结果与实际情况一定存在偏差。
25 | 使用者不应该将其用于任何环境中，除非这种偏差不会对使用者造成任何损失。
26 | 
27 | ### 演示：
28 | 
29 | ````text
30 | Filter4j 演示程序 已经启动!
31 | 在吗？我想草你
32 | 异常
33 | 山火十分可怕，所过之处寸草不生
34 | 正常
35 | 青山绿水，白草红叶黄花
36 | 正常
37 | 在吗？我想ca/o你
38 | 异常
39 | 我问候你全家
40 | 异常
41 | 我们去照相馆拍全家福了
42 | 正常
43 | “你好”是一个常用的问候语
44 | 正常
45 | ````
46 | 
47 | ### 特别鸣谢
48 | 
49 | 北京信息科学与技术国家研究中心 Jiawen Deng（清华大学）et al. 提供的 COLDataset。
50 | 此数据集为我们提供了无与伦比的帮助。
51 | 


--------------------------------------------------------------------------------
/TextFilter.py:
--------------------------------------------------------------------------------
 1 | from typing import List
 2 | import os
 3 | from Tokenizer import Tokenizer
 4 | from MinRt import MinRt
 5 | 
 6 | 
 7 | class TextFilter:
 8 |     script: List[str]
 9 |     tokenizer: Tokenizer
10 | 
11 |     def __init__(self):
12 |         self.tokenizer = Tokenizer("tokenize.model")
13 | 
14 |     def is_illegal(self, text: str) -> bool:
15 |         # Assuming MinRt class and its do_ai method are defined in Python  
16 |         return MinRt.do_ai(self.tokenizer.tokenize(text)) == 1
17 | 


--------------------------------------------------------------------------------
/Tokenizer.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | class Tokenizer:
 3 |     def __init__(self):
 4 |         self.vocab = None
 5 | 
 6 |     def __init__(self, filename):
 7 |         try:
 8 |             with open(filename, 'r', encoding='utf8') as file:
 9 |                 size = int(file.readline().strip())
10 |                 self.vocab = [file.readline().strip() for _ in range(size)]
11 |             # return Tokenizer(vocab)
12 |         except Exception as e:
13 |             print(f"An error occurred: {e}")
14 |             return None
15 | 
16 |     def tokenize(self, text):
17 |         if self.vocab is None:
18 |             raise ValueError("Tokenizer not initialized with a vocabulary.")
19 |         values = [1 if word in text else 0 for word in self.vocab]
20 |         return values
21 | 


--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
 1 | # 这是一个示例 Python 脚本。
 2 | 
 3 | # 按 Shift+F10 执行或将其替换为您的代码。
 4 | # 按 双击 Shift 在所有地方搜索类、文件、工具窗口、操作和设置。
 5 | 
 6 | from TextFilter import TextFilter
 7 | 
 8 | 
 9 | # def print_hi(name):
10 | #     # 在下面的代码行中使用断点来调试脚本。
11 | #     print(f'Hi, {name}')  # 按 Ctrl+F8 切换断点。
12 | #
13 | #
14 | # # 按间距中的绿色按钮以运行脚本。
15 | # if __name__ == '__main__':
16 | #     print_hi('PyCharm')
17 | #
18 | # # 访问 https://www.jetbrains.com/help/pycharm/ 获取 PyCharm 帮助
19 | import sys
20 | # from filter import TextFilter  # 假设TextFilter在filter模块中
21 | 
22 | 
def main():
    """Run the interactive demo: classify each line read from standard input.

    For every input line, prints "异常" when ``TextFilter.is_illegal``
    returns true and "正常" otherwise. The loop ends on Ctrl+C (with a
    farewell message) or when standard input is exhausted.
    """
    print("Filter4j 演示程序 已经启动!")
    text_filter = TextFilter()
    while True:
        try:
            # Read one line from the console.
            line = input()
            if text_filter.is_illegal(line):
                print("异常")
            else:
                print("正常")
        except EOFError:
            # Standard input was closed (e.g. piped input ended): stop
            # quietly instead of crashing with a traceback.
            break
        except KeyboardInterrupt:
            # The user pressed Ctrl+C: say goodbye and leave the loop.
            print("\n程序已退出。")
            break
40 | 
41 | 
# Start the interactive demo only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


--------------------------------------------------------------------------------