├── targets_test.txt
├── requirements.txt
├── test.css
├── .gitignore
├── keywords_example.txt
├── core
│   ├── config.py
│   └── detector
│       ├── keyword_detector.py
│       ├── special_hiding_detector.py
│       ├── headless_browser_detector.py
│       ├── html_detector.py
│       └── js_detector.py
├── utils
│   ├── logging_utils.py
│   ├── file_utils.py
│   ├── html_utils.py
│   ├── common_utils.py
│   ├── css_utils.py
│   ├── js_utils.py
│   └── network_utils.py
├── README.md
├── test_dark_link.html
└── YuanZhao.py
/targets_test.txt:
--------------------------------------------------------------------------------
1 | ./test_dark_link.html
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | beautifulsoup4>=4.9.3
2 | lxml>=4.6.3
3 | requests>=2.25.1
4 | selenium>=4.0.0
5 | urllib3>=1.26.7
6 | chardet>=4.0.0
7 |
8 | # 无头浏览器依赖
9 | webdriver-manager>=3.5.0 # 自动安装和管理ChromeDriver
10 |
--------------------------------------------------------------------------------
/test.css:
--------------------------------------------------------------------------------
1 | /* normal css */
2 | @import url("https://fonts.googleapis.com/css?family=Roboto");
3 | .banner { background-image: url("https://cdn.example.com/images/hero.jpg"); }
4 | .icon { background-image: url(/assets/icon.png); }
5 | .cursor { cursor: url("https://static.example.com/cur.cur"), auto; }
6 | .hidden { display: none; }
7 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Python
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 | *.so
6 | .Python
7 |
8 | # Temporary files
9 | *.swp
10 | *.swo
11 | *~
12 | .project
13 | .settings
14 | .classpath
15 | *.launch
16 |
17 | # IDE
18 | .idea/
19 | .vscode/
20 | *.sublime-project
21 | *.sublime-workspace
22 |
23 | # OS
24 | .DS_Store
25 | Thumbs.db
26 |
27 | # Testing
28 | .tox/
29 | .coverage
30 | .coverage.*
31 | .cache
32 | nosetests.xml
33 | pytest.xml
34 |
35 | # Environment
36 | .env
37 | .env.local
38 | .env.development.local
39 | .env.test.local
40 | .env.production.local
41 |
42 | # Build
43 | /build/
44 | dist/
45 | *.egg
46 | *.egg-info/
47 | bug*.md
48 |
49 | # Reports and logs
50 | reports/
51 | logs/
52 | *.log
53 | # 调试临时文件会保存到reports目录并以时间戳命名,无需具体文件名忽略
54 | reports/*
55 |
56 | # Virtual environments
57 | venv/
58 | env/
59 | pyvenv/
60 |
61 | # OS generated files
62 | .DS_Store
63 | .DS_Store?
64 | ._*
65 | .Spotlight-V100
66 | .Trashes
67 |
--------------------------------------------------------------------------------
/keywords_example.txt:
--------------------------------------------------------------------------------
1 | bet365, gambling, 9
2 | 皇冠体育, gambling, 9
3 | 火博, gambling, 9
4 | 金年会, gambling, 9
5 | 威尼斯人, gambling, 8
6 | 澳门赌场, gambling, 8
7 | 时时彩, gambling, 10
8 | 六合彩, gambling, 10
9 | 赔率, gambling, 7
10 | 投注, gambling, 8
11 | 赌博, gambling, 10
12 | 博彩, gambling, 10
13 | 赌球, gambling, 10
14 | 彩金, gambling, 9
15 | 线上赌场, gambling, 10
16 | 真人娱乐, gambling, 8
17 | 百家乐, gambling, 9
18 | 轮盘, gambling, 8
19 | 老虎机, gambling, 9
20 | 快三, gambling, 9
21 | 彩票软件, gambling, 9
22 | 体彩预测, gambling, 8
23 | 色情, porn, 9
24 | 成人, porn, 8
25 | AV, porn, 9
26 | 黄色网站, porn, 10
27 | 性爱, porn, 9
28 | 黄色, porn, 10
29 | 肉棒, porn, 10
30 | 爆乳, porn, 10
31 | 射精, porn, 10
32 | H视频, porn, 10
33 | 裸聊, porn, 9
34 | 露骨, porn, 8
35 | 情色, porn, 9
36 | 木马, malware, 10
37 | 病毒, malware, 9
38 | 勒索软件, malware, 10
39 | 挖矿, malware, 8
40 | 黑客攻击, malware, 10
41 | DDoS, malware, 10
42 | 后门, malware, 10
43 | 远程控制, malware, 9
44 | 账号密码, phishing, 8
45 | 银行登录, phishing, 10
46 | 支付验证, phishing, 9
47 | 登录, phishing, 6
48 | 账号, phishing, 6
49 | 密码, phishing, 6
50 | 支付, phishing, 8
51 | 转账, phishing, 9
52 | 银行卡, phishing, 8
53 | 验证码, phishing, 7
54 | 高利贷, other, 10
55 | 网贷, other, 7
56 | 小额贷, other, 8
57 | 民间借贷, other, 7
58 | 校园贷, other, 10
59 | 私服, other, 7
60 | 外挂, other, 8
61 | 传奇私服, other, 9
62 | 新开私服, other, 8
63 | 破解版, other, 7
64 | 黑客, other, 8
65 | 渗透测试, other, 5
66 | 漏洞扫描, other, 6
67 | 破解软件, other, 8
68 | 注册机, other, 7
69 | 激活码, other, 6
70 | 黑客工具, other, 9
71 | .cm, other, 7
72 | .tk, other, 6
73 | .ga, other, 6
74 | .ml, other, 6
75 | .tf, other, 6
76 | .gq, other, 6
77 | display:none, other, 9
78 | visibility:hidden, other, 9
79 | opacity:0, other, 8
80 | position:absolute, other, 6
81 | z-index:-1, other, 7
82 | text-indent:-9999px, other, 8
83 | document.write, other, 7
84 | eval(, other, 9
85 | setTimeout("", other, 8
86 | location.href=, other, 7
87 | window.open(, other, 6
88 | XMLHttpRequest, other, 5
89 | fetch(, other, 5
90 | 翻墙, other, 7
91 | VPN, other, 6
92 | 暴力, other, 9
93 | 血腥, other, 8
94 | 恐怖, other, 7
95 | 毒品, other, 10
96 | 大麻, other, 10
97 | 冰毒, other, 10
98 | 摇头丸, other, 10
99 |
--------------------------------------------------------------------------------
/core/config.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | """
4 | 配置管理模块
5 | """
6 |
7 | class Config:
8 | """扫描配置类"""
9 |
10 | def __init__(self):
11 | # 扫描目标配置
12 | self.target_type = None # 'local_file', 'local_directory', 'internal_url', 'external_url'
13 | self.target = None
14 | self.crawl_depth = 1
15 | self.depth = self.crawl_depth # 兼容属性
16 |
17 | # 扫描模式配置
18 | self.scan_mode = 'standard' # 'fast', 'standard', 'deep'
19 | self.mode = self.scan_mode # 兼容属性
20 | self.threads = 4
21 | self.timeout = 30
22 | self.internal_timeout = 60 # 内网URL超时时间(秒)
23 | self.external_timeout = 30 # 公网URL超时时间(秒)
24 | self.proxy = None
25 | self.exclude = []
26 |
27 | # 关键字配置
28 | self.keywords_file = None
29 |
30 | # 报告配置
31 | self.report_type = 'txt'
32 | self.report_file = None
33 |
34 | # 调试模式
35 | self.debug = False
36 | # 调试日志读取参数
37 | self.debug_log_wait_ms = 1500
38 | self.debug_log_checks = 3
39 | self.debug_log_interval_ms = 500
40 |
41 | # 日志器
42 | import logging
43 | self.logger = logging.getLogger('YuanZhao')
44 |
45 | # 无头浏览器配置
46 | self.use_headless_browser = False # 是否启用无头浏览器
47 | self.headless_browser = 'chrome' # 无头浏览器类型
48 | self.js_wait_time = 3 # JavaScript执行等待时间(秒)
49 | self.headless_timeout = 60 # 无头浏览器超时时间(秒)
50 | self.headless_auto_download = False # 是否自动下载驱动
51 | self.headless_driver_path = None # 本地驱动路径
52 |
53 | # 文件类型配置
54 | self.html_extensions = ['.html', '.htm', '.shtml', '.xhtml', '.php', '.asp', '.aspx', '.jsp']
55 | self.css_extensions = ['.css', '.less', '.scss', '.sass']
56 | self.js_extensions = ['.js', '.jsx', '.ts', '.tsx']
57 | self.image_extensions = ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.svg', '.webp']
58 |
59 | # 扫描配置项
60 | self.scan_html = True
61 | self.scan_js = True
62 | self.scan_css = True
63 | self.scan_comments = True
64 | self.scan_meta = True
65 | self.scan_iframe = True
66 | self.scan_dom = True
67 | self.scan_encoding = True
68 | self.scan_steganography = True
69 | self.scan_special_hiding = True
70 | self.scan_keywords = True
71 |
72 | # 根据扫描模式调整配置
73 | self._set_mode_config()
74 | # 计算当前模式下需要扫描的扩展名
75 | self.file_extensions = self.get_file_extensions_to_scan()
76 |
77 | def _set_mode_config(self):
78 | """根据扫描模式设置相应的配置"""
79 | if self.scan_mode == 'fast':
80 | # 快速模式:只进行基础扫描
81 | self.scan_html = True
82 | self.scan_js = True
83 | self.scan_css = True
84 | self.scan_comments = True
85 | self.scan_meta = True
86 | self.scan_iframe = False
87 | self.scan_dom = False
88 | self.scan_encoding = False
89 | self.scan_steganography = False
90 | self.scan_special_hiding = False
91 | self.scan_keywords = True
92 |
93 | elif self.scan_mode == 'standard':
94 | # 标准模式:进行大部分扫描
95 | self.scan_html = True
96 | self.scan_js = True
97 | self.scan_css = True
98 | self.scan_comments = True
99 | self.scan_meta = True
100 | self.scan_iframe = True
101 | self.scan_dom = True
102 | self.scan_encoding = True
103 | self.scan_steganography = False
104 | self.scan_special_hiding = True
105 | self.scan_keywords = True
106 |
107 | elif self.scan_mode == 'deep':
108 | # 深度模式:进行所有扫描
109 | self.scan_html = True
110 | self.scan_js = True
111 | self.scan_css = True
112 | self.scan_comments = True
113 | self.scan_meta = True
114 | self.scan_iframe = True
115 | self.scan_dom = True
116 | self.scan_encoding = True
117 | self.scan_steganography = True
118 | self.scan_special_hiding = True
119 | self.scan_keywords = True
120 | # 同步更新扩展名列表
121 | self.file_extensions = self.get_file_extensions_to_scan()
122 |
123 | def update_mode(self, mode):
124 | """更新扫描模式"""
125 | self.scan_mode = mode
126 | self._set_mode_config()
127 |
128 | def get_file_extensions_to_scan(self):
129 | """获取需要扫描的文件扩展名列表"""
130 | extensions = []
131 |
132 | if self.scan_html:
133 | extensions.extend(self.html_extensions)
134 |
135 | if self.scan_js:
136 | extensions.extend(self.js_extensions)
137 |
138 | if self.scan_css:
139 | extensions.extend(self.css_extensions)
140 |
141 | return list(set(extensions)) # 去重
142 |
143 | def get_proxy_dict(self):
144 | """将代理字符串转换为requests使用的代理字典格式"""
145 | if not self.proxy:
146 | return None
147 |
148 | proxies = {
149 | 'http': self.proxy,
150 | 'https': self.proxy
151 | }
152 | return proxies
153 |
154 | def __str__(self):
155 | """返回配置的字符串表示"""
156 | return (
157 | f"Config("
158 | f"target_type={self.target_type}, "
159 | f"target={self.target}, "
160 | f"scan_mode={self.scan_mode}, "
161 | f"threads={self.threads}, "
162 | f"timeout={self.timeout}, "
163 | f"internal_timeout={self.internal_timeout}, "
164 | f"external_timeout={self.external_timeout}, "
165 | f"report_type={self.report_type}, "
166 | f"report_file={self.report_file})"
167 | )
168 |
169 | def get_config_dict(self):
170 | """返回配置的字典表示,用于日志记录"""
171 | return {
172 | 'target_type': self.target_type,
173 | 'target': self.target,
174 | 'crawl_depth': self.crawl_depth,
175 | 'scan_mode': self.scan_mode,
176 | 'threads': self.threads,
177 | 'timeout': self.timeout,
178 | 'internal_timeout': self.internal_timeout,
179 | 'external_timeout': self.external_timeout,
180 | 'proxy': '***' if self.proxy else None,
181 | 'keywords_file': self.keywords_file,
182 | 'report_type': self.report_type,
183 | 'report_file': self.report_file,
184 | 'debug': self.debug
185 | }
186 |
187 |
--------------------------------------------------------------------------------
/utils/logging_utils.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | """
4 | 日志处理工具模块
5 | """
6 |
7 | import os
8 | import logging
9 | import sys
10 | from datetime import datetime
11 |
12 | class Logger:
13 | """
14 | 自定义日志类
15 | """
16 | def __init__(self, name='YuanZhao', log_dir=None, level=logging.INFO, use_console=True):
17 | """
18 | 初始化日志记录器
19 |
20 | Args:
21 | name (str): 日志名称
22 | log_dir (str): 日志文件目录
23 | level (int): 日志级别
24 | use_console (bool): 是否输出到控制台
25 | """
26 | self.logger = logging.getLogger(name)
27 | self.logger.setLevel(level)
28 | self.logger.handlers.clear()
29 |
30 | # 创建格式化器
31 | formatter = logging.Formatter(
32 | '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
33 | datefmt='%Y-%m-%d %H:%M:%S'
34 | )
35 |
36 | # 控制台输出
37 | if use_console:
38 | console_handler = logging.StreamHandler(sys.stdout)
39 | console_handler.setLevel(level)
40 | console_handler.setFormatter(formatter)
41 | self.logger.addHandler(console_handler)
42 |
43 | # 文件输出
44 | if log_dir:
45 | os.makedirs(log_dir, exist_ok=True)
46 | timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
47 | log_file = os.path.join(log_dir, f'YuanZhao_{timestamp}.log')
48 | file_handler = logging.FileHandler(log_file, encoding='utf-8')
49 | file_handler.setLevel(level)
50 | file_handler.setFormatter(formatter)
51 | self.logger.addHandler(file_handler)
52 |
53 | def debug(self, message):
54 | """记录调试信息"""
55 | self.logger.debug(message)
56 |
57 | def info(self, message):
58 | """记录普通信息"""
59 | self.logger.info(message)
60 |
61 | def warning(self, message):
62 | """记录警告信息"""
63 | self.logger.warning(message)
64 |
65 | def error(self, message, exc_info=False):
66 | """记录错误信息"""
67 | self.logger.error(message, exc_info=exc_info)
68 |
69 | def critical(self, message, exc_info=False):
70 | """记录严重错误信息"""
71 | self.logger.critical(message, exc_info=exc_info)
72 |
73 | def setup_logging(log_dir=None, level=logging.INFO):
74 | """
75 | 全局日志配置
76 |
77 | Args:
78 | log_dir (str): 日志文件目录
79 | level (int): 日志级别
80 |
81 | Returns:
82 | Logger: 日志记录器实例
83 | """
84 | return Logger('YuanZhao', log_dir, level).logger
85 |
86 | def log_exception(logger, exception, message="发生异常"):
87 | """
88 | 记录异常信息
89 |
90 | Args:
91 | logger: 日志记录器
92 | exception: 异常对象
93 | message (str): 错误消息
94 | """
95 | logger.error(f"{message}: {str(exception)}", exc_info=True)
96 |
97 | def log_progress(logger, current, total, message="处理进度"):
98 | """
99 | 记录进度信息
100 |
101 | Args:
102 | logger: 日志记录器
103 | current (int): 当前进度
104 | total (int): 总进度
105 | message (str): 进度消息
106 | """
107 | if total > 0:
108 | percentage = (current / total) * 100
109 | logger.info(f"{message}: {current}/{total} ({percentage:.1f}%)")
110 |
111 | def log_scan_result(logger, file_path, issues):
112 | """
113 | 记录扫描结果
114 |
115 | Args:
116 | logger: 日志记录器
117 | file_path (str): 文件路径
118 | issues (list): 发现的问题列表
119 | """
120 | if issues:
121 | logger.warning(f"文件 {file_path} 发现 {len(issues)} 个问题")
122 | import logging as _logging
123 | if logger.level <= _logging.DEBUG:
124 | for issue in issues:
125 | logger.warning(f" - {issue}")
126 | else:
127 | # 聚合重复项,仅输出前若干项
128 | counts = {}
129 | for issue in issues:
130 | counts[issue] = counts.get(issue, 0) + 1
131 | shown = 0
132 | for text, cnt in counts.items():
133 | logger.warning(f" - {text} x{cnt}")
134 | shown += 1
135 | if shown >= 8:
136 | break
137 | if len(counts) > shown:
138 | logger.warning(f" ... 还有 {len(counts) - shown} 项未展示(非verbose模式)")
139 | else:
140 | logger.debug(f"文件 {file_path} 未发现问题")
141 |
142 | def log_keyword_match(logger, file_path, keyword, category, weight, context):
143 | """
144 | 记录关键字匹配信息
145 |
146 | Args:
147 | logger: 日志记录器
148 | file_path (str): 文件路径
149 | keyword (str): 匹配的关键字
150 | category (str): 关键字类别
151 | weight (int): 风险权重
152 | context (str): 上下文信息
153 | """
154 | logger.warning(
155 | f"关键字匹配 - 文件: {file_path}, "
156 | f"关键字: {keyword}, 类别: {category}, 风险权重: {weight}\n"
157 | f"上下文: {context}"
158 | )
159 |
160 | def log_suspicious_url(logger, file_path, url, risk_level, context):
161 | """
162 | 记录可疑URL信息
163 |
164 | Args:
165 | logger: 日志记录器
166 | file_path (str): 文件路径
167 | url (str): 可疑URL
168 | risk_level (str): 风险等级
169 | context (str): 上下文信息
170 | """
171 | logger.warning(
172 | f"可疑URL - 文件: {file_path}, "
173 | f"URL: {url}, 风险等级: {risk_level}\n"
174 | f"上下文: {context}"
175 | )
176 |
177 | def log_hidden_technique(logger, file_path, technique, risk_level, context):
178 | """
179 | 记录隐藏技术信息
180 |
181 | Args:
182 | logger: 日志记录器
183 | file_path (str): 文件路径
184 | technique (str): 隐藏技术
185 | risk_level (str): 风险等级
186 | context (str): 上下文信息
187 | """
188 | logger.warning(
189 | f"隐藏技术 - 文件: {file_path}, "
190 | f"技术: {technique}, 风险等级: {risk_level}\n"
191 | f"上下文: {context}"
192 | )
193 |
194 | def log_file_skipped(logger, file_path, reason):
195 | """
196 | 记录跳过的文件信息
197 |
198 | Args:
199 | logger: 日志记录器
200 | file_path (str): 文件路径
201 | reason (str): 跳过原因
202 | """
203 | logger.debug(f"跳过文件 {file_path}: {reason}")
204 |
205 | def log_config(logger, config_dict):
206 | """
207 | 记录配置信息
208 |
209 | Args:
210 | logger: 日志记录器
211 | config_dict (dict): 配置字典
212 | """
213 | logger.info("扫描配置:")
214 | for key, value in config_dict.items():
215 | logger.info(f" {key}: {value}")
216 |
217 | def log_summary(logger, total_files, scanned_files, issues_found, scan_time):
218 | """
219 | 记录扫描总结信息
220 |
221 | Args:
222 | logger: 日志记录器
223 | total_files (int): 文件总数
224 | scanned_files (int): 已扫描文件数
225 | issues_found (int): 发现的问题数
226 | scan_time (float): 扫描耗时(秒)
227 | """
228 | logger.info("扫描总结:")
229 | logger.info(f" 总文件数: {total_files}")
230 | logger.info(f" 已扫描文件: {scanned_files}")
231 | logger.info(f" 发现问题: {issues_found}")
232 | logger.info(f" 扫描耗时: {scan_time:.2f} 秒")
233 | try:
234 | if scan_time > 0:
235 | logger.info(f" 平均速度: {scanned_files/scan_time:.2f} 文件/秒")
236 | else:
237 | logger.info(" 平均速度: N/A (耗时为0)")
238 | except Exception:
239 | logger.info(" 平均速度: N/A")
240 |
241 | # 根据问题数量给出警告级别
242 | if issues_found > 50:
243 | logger.critical(f"发现大量问题 ({issues_found}),建议立即检查")
244 | elif issues_found > 10:
245 | logger.error(f"发现较多问题 ({issues_found}),需要关注")
246 | elif issues_found > 0:
247 | logger.warning(f"发现少量问题 ({issues_found}),建议查看")
248 | else:
249 | logger.info("未发现明显问题")
250 |
251 |
--------------------------------------------------------------------------------
/utils/file_utils.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | """
4 | 文件处理工具模块
5 | """
6 |
7 | import os
8 | import logging
9 | import chardet
10 | from typing import List
11 |
12 | logger = logging.getLogger('YuanZhao.utils.file')
13 |
14 | def read_file(file_path: str, max_size: int = 10 * 1024 * 1024) -> str:
15 | """
16 | 读取文件内容,自动检测编码
17 |
18 | Args:
19 | file_path: 文件路径
20 | max_size: 最大文件大小(默认10MB)
21 |
22 | Returns:
23 | 文件内容
24 | """
25 | try:
26 | # 检查文件大小
27 | file_size = os.path.getsize(file_path)
28 | if file_size > max_size:
29 | logger.warning(f"文件过大,将读取前{max_size/1024/1024:.1f}MB: {file_path}")
30 |
31 | # 检测文件编码
32 | with open(file_path, 'rb') as f:
33 | raw_data = f.read(min(file_size, 10000))
34 | result = chardet.detect(raw_data)
35 | encoding = result['encoding'] or 'utf-8'
36 |
37 | # 读取文件内容
38 | with open(file_path, 'r', encoding=encoding, errors='replace') as f:
39 | content = f.read(max_size)
40 |
41 | return content
42 |
43 | except Exception as e:
44 | logger.error(f"读取文件失败: {file_path}, 错误: {str(e)}")
45 | return ''
46 |
47 | def get_files_to_scan(directory: str, extensions: List[str]) -> List[str]:
48 | """
49 | 递归获取目录中所有指定扩展名的文件
50 |
51 | Args:
52 | directory: 目录路径
53 | extensions: 需要扫描的文件扩展名列表
54 |
55 | Returns:
56 | 文件路径列表
57 | """
58 | files_to_scan = []
59 |
60 | try:
61 | for root, dirs, files in os.walk(directory):
62 | # 过滤掉隐藏目录
63 | dirs[:] = [d for d in dirs if not d.startswith('.')]
64 |
65 | for file in files:
66 | # 过滤掉隐藏文件
67 | if file.startswith('.'):
68 | continue
69 |
70 | # 检查文件扩展名
71 | _, ext = os.path.splitext(file.lower())
72 | if ext in extensions:
73 | file_path = os.path.join(root, file)
74 | files_to_scan.append(file_path)
75 |
76 | logger.info(f"找到 {len(files_to_scan)} 个需要扫描的文件")
77 |
78 | except Exception as e:
79 | logger.error(f"获取文件列表失败: {str(e)}")
80 |
81 | return files_to_scan
82 |
83 | def is_binary_file(file_path: str) -> bool:
84 | """
85 | 检查文件是否为二进制文件
86 |
87 | Args:
88 | file_path: 文件路径
89 |
90 | Returns:
91 | 是否为二进制文件
92 | """
93 | try:
94 | with open(file_path, 'rb') as f:
95 | chunk = f.read(1024)
96 |
97 | # 检查是否包含null字节
98 | if b'\x00' in chunk:
99 | return True
100 |
101 | # 检查非文本字符的比例
102 | text_chars = bytearray({7, 8, 9, 10, 12, 13, 27} | set(range(0x20, 0x100)))
103 | non_text = sum(1 for byte in chunk if byte not in text_chars)
104 |
105 | # 如果超过30%的字符是非文本字符,则认为是二进制文件
106 | return non_text / len(chunk) > 0.3
107 |
108 | except Exception as e:
109 | logger.error(f"检查文件类型失败: {file_path}, 错误: {str(e)}")
110 | return False
111 |
112 | def get_file_info(file_path: str) -> dict:
113 | """
114 | 获取文件信息
115 |
116 | Args:
117 | file_path: 文件路径
118 |
119 | Returns:
120 | 文件信息字典
121 | """
122 | try:
123 | stat_info = os.stat(file_path)
124 |
125 | info = {
126 | 'path': file_path,
127 | 'size': stat_info.st_size,
128 | 'created_time': stat_info.st_ctime,
129 | 'modified_time': stat_info.st_mtime,
130 | 'is_binary': is_binary_file(file_path)
131 | }
132 |
133 | return info
134 |
135 | except Exception as e:
136 | logger.error(f"获取文件信息失败: {file_path}, 错误: {str(e)}")
137 | return {}
138 |
139 | def ensure_directory(directory: str):
140 | """
141 | 确保目录存在,如果不存在则创建
142 |
143 | Args:
144 | directory: 目录路径
145 | """
146 | try:
147 | if not os.path.exists(directory):
148 | os.makedirs(directory)
149 | logger.info(f"创建目录: {directory}")
150 | except Exception as e:
151 | logger.error(f"创建目录失败: {directory}, 错误: {str(e)}")
152 | raise
153 |
154 | def get_relative_path(file_path: str, base_directory: str) -> str:
155 | """
156 | 获取文件相对于基础目录的路径
157 |
158 | Args:
159 | file_path: 文件路径
160 | base_directory: 基础目录
161 |
162 | Returns:
163 | 相对路径
164 | """
165 | try:
166 | return os.path.relpath(file_path, base_directory)
167 | except Exception as e:
168 | logger.error(f"获取相对路径失败: {str(e)}")
169 | return file_path
170 |
171 | def filter_files_by_size(files: List[str], min_size: int = 0, max_size: int = None) -> List[str]:
172 | """
173 | 根据文件大小过滤文件列表
174 |
175 | Args:
176 | files: 文件路径列表
177 | min_size: 最小文件大小(字节)
178 | max_size: 最大文件大小(字节)
179 |
180 | Returns:
181 | 过滤后的文件列表
182 | """
183 | filtered_files = []
184 |
185 | for file_path in files:
186 | try:
187 | file_size = os.path.getsize(file_path)
188 |
189 | if file_size < min_size:
190 | continue
191 |
192 | if max_size is not None and file_size > max_size:
193 | continue
194 |
195 | filtered_files.append(file_path)
196 |
197 | except Exception as e:
198 | logger.warning(f"获取文件大小失败: {file_path}, 错误: {str(e)}")
199 |
200 | return filtered_files
201 |
202 | def _match_exclude(path: str, exclude_patterns: List[str]) -> bool:
203 | try:
204 | import fnmatch
205 | for pattern in exclude_patterns or []:
206 | if fnmatch.fnmatch(path, pattern) or (pattern.endswith('/') and path.replace('\\','/').startswith(pattern.rstrip('/'))):
207 | return True
208 | except Exception:
209 | pass
210 | return False
211 |
212 | # 兼容性函数,为了支持scanner.py中的导入(扩展签名)
213 | def get_file_list(directory: str, recursive: bool = True, depth: int = 1, extensions: List[str] = None, exclude: List[str] = None) -> List[str]:
214 | """
215 | 获取目录中的文件列表,支持递归、深度限制与排除模式
216 |
217 | Args:
218 | directory: 目录路径
219 | recursive: 是否递归
220 | depth: 递归深度(包含根层级)
221 | extensions: 需要扫描的文件扩展名列表
222 | exclude: 排除的文件或目录通配符列表
223 | Returns:
224 | 文件路径列表
225 | """
226 | results: List[str] = []
227 | try:
228 | extensions = [ext.lower() for ext in (extensions or [])]
229 | base_depth = directory.rstrip('\\/').count(os.sep)
230 | for root, dirs, files in os.walk(directory):
231 | # 处理深度
232 | current_depth = root.rstrip('\\/').count(os.sep) - base_depth
233 | if not recursive or current_depth >= depth:
234 | dirs[:] = []
235 | # 排除目录
236 | if exclude:
237 | dirs[:] = [d for d in dirs if not _match_exclude(os.path.join(root, d), exclude)]
238 | for file in files:
239 | path = os.path.join(root, file)
240 | if exclude and _match_exclude(path, exclude):
241 | continue
242 | if file.startswith('.'):
243 | continue
244 | _, ext = os.path.splitext(file.lower())
245 | if not extensions or ext in extensions:
246 | results.append(path)
247 | logger.info(f"找到 {len(results)} 个需要扫描的文件")
248 | except Exception as e:
249 | logger.error(f"获取文件列表失败: {str(e)}")
250 | return results
251 |
252 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # 渊照 - 专业暗链扫描工具
2 |
3 | 「渊照」是一款功能强大的专业暗链扫描工具,专注于检测网站、HTML文件或目录中的隐蔽链接、隐藏元素和恶意代码。该工具能够智能识别扫描目标类型(本地文件/目录、内网URL、公网URL),并自动调整扫描策略以获得最佳效果,是安全人员进行网站安全审计和应急响应的理想工具。
4 |
5 | ## 功能特性
6 |
7 | ### 智能目标识别与处理
8 | - **多类型目标支持**:自动识别和扫描本地文件、本地目录、内网URL和公网URL
9 | - **差异化扫描策略**:根据目标类型应用最优扫描策略
10 | - **递归目录扫描**:支持可配置的扫描深度
11 | - **文件过滤机制**:支持通过通配符排除特定文件或目录
12 |
13 | ### 核心扫描能力
14 | - **多层次检测机制**:HTML代码检测、JavaScript代码分析、CSS代码检测、元标签扫描、注释内容分析
15 | - **高级威胁识别**:加密/编码链接检测、可疑域名检测、随机生成域名检测、短链接服务检测、非标准端口检测、可疑查询参数检测
16 | - **特殊隐藏手法检测**:CSS隐藏技术、颜色隐藏、零宽字符隐藏、字体大小隐藏等
17 | - **关键字匹配系统**:支持CSV格式自定义关键字文件,包含关键字、类别和风险权重
18 | - **智能风险评分**:基于多维度风险评估
19 |
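上面的特殊隐藏手法检测可以结合 `utils/html_utils.py` 中的 `find_hidden_elements` 来直观理解:该函数会挑出带有 display:none、visibility:hidden、opacity:0 内联样式或 hidden 属性的元素。以下为示意用法(假设 utils 目录可作为包导入,HTML 片段仅为演示数据):

```python
# 示意:查找 HTML 中通过内联样式或 hidden 属性隐藏的元素
from utils.html_utils import find_hidden_elements

html = '<div style="display:none"><a href="http://bad-site.tk">暗链</a></div>'
for item in find_hidden_elements(html):
    # 返回的字典包含 tag、style(或 reason)、content、original_tag 等字段
    print(item['tag'], item.get('style', item.get('reason')), item['content'])
```
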
20 | ### 无头浏览器增强检测
21 | - **动态内容捕获**:使用Chrome无头浏览器执行JavaScript并捕获动态内容
22 | - **DOM操作监控**:跟踪动态DOM修改
23 | - **iframe深度分析**:渲染和分析iframe内容
24 | - **网络请求捕获**:监控HTTP请求和重定向链
25 |
26 | ### 全面的报告系统
27 | - **多种报告格式**:文本报告(txt)、HTML报告(html)、JSON报告(json)、CSV报告(csv)
28 | - **丰富的报告内容**:扫描概览、问题详情、风险评估、上下文展示
29 | - **来源类型标注**:在可疑链接中增加 `context_type` 字段(如 `html/js/css/comments`),用于区分链接的来源场景,便于后续数据分析与过滤
30 | - **来源标签与位置**:统一输出 `source_tag`(如 `debug/normal`)与定位范围 `position (start,end)`,HTML/CSV/JSON 报告保持一致
31 | - **风险排序与阈值展示**:HTML报告对“可疑链接”按风险降序展示,并默认仅展示风险≥4的项,减少噪音;关键字匹配表支持从上下文提取可点击链接
32 |
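上面提到的 `context_type`、`source_tag`、`position` 等来源字段,在报告数据中的大致形态如下。这只是一个示意片段:除这三个字段外,`url`、`risk` 等字段名与取值均为假设示例,实际以生成的报告为准。

```python
# 示意:一条“可疑链接”记录中与来源相关的字段(url、risk 为假设示例)
suspicious_link = {
    "url": "http://example-bad-site.tk",    # 假设示例
    "risk": 8,                               # 假设示例
    "context_type": "js",                    # 来源场景:html / js / css / comments
    "source_tag": "debug",                   # 来源标签:debug / normal
    "position": (1024, 1060),                # 定位范围 (start, end)
}
```
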
33 | ### 灵活的配置选项
34 | - **多种扫描模式**:fast/standard/deep
35 | - **性能优化选项**:可配置并发线程数、请求超时设置、代理服务器支持
36 | - **关键词来源**:支持从 `keywords_example.txt` 或自定义 `--keyword-file` 读取,文件允许 `#` 注释行,CSV格式:`关键字,类别,风险权重`
37 |
38 | ## 安装指南
39 |
40 | ### 环境要求
41 | - Python 3.8+
42 |
43 | ### 安装依赖
44 | ```bash
45 | pip install -r requirements.txt
46 | ```
47 |
48 | ## 使用方法
49 |
50 | ### 查看帮助信息
51 | ```bash
52 | python YuanZhao.py --help
53 | ```
54 |
55 | ### 完整使用案例命令
56 |
57 | #### 1. 本地文件扫描场景
58 | ```bash
59 | # 基本扫描 - 单个HTML文件
60 | python YuanZhao.py /path/to/file.html
61 |
62 | # 高级扫描 + HTML报告
63 | python YuanZhao.py /path/to/file.html -m standard -f html
64 |
65 | # 详细日志模式
66 | python YuanZhao.py /path/to/suspicious.html --verbose
67 |
68 | # 自定义输出目录
69 | python YuanZhao.py /path/to/file.html -o /custom/report/dir
70 |
71 | # 特定报告格式(JSON)
72 | python YuanZhao.py /path/to/file.html -f json
73 | ```
74 |
75 | #### 2. 本地目录扫描场景
76 | ```bash
77 | # 默认深度扫描目录
78 | python YuanZhao.py /path/to/website
79 |
80 | # 自定义深度扫描(仅当前目录和一级子目录)
81 | python YuanZhao.py /path/to/website -d 1
82 |
83 | # 深度递归扫描
84 | python YuanZhao.py /path/to/website -d 5
85 |
86 | # 排除特定文件/目录
87 | python YuanZhao.py /path/to/website --exclude "*.jpg" "*.png" "logs/*" "vendor/"
88 |
89 | # 调整线程数(提高性能)
90 | python YuanZhao.py /path/to/website -t 16
91 |
92 | # 完整模式 + 多格式报告
93 | python YuanZhao.py /path/to/website -m deep -f html -o security_reports --threads 12
94 | ```
95 |
96 | #### 3. 网络URL扫描场景
97 | ```bash
98 | # 基本网站扫描
99 | python YuanZhao.py https://example.com
100 |
101 | # 内网地址扫描
102 | python YuanZhao.py http://192.168.1.100
103 |
104 | # 本地开发服务器扫描
105 | python YuanZhao.py http://localhost:8080
106 |
107 | # 带路径的URL扫描
108 | python YuanZhao.py https://example.com/news/article
109 |
110 | # 设置超时时间(公网默认使用全局超时,内网未显式设置时会按较长超时)
111 | python YuanZhao.py https://example.com --timeout 60
112 |
113 | # 使用代理服务器
114 | python YuanZhao.py https://example.com --proxy http://127.0.0.1:8080
115 |
116 | # 带认证的代理
117 | python YuanZhao.py https://example.com --proxy http://username:password@proxy.example.com:8080
118 | ```
119 |
120 | #### 4. 高级功能场景
121 | ```bash
122 | # 无头浏览器扫描(动态内容)
123 | python YuanZhao.py https://dynamic-website.com --headless
124 |
125 | # 无头浏览器 + 延长等待时间
126 | python YuanZhao.py https://heavy-js-website.com --headless --js-wait 10
127 |
128 | # 无头浏览器超时时间
129 | python YuanZhao.py https://example.com --headless --headless-timeout 120
130 |
131 | # 自定义关键字检测
132 | python YuanZhao.py /path/to/target --keyword-file custom_keywords.txt
133 |
134 | # 基础模式快速扫描
135 | python YuanZhao.py https://example.com -m fast -d 1 -t 5
136 |
137 | # 全部模式深度扫描
138 | python YuanZhao.py /path/to/important-site -m deep -d 3 -f html --verbose
139 | ```
140 |
141 | #### 5. 批量目标扫描(多链接/多路径)
142 | ```bash
143 | # 方式A:指定列表文件(每行一个目标:URL/文件/目录)
144 | python YuanZhao.py --target-file e:\targets.txt -m deep -f html -o reports --verbose
145 |
146 | # 方式B:直接把 .txt 作为 target 传入(同样按列表处理)
147 | python YuanZhao.py e:\targets.txt -m deep -f html -o reports --verbose
148 |
149 | # 示例列表文件内容
150 | # https://example.com
151 | # e:\webroot
152 | # e:\webroot\index.html
153 | ```
154 |
155 | #### 6. 特定场景优化命令
156 | ```bash
157 | # 应急响应场景
158 | python YuanZhao.py /compromised/webroot -m deep -f html -o incident_response --keyword-file malware_keywords.txt --verbose
159 |
160 | # 定期安全审计
161 | python YuanZhao.py /path/to/webroot -d 3 -m standard -f json -o weekly_scan_$(date +%Y%m%d)
162 |
163 | # 新闻页面专项扫描
164 | python YuanZhao.py https://example.com/news -m deep -d 1 -t 8 --verbose
165 |
166 | # 大规模并行扫描
167 | python YuanZhao.py /large/website -d 2 -t 20 --exclude "*.zip" "*.rar" "backup/*"
168 |
169 | # 自动化集成扫描(生成JSON报告)
170 | python YuanZhao.py https://example.com -f json -o automated_scan_results --no-color
171 | ```
172 | ### 自定义关键字文件格式
173 | 关键字文件为CSV格式,每行包含三个字段:
174 | 
175 | ```
176 | 关键字,类别,风险权重
177 | poker,gambling,8
178 | casino,gambling,9
179 | malware,malware,10
180 | phishing,phishing,9
181 | ```
182 |
183 | 类别可选值:gambling (博彩)、porn (色情)、malware (恶意软件)、phishing (钓鱼)、other (其他)
184 | 风险权重范围:1-10(10为最高风险)
185 | 默认关键字文件:项目根目录 `keywords_example.txt`(若未指定 `--keyword-file` 将自动加载)。文件允许以 `#` 开头的注释行。
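
除命令行方式外,也可以按下面的示意片段在 Python 中直接加载关键字文件并对文本做匹配(假设 `core`、`core/detector` 目录可以作为包导入;接口与返回字段见 `core/detector/keyword_detector.py`):

```python
# 示意:以编程方式加载自定义关键字文件并检测文本
from core.config import Config
from core.detector.keyword_detector import KeywordDetector

config = Config()
detector = KeywordDetector(config)
detector.load_keywords('custom_keywords.txt')   # CSV:关键字,类别,风险权重

matches = detector.detect('页面文本……时时彩预测……', source='demo.html')
for m in matches:
    # detect() 返回的字典包含 keyword / category / weight / source / context / match_position
    print(m['keyword'], m['category'], m['weight'])
```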
186 |
187 | ## 主要参数说明
188 |
189 | ### 基本参数
190 | - `target`: 扫描目标(文件路径、目录路径或URL)- 必需参数
191 | - `-d, --depth`: 递归扫描深度(默认:3,0表示仅扫描当前文件/目录)
192 | - `-m, --mode`: 扫描模式(fast/standard/deep,默认:deep)
193 | - `-t, --threads`: 并发线程数(默认:8)
194 |
195 | ### 报告相关参数
196 | - `-o, --output`: 报告输出目录(默认:./reports)
197 | - `-f, --format`: 报告格式(txt/html/json/csv,默认:txt)
198 |
199 | ### 网络相关参数
200 | - `--timeout`: 请求超时时间(秒,默认:30)。公网目标默认使用此值,内网目标未显式设置 `internal_timeout` 时按较长超时(约为全局超时的两倍)。
201 | - `--proxy`: 代理设置(支持带认证与不带认证的HTTP代理),示例:`http://127.0.0.1:8080` 或 `http://user:pass@host:8080`
202 |
203 | ### 高级参数
204 | - `--keyword-file`: 自定义关键字文件路径
205 | - `--target-file`: 批量目标列表文件路径(每行一个目标:URL/文件/目录)
206 | - `--exclude`: 排除的文件或目录
207 | - `--verbose`: 显示详细日志信息
208 | - `--no-color`: 禁用彩色输出(适用于自动化脚本)
209 |
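目录扫描时的文件收集、递归深度与排除模式见 `utils/file_utils.py` 中的 `get_file_list`,下面是一个示意用法(目录与模式仅为示例):

```python
# 示意:按深度、扩展名与排除模式收集待扫描文件
from utils.file_utils import get_file_list

files = get_file_list(
    '/path/to/website',                         # 示例目录
    recursive=True,
    depth=3,                                    # 递归深度(包含根层级)
    extensions=['.html', '.htm', '.js', '.css'],
    exclude=['*.jpg', '*.png', 'logs/*', 'vendor/'],
)
print(f'共找到 {len(files)} 个待扫描文件')
```
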
210 | ### 无头浏览器参数
211 | - `--headless`: 启用无头浏览器扫描
212 | - `--browser-type`: 无头浏览器类型(支持: chrome,默认: chrome)
213 | - `--js-wait`: JavaScript执行等待时间(秒,默认: 3)
214 | - `--headless-timeout`: 无头浏览器超时时间(秒,默认: 60)
215 | - `--headless-binary`: Chrome二进制路径(例如:`C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe`)
216 | - `--headless-driver`: ChromeDriver路径(例如:`C:\\drivers\\chromedriver.exe`)
217 |
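这些命令行参数对应 `core/config.py` 中的无头浏览器配置字段,以下为示意性的等价设置(以编程方式构造 Config 仅作演示,参数与命令行选项的对应关系为合理推断):

```python
# 示意:无头浏览器相关配置字段(字段定义见 core/config.py)
from core.config import Config

config = Config()
config.use_headless_browser = True                              # 对应 --headless
config.headless_browser = 'chrome'                              # 对应 --browser-type
config.js_wait_time = 10                                        # 对应 --js-wait(秒)
config.headless_timeout = 120                                   # 对应 --headless-timeout(秒)
config.headless_driver_path = r'C:\drivers\chromedriver.exe'    # 对应 --headless-driver
```
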
218 | ## 常见问题解答
219 |
220 | **Q: 扫描结果中的误报如何处理?**
221 | A: 可通过以下方式降低噪音:
222 | - 使用自定义关键字文件调整权重
223 | - 利用报告的风险阈值(HTML默认展示风险≥4)聚焦高风险项
224 | - 依赖优化后的CSS检测逻辑与可信CDN白名单,避免将正常资源识别为可疑
225 |
226 | **Q: 如何提高大型网站的扫描效率?**
227 | A: 增加线程数、设置合理的爬取深度,或先使用基础模式(`fast`)进行初步筛选。对于公网网站,建议控制扫描范围。
228 |
229 | **Q: 为什么有些动态生成的链接没被检测到?**
230 | A: 启用无头浏览器模式`--headless`并适当增加JavaScript执行等待时间`--js-wait`。
231 |
232 | **Q: 使用无头浏览器时需要注意什么?**
233 | A: 使用无头浏览器会增加资源消耗和时间,建议适当降低线程数,为复杂页面增加等待时间,仅在必要时启用。
234 |
235 | ## 项目结构
236 |
237 | ```
238 | YuanZhao/
239 | ├── YuanZhao.py # 主程序入口
240 | ├── requirements.txt # 依赖列表
241 | ├── README.md # 项目说明
242 | ├── core/ # 核心模块
243 | │ ├── scanner.py # 扫描引擎
244 | │ ├── detector/ # 各类检测器
245 | │ ├── reporter.py # 报告生成器
246 | │ └── config.py # 配置管理
247 | ├── utils/ # 工具类
248 | └── keywords_example.txt # 关键字示例文件
249 | ```
250 |
251 | ## 许可证与免责声明
252 |
253 | 本工具仅供安全测试和应急响应使用,请确保您有足够的授权对目标进行扫描,避免对未经授权的系统进行测试。
254 |
255 | ## 开发者提示(工具接口)
256 | - CSS工具统一正式接口:extract_css_properties / remove_css_comments / extract_css_comments
258 |
259 | ## 开发者选项(日志与报告)
260 | - `debug_log_wait_ms`:调试读取日志的初始等待时间(毫秒),默认 1500
261 | - `debug_log_checks`:日志稳定性检查次数,默认 3
262 | - `debug_log_interval_ms`:每次稳定性检查的间隔(毫秒),默认 500
263 | - 提取统计日志级别:常规运行为 `debug`(匹配数与总提取数),在 `--verbose` 场景下查看更详细日志
264 | - 报告来源字段:`context_type`(html/js/css/comments)与 `source_tag`(debug/normal)用于区分来源与路径
265 | - 关键字匹配报告:支持从上下文自动提取 `http(s)` 链接并渲染为可点击链接
266 | - HTML报告“可疑链接详情”默认按风险降序并过滤低风险项(≥4显示)
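
上述开发者选项对应 `core/config.py` 中的同名字段,默认值也在该文件中定义。以下为示意性的设置方式(以编程方式构造 Config 与日志器仅作演示,接口见 `core/config.py` 与 `utils/logging_utils.py`):

```python
# 示意:调整调试日志读取参数,并以 DEBUG 级别输出日志
import logging
from core.config import Config
from utils.logging_utils import setup_logging

logger = setup_logging(log_dir='logs', level=logging.DEBUG)   # 大致相当于 --verbose 的详细日志

config = Config()
config.debug = True
config.debug_log_wait_ms = 3000       # 初始等待 3000 毫秒
config.debug_log_checks = 5           # 稳定性检查 5 次
config.debug_log_interval_ms = 800    # 每次检查间隔 800 毫秒
```
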
267 | ## Star History
268 | 
--------------------------------------------------------------------------------
/test_dark_link.html:
--------------------------------------------------------------------------------
7 | 暗链测试页面 - 无头浏览器检测测试
163 | 正常网站内容
166 | 欢迎访问我们的网站
167 | 这是一个正常的网站内容段落,介绍网站的主要功能和服务。
168 | 正常链接
191 | 赌博网站推荐:http://bet365-clone.gq
195 | 私服游戏下载:http://game-private-server.tk
199 | 黑客工具:http://hacker-tools.ml
203 | 时时彩预测:http://lottery-predict.cm
207 | 色情视频:http://adult-videos.ga
211 | 高利贷服务:http://loan-shark.tf
218 | 正常文本http://hidden-evil-site.tk继续正常文本
221 | 大量空格后的隐藏内容 http://hidden-link-using-spaces.ga
--------------------------------------------------------------------------------
/core/detector/keyword_detector.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | """
4 | 关键字检测器模块
5 | """
6 |
7 | import re
8 | import logging
9 | from typing import List, Dict, Tuple
10 | import chardet
11 |
12 | logger = logging.getLogger('YuanZhao.detector.keyword')
13 |
14 | class KeywordDetector:
15 | """关键字检测器"""
16 |
17 | def __init__(self, config):
18 | self.config = config
19 | self.keywords = [] # 存储关键字列表 [(keyword, category, weight), ...]
20 | self.keyword_patterns = [] # 编译后的正则表达式模式列表
21 |
22 | def load_keywords(self, keywords_file: str) -> bool:
23 | """从文件加载关键字"""
24 | try:
25 | # 检测文件编码
26 | with open(keywords_file, 'rb') as f:
27 | raw_data = f.read(10000)
28 | result = chardet.detect(raw_data)
29 | encoding = result['encoding'] or 'utf-8'
30 |
31 | # 读取关键字文件
32 | with open(keywords_file, 'r', encoding=encoding) as f:
33 | import csv
34 | reader = csv.reader(f)
35 | for line_num, parts in enumerate(reader, 1):
36 | # 去除空行
37 | if not parts or all((p.strip() == '' for p in parts)):
38 | continue
39 | # 忽略注释行
40 | if parts and parts[0].strip().startswith('#'):
41 | continue
42 | if len(parts) < 3:
43 | logger.warning(f"关键字文件第{line_num}行格式错误,跳过: {parts}")
44 | continue
45 | keyword = parts[0].strip()
46 | category = parts[1].strip()
47 | # 验证风险权重
48 | try:
49 | weight = int(parts[2].strip())
50 | if not 1 <= weight <= 10:
51 | logger.warning(f"关键字文件第{line_num}行风险权重超出范围(1-10),使用默认值5: {parts}")
52 | weight = 5
53 | except Exception:
54 | logger.warning(f"关键字文件第{line_num}行风险权重不是数字,使用默认值5: {parts}")
55 | weight = 5
56 | valid_categories = ['gambling', 'porn', 'malware', 'phishing', 'other']
57 | if category not in valid_categories:
58 | logger.warning(f"关键字文件第{line_num}行类别无效,使用默认类别other: {parts}")
59 | category = 'other'
60 | self.keywords.append((keyword, category, weight))
61 |
62 | # 编译正则表达式模式
63 | self._compile_keyword_patterns()
64 |
65 | logger.info(f"成功加载 {len(self.keywords)} 个关键字")
66 | return True
67 |
68 | except Exception as e:
69 | logger.error(f"加载关键字文件失败: {str(e)}", exc_info=True)
70 | # 如果加载失败,使用内置的默认关键字
71 | self._load_default_keywords()
72 | return False
73 |
74 | def _load_default_keywords(self):
75 | """默认从项目根目录读取 keywords_example.txt"""
76 | import os
77 | try:
78 | root = os.getcwd()
79 | path = os.path.join(root, 'keywords_example.txt')
80 | if os.path.exists(path):
81 | self.load_keywords(path)
82 | return
83 | logger.warning("未找到默认关键字文件 keywords_example.txt,关键字功能将受限")
84 | self.keywords = []
85 | self.keyword_patterns = []
86 | except Exception as e:
87 | logger.error(f"加载默认关键字失败: {str(e)}")
88 | self.keywords = []
89 | self.keyword_patterns = []
90 |
91 | def _compile_keyword_patterns(self):
92 | """编译关键字正则表达式模式"""
93 | self.keyword_patterns = []
94 |
95 | for keyword, category, weight in self.keywords:
96 | if keyword.isascii() and re.fullmatch(r'[A-Za-z]+', keyword) and len(keyword) <= 2:
97 | pattern = re.compile(r'\b' + re.escape(keyword) + r'\b', re.IGNORECASE)
98 | else:
99 | pattern = re.compile(re.escape(keyword), re.IGNORECASE)
100 | self.keyword_patterns.append((pattern, keyword, category, weight))
101 |
102 | def detect(self, content: str, source: str) -> List[Dict]:
103 | """检测内容中的关键字匹配"""
104 | results = []
105 |
106 | # 如果没有加载关键字,使用默认关键字
107 | if not self.keywords:
108 | self._load_default_keywords()
109 |
110 | try:
111 | # 对每个关键字模式进行匹配
112 | for pattern, original_keyword, category, weight in self.keyword_patterns:
113 | for match in pattern.finditer(content):
114 | # 获取匹配上下文
115 | context = self._get_context(content, match.start(), match.end())
116 |
117 | # 构建结果
118 | result = {
119 | 'keyword': original_keyword,
120 | 'category': self._get_category_name(category),
121 | 'weight': weight,
122 | 'source': source,
123 | 'context': context,
124 | 'match_position': match.start()
125 | }
126 |
127 | # 避免重复添加相同位置的匹配
128 | if not self._is_duplicate_match(results, result):
129 | results.append(result)
130 |
131 | # 按风险权重排序
132 | results.sort(key=lambda x: x['weight'], reverse=True)
133 |
134 | except Exception as e:
135 | logger.error(f"关键字检测失败: {str(e)}", exc_info=True)
136 |
137 | return results
138 |
139 | def _get_category_name(self, category: str) -> str:
140 | """获取类别的中文名称"""
141 | category_names = {
142 | 'gambling': '博彩',
143 | 'porn': '色情',
144 | 'malware': '恶意软件',
145 | 'phishing': '钓鱼',
146 | 'other': '其他'
147 | }
148 |
149 | return category_names.get(category, '其他')
150 |
151 | def _get_context(self, content: str, start: int, end: int, context_size: int = 50) -> str:
152 | """获取匹配内容的上下文"""
153 | start_context = max(0, start - context_size)
154 | end_context = min(len(content), end + context_size)
155 |
156 | context = content[start_context:end_context]
157 | context = context.replace('\n', ' ').replace('\r', ' ')
158 |
159 | # 截断过长的上下文
160 | if len(context) > 200:
161 | context = context[:100] + '...' + context[-100:]
162 |
163 | return context
164 |
165 | def _is_duplicate_match(self, existing_results: List[Dict], new_result: Dict) -> bool:
166 | """检查是否为重复的匹配"""
167 | # 检查是否在相同位置附近有相同关键字的匹配
168 | position = new_result['match_position']
169 | keyword = new_result['keyword']
170 | source = new_result['source']
171 |
172 | for result in existing_results:
173 | if (result['keyword'] == keyword and
174 | result['source'] == source and
175 | abs(result['match_position'] - position) < 10):
176 | return True
177 |
178 | return False
179 |
180 | def get_keyword_statistics(self) -> Dict:
181 | """获取关键字统计信息"""
182 | stats = {
183 | 'total_keywords': len(self.keywords),
184 | 'by_category': {}
185 | }
186 |
187 | # 按类别统计
188 | for _, category, _ in self.keywords:
189 | category_name = self._get_category_name(category)
190 | if category_name not in stats['by_category']:
191 | stats['by_category'][category_name] = 0
192 | stats['by_category'][category_name] += 1
193 |
194 | return stats
195 |
196 | def add_keyword(self, keyword: str, category: str = 'other', weight: int = 5):
197 | """动态添加关键字"""
198 | # 验证参数
199 | if not keyword or not keyword.strip():
200 | logger.warning("尝试添加空关键字,跳过")
201 | return False
202 |
203 | weight = max(1, min(10, weight)) # 限制在1-10范围内
204 |
205 | valid_categories = ['gambling', 'porn', 'malware', 'phishing', 'other']
206 | if category not in valid_categories:
207 | category = 'other'
208 |
209 | # 检查是否已存在
210 | for existing_keyword, _, _ in self.keywords:
211 | if existing_keyword == keyword:
212 | logger.warning(f"关键字 '{keyword}' 已存在")
213 | return False
214 |
215 | # 添加关键字
216 | self.keywords.append((keyword, category, weight))
217 |
218 | # 编译新的模式
219 | if keyword.isascii() and re.fullmatch(r'[A-Za-z]+', keyword) and len(keyword) <= 2:
220 | pattern = re.compile(r'\b' + re.escape(keyword) + r'\b', re.IGNORECASE)
221 | else:
222 | pattern = re.compile(re.escape(keyword), re.IGNORECASE)
223 | self.keyword_patterns.append((pattern, keyword, category, weight))
224 |
225 | logger.info(f"成功添加关键字: {keyword} (类别: {category}, 权重: {weight})")
226 | return True
227 |
228 | def clear_keywords(self):
229 | """清空所有关键字"""
230 | self.keywords = []
231 | self.keyword_patterns = []
232 | logger.info("已清空所有关键字")
233 |
234 |
--------------------------------------------------------------------------------
/utils/html_utils.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | """
4 | HTML处理工具模块
5 | """
6 |
7 | import re
8 | import logging
9 | from typing import List, Dict, Optional
10 | from bs4 import BeautifulSoup, Comment
11 |
12 | logger = logging.getLogger('YuanZhao.utils.html')
13 |
14 | def clean_html(html_content: str) -> str:
15 | """
16 | 清理HTML内容,去除空白字符等
17 |
18 | Args:
19 | html_content: HTML内容
20 |
21 | Returns:
22 | 清理后的HTML内容
23 | """
24 | try:
25 | # 移除多余的空白字符
26 | html_content = re.sub(r'\s+', ' ', html_content)
27 | # 移除首尾空白
28 | html_content = html_content.strip()
29 | return html_content
30 | except Exception as e:
31 | logger.error(f"清理HTML失败: {str(e)}")
32 | return html_content
33 |
34 | def extract_html_comments(html_content: str) -> List[Dict[str, str]]:
35 | """
36 | 提取HTML注释
37 |
38 | Args:
39 | html_content: HTML内容
40 |
41 | Returns:
42 | 注释列表,每项包含注释内容和位置
43 | """
44 | comments = []
45 |
46 | try:
47 | # 使用正则表达式提取注释
48 | comment_pattern = re.compile(r'<!--(.*?)-->', re.DOTALL)
49 | matches = comment_pattern.finditer(html_content)
50 |
51 | for match in matches:
52 | comment_content = match.group(1)
53 | start_pos = match.start(0)
54 | end_pos = match.end(0)
55 |
56 | comments.append({
57 | 'content': comment_content.strip(),
58 | 'position': (start_pos, end_pos)
59 | })
60 |
61 | except Exception as e:
62 | logger.error(f"提取HTML注释失败: {str(e)}")
63 |
64 | return comments
65 |
66 | def extract_script_tags(html_content: str) -> List[Dict[str, str]]:
67 | """
68 | 提取HTML中的script标签
69 |
70 | Args:
71 | html_content: HTML内容
72 |
73 | Returns:
74 | script标签列表
75 | """
76 | scripts = []
77 |
78 | try:
79 | soup = BeautifulSoup(html_content, 'lxml')
80 | script_tags = soup.find_all('script')
81 |
82 | for script in script_tags:
83 | script_info = {
84 | 'src': script.get('src', ''),
85 | 'content': script.string or '',
86 | 'type': script.get('type', ''),
87 | 'language': script.get('language', '')
88 | }
89 |
90 | # 获取script标签的原始字符串
91 | if script: # 确保script不为None
92 | script_info['original_tag'] = str(script)
93 | else:
94 | script_info['original_tag'] = ''
95 |
96 | scripts.append(script_info)
97 |
98 | except Exception as e:
99 | logger.error(f"提取script标签失败: {str(e)}")
100 |
101 | # 如果BeautifulSoup失败,尝试使用正则表达式
102 | try:
103 | script_pattern = re.compile(r'<script[^>]*>(.*?)</script>', re.DOTALL | re.IGNORECASE)
104 | matches = script_pattern.finditer(html_content)
105 |
106 | for match in matches:
107 | scripts.append({
108 | 'src': '',
109 | 'content': match.group(1) or '',
110 | 'type': '',
111 | 'language': '',
112 | 'original_tag': match.group(0)
113 | })
114 | except Exception as fallback_error:
115 | logger.error(f"正则提取script标签也失败: {str(fallback_error)}")
116 |
117 | return scripts
118 |
119 | def extract_link_tags(html_content: str) -> List[Dict[str, str]]:
120 | """
121 | 提取HTML中的link标签
122 |
123 | Args:
124 | html_content: HTML内容
125 |
126 | Returns:
127 | link标签列表
128 | """
129 | links = []
130 |
131 | try:
132 | soup = BeautifulSoup(html_content, 'lxml')
133 | link_tags = soup.find_all('link')
134 |
135 | for link in link_tags:
136 | links.append({
137 | 'href': link.get('href', ''),
138 | 'rel': link.get('rel', ''),
139 | 'type': link.get('type', ''),
140 | 'original_tag': str(link) if link else ''
141 | })
142 |
143 | except Exception as e:
144 | logger.error(f"提取link标签失败: {str(e)}")
145 |
146 | return links
147 |
148 | def extract_meta_tags(html_content: str) -> List[Dict[str, str]]:
149 | """
150 | 提取HTML中的meta标签
151 |
152 | Args:
153 | html_content: HTML内容
154 |
155 | Returns:
156 | meta标签列表
157 | """
158 | metas = []
159 |
160 | try:
161 | soup = BeautifulSoup(html_content, 'lxml')
162 | meta_tags = soup.find_all('meta')
163 |
164 | for meta in meta_tags:
165 | meta_info = {
166 | 'name': meta.get('name', ''),
167 | 'content': meta.get('content', ''),
168 | 'http-equiv': meta.get('http-equiv', ''),
169 | 'charset': meta.get('charset', ''),
170 | 'original_tag': str(meta) if meta else ''
171 | }
172 | metas.append(meta_info)
173 |
174 | except Exception as e:
175 | logger.error(f"提取meta标签失败: {str(e)}")
176 |
177 | return metas
178 |
179 | def extract_iframe_tags(html_content: str) -> List[Dict[str, str]]:
180 | """
181 | 提取HTML中的iframe标签
182 |
183 | Args:
184 | html_content: HTML内容
185 |
186 | Returns:
187 | iframe标签列表
188 | """
189 | iframes = []
190 |
191 | try:
192 | soup = BeautifulSoup(html_content, 'lxml')
193 | iframe_tags = soup.find_all('iframe')
194 |
195 | for iframe in iframe_tags:
196 | iframes.append({
197 | 'src': iframe.get('src', ''),
198 | 'width': iframe.get('width', ''),
199 | 'height': iframe.get('height', ''),
200 | 'style': iframe.get('style', ''),
201 | 'original_tag': str(iframe) if iframe else ''
202 | })
203 |
204 | except Exception as e:
205 | logger.error(f"提取iframe标签失败: {str(e)}")
206 |
207 | return iframes
208 |
209 | def extract_all_tags(html_content: str, tag_name: str) -> List[BeautifulSoup]:
210 | """
211 | 提取指定标签的所有实例
212 |
213 | Args:
214 | html_content: HTML内容
215 | tag_name: 标签名称
216 |
217 | Returns:
218 | 标签列表
219 | """
220 | tags = []
221 |
222 | try:
223 | soup = BeautifulSoup(html_content, 'lxml')
224 | tags = soup.find_all(tag_name)
225 | except Exception as e:
226 | logger.error(f"提取{tag_name}标签失败: {str(e)}")
227 |
228 | return tags
229 |
230 | def get_dom_structure(html_content: str, max_depth: int = 3) -> Dict:
231 | """
232 | 获取DOM结构概览
233 |
234 | Args:
235 | html_content: HTML内容
236 | max_depth: 最大深度
237 |
238 | Returns:
239 | DOM结构字典
240 | """
241 | try:
242 | soup = BeautifulSoup(html_content, 'lxml')
243 |
244 | def _process_element(element, depth):
245 | if depth > max_depth:
246 | return {}
247 |
248 | tag_info = {
249 | 'tag': element.name,
250 | 'attributes': {k: v for k, v in element.attrs.items()},
251 | 'children': []
252 | }
253 |
254 | for child in element.children:
255 | if hasattr(child, 'name') and child.name:
256 | tag_info['children'].append(_process_element(child, depth + 1))
257 |
258 | return tag_info
259 |
260 | return _process_element(soup.find('html') or soup, 0)
261 |
262 | except Exception as e:
263 | logger.error(f"获取DOM结构失败: {str(e)}")
264 | return {}
265 |
266 | def find_hidden_elements(html_content: str) -> List[Dict[str, str]]:
267 | """
268 | 查找可能被隐藏的元素
269 |
270 | Args:
271 | html_content: HTML内容
272 |
273 | Returns:
274 | 隐藏元素列表
275 | """
276 | hidden_elements = []
277 |
278 | try:
279 | soup = BeautifulSoup(html_content, 'lxml')
280 |
281 | # 查找可能隐藏的元素
282 | for element in soup.find_all():
283 | # 检查style属性
284 | style = element.get('style', '').lower()
285 |
286 | if any(hidden in style for hidden in ['display:none', 'visibility:hidden', 'opacity:0']):
287 | hidden_elements.append({
288 | 'tag': element.name,
289 | 'style': style,
290 | 'content': element.get_text(),
291 | 'original_tag': str(element) if element else ''
292 | })
293 |
294 | # 检查hidden属性
295 | if element.get('hidden') is not None:
296 | hidden_elements.append({
297 | 'tag': element.name,
298 | 'reason': 'hidden attribute',
299 | 'content': element.get_text(),
300 | 'original_tag': str(element) if element else ''
301 | })
302 |
303 | except Exception as e:
304 | logger.error(f"查找隐藏元素失败: {str(e)}")
305 |
306 | return hidden_elements
307 |
308 | def extract_text_from_html(html_content: str) -> str:
309 | """
310 | 从HTML中提取纯文本
311 |
312 | Args:
313 | html_content: HTML内容
314 |
315 | Returns:
316 | 提取的纯文本
317 | """
318 | try:
319 | soup = BeautifulSoup(html_content, 'lxml')
320 |
321 | # 移除script和style标签
322 | for script in soup(['script', 'style']):
323 | if script:
324 | script.decompose()
325 |
326 | # 提取文本
327 | text = soup.get_text(separator=' ', strip=True)
328 |
329 | # 清理空白字符
330 | text = re.sub(r'\s+', ' ', text)
331 |
332 | return text
333 |
334 | except Exception as e:
335 | logger.error(f"提取HTML文本失败: {str(e)}")
336 | return html_content
337 |
338 | def remove_html_tags(html_content: str, keep_whitespace: bool = False) -> str:
339 | """
340 | 移除HTML标签
341 |
342 | Args:
343 | html_content: HTML内容
344 | keep_whitespace: 是否保留空白
345 |
346 | Returns:
347 | 移除标签后的文本
348 | """
349 | try:
350 | # 使用正则表达式移除标签
351 | text = re.sub(r'<[^>]+>', '', html_content)
352 |
353 | if not keep_whitespace:
354 | # 移除多余的空白字符
355 | text = re.sub(r'\s+', ' ', text).strip()
356 |
357 | return text
358 |
359 | except Exception as e:
360 | logger.error(f"移除HTML标签失败: {str(e)}")
361 | return html_content
362 |
363 | def get_character_encoding(html_content: str) -> Optional[str]:
364 | """
365 | 获取HTML文档的字符编码
366 |
367 | Args:
368 | html_content: HTML内容
369 |
370 | Returns:
371 | 字符编码
372 | """
373 | try:
374 | # 检查meta标签中的charset
375 | charset_match = re.search(r'<meta[^>]+charset=["\']?([^"\'>\s]+)', html_content, re.IGNORECASE)
376 | if charset_match:
377 | return charset_match.group(1).lower()
378 |
379 | # 检查http-equiv中的content-type
380 | content_type_match = re.search(r'<meta[^>]+http-equiv=["\']?content-type["\']?[^>]*content=["\']?[^"\']*charset=([^"\'>\s;]+)', html_content, re.IGNORECASE)
381 | if content_type_match:
382 | return content_type_match.group(1).lower()
383 |
384 | return None
385 |
386 | except Exception as e:
387 | logger.error(f"获取字符编码失败: {str(e)}")
388 | return None
389 |
390 | # 兼容性函数,为了支持html_detector.py中的导入
391 | def extract_comments(html_content: str) -> List[Dict[str, str]]:
392 | """
393 | 提取HTML注释(extract_html_comments的别名)
394 |
395 | Args:
396 | html_content: HTML内容
397 |
398 | Returns:
399 | 注释列表
400 | """
401 | return extract_html_comments(html_content)
402 |
--------------------------------------------------------------------------------
/utils/common_utils.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | """
4 | 通用工具模块
5 | """
6 |
7 | import re
8 | import time
9 | import hashlib
10 | import logging
11 | import os
12 | from typing import List, Dict, Any, Optional, Tuple
13 |
14 | logger = logging.getLogger('YuanZhao.utils.common')
15 |
16 | def calculate_file_hash(file_path: str, hash_type: str = 'md5') -> Optional[str]:
17 | """
18 | 计算文件哈希值
19 |
20 | Args:
21 | file_path: 文件路径
22 | hash_type: 哈希算法类型 (md5, sha1, sha256)
23 |
24 | Returns:
25 | 哈希值字符串
26 | """
27 | try:
28 | hash_func = getattr(hashlib, hash_type)
29 | hash_obj = hash_func()
30 |
31 | with open(file_path, 'rb') as f:
32 | while True:
33 | data = f.read(65536) # 64KB chunks
34 | if not data:
35 | break
36 | hash_obj.update(data)
37 |
38 | return hash_obj.hexdigest()
39 |
40 | except Exception as e:
41 | logger.error(f"计算文件哈希失败: {file_path}, 错误: {str(e)}")
42 | return None
43 |
44 | def calculate_string_hash(string: str, hash_type: str = 'md5') -> Optional[str]:
45 | """
46 | 计算字符串哈希值
47 |
48 | Args:
49 | string: 输入字符串
50 | hash_type: 哈希算法类型
51 |
52 | Returns:
53 | 哈希值字符串
54 | """
55 | try:
56 | hash_func = getattr(hashlib, hash_type)
57 | return hash_func(string.encode('utf-8')).hexdigest()
58 | except Exception as e:
59 | logger.error(f"计算字符串哈希失败: {str(e)}")
60 | return None
61 |
62 | def clean_text(text: str) -> str:
63 | """
64 | 清理文本,去除控制字符和多余空白
65 |
66 | Args:
67 | text: 输入文本
68 |
69 | Returns:
70 | 清理后的文本
71 | """
72 | try:
73 | # 移除控制字符,保留换行和制表符
74 | text = ''.join(char for char in text if char.isprintable() or char in '\n\t')
75 | # 清理多余空白
76 | text = re.sub(r'\s+', ' ', text)
77 | return text.strip()
78 | except Exception as e:
79 | logger.error(f"清理文本失败: {str(e)}")
80 | return text
81 |
82 | def extract_text_between(text: str, start_marker: str, end_marker: str) -> List[str]:
83 | """
84 | 提取两个标记之间的文本
85 |
86 | Args:
87 | text: 原始文本
88 | start_marker: 开始标记
89 | end_marker: 结束标记
90 |
91 | Returns:
92 | 提取的文本列表
93 | """
94 | try:
95 | pattern = re.compile(re.escape(start_marker) + '(.*?)' + re.escape(end_marker), re.DOTALL)
96 | return pattern.findall(text)
97 | except Exception as e:
98 | logger.error(f"提取文本失败: {str(e)}")
99 | return []
100 |
101 | def detect_encoding(text: str) -> Optional[str]:
102 | """
103 | 检测文本编码(传入为 str 时返回默认编码)
104 | """
105 | try:
106 | # 对已解码的 str 返回 utf-8,避免误导性“探测”
107 | return 'utf-8'
108 | except Exception as e:
109 | logger.error(f"检测编码失败: {str(e)}")
110 | return None
111 |
112 | def safe_decode(bytes_data: bytes, default_encoding: str = 'utf-8') -> str:
113 | """
114 | 安全解码字节数据
115 |
116 | Args:
117 | bytes_data: 字节数据
118 | default_encoding: 默认编码
119 |
120 | Returns:
121 | 解码后的字符串
122 | """
123 | try:
124 | # 尝试多种编码
125 | encodings = [default_encoding, 'gbk', 'gb2312', 'iso-8859-1']
126 |
127 | for encoding in encodings:
128 | try:
129 | return bytes_data.decode(encoding)
130 | except UnicodeDecodeError:
131 | continue
132 |
133 | # 如果都失败,使用replace模式
134 | return bytes_data.decode(default_encoding, errors='replace')
135 |
136 | except Exception as e:
137 | logger.error(f"安全解码失败: {str(e)}")
138 | return str(bytes_data)
139 |
140 | def format_size(size_bytes: int) -> str:
141 | """
142 | 格式化文件大小
143 |
144 | Args:
145 | size_bytes: 字节大小
146 |
147 | Returns:
148 | 格式化的大小字符串
149 | """
150 | try:
151 | for unit in ['B', 'KB', 'MB', 'GB', 'TB']:
152 | if size_bytes < 1024.0:
153 | return f"{size_bytes:.2f} {unit}"
154 | size_bytes /= 1024.0
155 | return f"{size_bytes:.2f} PB"
156 | except Exception as e:
157 | logger.error(f"格式化大小失败: {str(e)}")
158 | return f"{size_bytes} B"
159 |
160 | def format_time(seconds: float) -> str:
161 | """
162 | 格式化时间
163 |
164 | Args:
165 | seconds: 秒数
166 |
167 | Returns:
168 | 格式化的时间字符串
169 | """
170 | try:
171 | if seconds < 1:
172 | return f"{seconds * 1000:.2f} ms"
173 | elif seconds < 60:
174 | return f"{seconds:.2f} s"
175 | elif seconds < 3600:
176 | minutes, seconds = divmod(seconds, 60)
177 | return f"{int(minutes)} m {seconds:.2f} s"
178 | else:
179 | hours, remainder = divmod(seconds, 3600)
180 | minutes, seconds = divmod(remainder, 60)
181 | return f"{int(hours)} h {int(minutes)} m {seconds:.2f} s"
182 | except Exception as e:
183 | logger.error(f"格式化时间失败: {str(e)}")
184 | return f"{seconds} s"
185 |
186 | def get_file_extension(file_path: str) -> str:
187 | """
188 | 获取文件扩展名
189 |
190 | Args:
191 | file_path: 文件路径
192 |
193 | Returns:
194 | 扩展名(小写)
195 | """
196 | try:
197 | _, ext = os.path.splitext(file_path.lower())
198 | return ext
199 | except Exception as e:
200 | logger.error(f"获取文件扩展名失败: {str(e)}")
201 | return ''
202 |
203 | def validate_ip_address(ip: str) -> bool:
204 | """
205 | 验证IP地址格式
206 |
207 | Args:
208 | ip: IP地址字符串
209 |
210 | Returns:
211 | 是否为有效IP地址
212 | """
213 | try:
214 | # IPv4地址验证
215 | pattern = re.compile(r'^((25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$')
216 | return bool(pattern.match(ip))
217 | except Exception as e:
218 | logger.error(f"验证IP地址失败: {str(e)}")
219 | return False
220 |
221 | def count_occurrences(text: str, keyword: str, case_sensitive: bool = False) -> int:
222 | """
223 | 统计关键字出现次数
224 |
225 | Args:
226 | text: 文本内容
227 | keyword: 关键字
228 | case_sensitive: 是否区分大小写
229 |
230 | Returns:
231 | 出现次数
232 | """
233 | try:
234 | if not case_sensitive:
235 | text = text.lower()
236 | keyword = keyword.lower()
237 |
238 | return text.count(keyword)
239 | except Exception as e:
240 | logger.error(f"统计关键字失败: {str(e)}")
241 | return 0
242 |
243 | def is_valid_email(email: str) -> bool:
244 | """
245 | 验证邮箱格式
246 |
247 | Args:
248 | email: 邮箱地址
249 |
250 | Returns:
251 | 是否为有效邮箱
252 | """
253 | try:
254 | pattern = re.compile(r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$')
255 | return bool(pattern.match(email))
256 | except Exception as e:
257 | logger.error(f"验证邮箱失败: {str(e)}")
258 | return False
259 |
260 | def sanitize_filename(filename: str) -> str:
261 | """
262 | 清理文件名,移除特殊字符
263 |
264 | Args:
265 | filename: 原始文件名
266 |
267 | Returns:
268 | 清理后的文件名
269 | """
270 | try:
271 | # 移除或替换特殊字符
272 | sanitized = re.sub(r'[\\/:*?"<>|]', '_', filename)
273 | # 移除控制字符
274 | sanitized = ''.join(char for char in sanitized if char.isprintable() or char.isspace())
275 | # 限制长度
276 | max_length = 200
277 | if len(sanitized) > max_length:
278 | name, ext = os.path.splitext(sanitized)
279 | sanitized = name[:max_length - len(ext)] + ext
280 | return sanitized.strip() or 'unnamed'
281 | except Exception as e:
282 | logger.error(f"清理文件名失败: {str(e)}")
283 | return 'unnamed'
284 |
285 | def merge_dicts(dict1: Dict, dict2: Dict, deep: bool = True) -> Dict:
286 | """
287 | 合并两个字典
288 |
289 | Args:
290 | dict1: 第一个字典
291 | dict2: 第二个字典
292 | deep: 是否深度合并
293 |
294 | Returns:
295 | 合并后的字典
296 | """
297 | try:
298 | result = dict1.copy()
299 |
300 | if deep:
301 | for key, value in dict2.items():
302 | if key in result and isinstance(result[key], dict) and isinstance(value, dict):
303 | result[key] = merge_dicts(result[key], value, deep=True)
304 | else:
305 | result[key] = value
306 | else:
307 | result.update(dict2)
308 |
309 | return result
310 | except Exception as e:
311 | logger.error(f"合并字典失败: {str(e)}")
312 | return dict1
313 |
314 | def remove_duplicates_preserve_order(items: List) -> List:
315 | """
316 | 移除列表中的重复项,保留原始顺序
317 |
318 | Args:
319 | items: 输入列表
320 |
321 | Returns:
322 | 去重后的列表
323 | """
324 | try:
325 | seen = set()
326 | return [item for item in items if not (item in seen or seen.add(item))]
327 | except Exception as e:
328 | logger.error(f"去重失败: {str(e)}")
329 | return items
330 |
331 | def truncate_text(text: str, max_length: int, suffix: str = '...') -> str:
332 | """
333 | 截断文本
334 |
335 | Args:
336 | text: 输入文本
337 | max_length: 最大长度
338 | suffix: 后缀
339 |
340 | Returns:
341 | 截断后的文本
342 | """
343 | try:
344 | if len(text) <= max_length:
345 | return text
346 | return text[:max_length - len(suffix)] + suffix
347 | except Exception as e:
348 | logger.error(f"截断文本失败: {str(e)}")
349 | return text
350 |
351 | def retry(func, max_retries: int = 3, delay: float = 1.0, exceptions: tuple = (Exception,)) -> Any:
352 | """
353 |     重试包装器:为给定函数增加重试逻辑,返回包装后的函数
354 |
355 | Args:
356 | func: 要重试的函数
357 | max_retries: 最大重试次数
358 | delay: 重试间隔(秒)
359 | exceptions: 捕获的异常类型
360 |
361 | Returns:
362 |         包装后的函数;调用该包装函数时返回原函数的执行结果
363 | """
364 | def wrapper(*args, **kwargs):
365 | last_exception = None
366 |
367 | for attempt in range(max_retries):
368 | try:
369 | return func(*args, **kwargs)
370 | except exceptions as e:
371 | last_exception = e
372 | if attempt < max_retries - 1:
373 | logger.warning(f"尝试 {attempt + 1}/{max_retries} 失败: {str(e)}, {delay}秒后重试...")
374 | time.sleep(delay)
375 |
376 | logger.error(f"所有尝试都失败了: {str(last_exception)}")
377 | raise last_exception
378 |
379 | return wrapper
380 |
381 | # 移除末尾的导入语句
382 |
383 | # 兼容性函数,为了支持html_detector.py中的导入
384 | def extract_text_between_markers(text: str, start_marker: str, end_marker: str) -> List[str]:
385 | """
386 | 提取两个标记之间的文本(extract_text_between的别名)
387 |
388 | Args:
389 | text: 原始文本
390 | start_marker: 开始标记
391 | end_marker: 结束标记
392 |
393 | Returns:
394 | 提取的文本列表
395 | """
396 | return extract_text_between(text, start_marker, end_marker)
397 |
398 | def get_context(text: str, position: int, context_length: int = 50) -> str:
399 | """
400 | 获取文本中指定位置的上下文
401 |
402 | Args:
403 | text: 原始文本
404 | position: 目标位置
405 | context_length: 上下文长度
406 |
407 | Returns:
408 | 包含上下文的文本
409 | """
410 | try:
411 | # 计算上下文的起始和结束位置
412 | context_start = max(0, position - context_length)
413 | context_end = min(len(text), position + context_length)
414 |
415 | # 提取上下文
416 | context = text[context_start:context_end]
417 |
418 | # 添加省略号
419 | prefix = '...' if context_start > 0 else ''
420 | suffix = '...' if context_end < len(text) else ''
421 |
422 | return f"{prefix}{context}{suffix}"
423 | except Exception as e:
424 | logger.error(f"获取上下文失败: {str(e)}")
425 | return text
426 |
427 | def calculate_entropy(text: str) -> float:
428 | """
429 | 计算文本的熵值
430 |
431 | Args:
432 | text: 输入文本
433 |
434 | Returns:
435 | 熵值
436 | """
437 | try:
438 | import math
439 |
440 | # 计算字符频率
441 | frequency = {}
442 | for char in text:
443 | if char in frequency:
444 | frequency[char] += 1
445 | else:
446 | frequency[char] = 1
447 |
448 | # 计算熵
449 | entropy = 0.0
450 | total_chars = len(text)
451 |
452 | for count in frequency.values():
453 | probability = count / total_chars
454 | entropy -= probability * math.log2(probability)
455 |
456 | return entropy
457 | except Exception as e:
458 | logger.error(f"计算熵值失败: {str(e)}")
459 | return 0.0
460 |
461 |
--------------------------------------------------------------------------------
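Usage sketch for the general-purpose helpers above. The import path utils.common_utils and the sample string are assumptions for illustration only; note that retry() returns a wrapper function rather than acting as a decorator.

from pathlib import Path
from utils.common_utils import calculate_entropy, retry, truncate_text

# High entropy hints at encoded or obfuscated payloads.
sample = "aHR0cHM6Ly9leGFtcGxlLmNvbS9oaWRkZW4="
print(f"entropy={calculate_entropy(sample):.2f}")

# retry() wraps a callable with retry logic and returns the wrapper.
read_targets = retry(lambda: Path("targets_test.txt").read_text(encoding="utf-8"),
                     max_retries=2, delay=0.5, exceptions=(OSError,))
print(truncate_text(read_targets(), 60))
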
/YuanZhao.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | """
5 | 渊照 - 暗链扫描工具
6 | """
7 |
8 | import os
9 | import sys
10 | import argparse
11 | import logging
12 | import re
13 | from datetime import datetime
14 | from urllib.parse import urlparse
15 |
16 | # 添加项目根目录到Python路径
17 | sys.path.append(os.path.dirname(os.path.abspath(__file__)))
18 |
19 | from utils.logging_utils import setup_logging, log_config, log_summary
20 | from core.config import Config
21 | from core.scanner import Scanner
22 | from core.reporter import Reporter
23 |
24 | def parse_arguments():
25 | """
26 | 解析命令行参数
27 | """
28 | description = '''渊照 - 专业暗链扫描工具
29 |
30 | 用于智能检测网站、HTML文件或目录中的可疑暗链、隐藏元素和恶意代码。
31 | 支持自动识别扫描目标类型(本地文件/目录、内网URL、公网URL),并应用最优扫描策略。
32 | 提供多种扫描模式和报告格式,具备强大的检测能力和灵活的配置选项。
33 |
34 | 主要功能:
35 | - 基础扫描:HTML代码、JavaScript代码、CSS代码、元标签、注释扫描
36 | - 高级扫描:加密/编码链接检测、隐写术检测、DOM操作检测、iframe检测
37 | - 特殊隐藏手法检测:颜色隐藏、绝对定位隐藏、零宽字符隐藏、字体大小隐藏等
38 | - 关键字匹配:支持自定义关键字文件,按类别组织关键字,进行多语言匹配
39 | - 优化的HTML报告:清晰展示可疑链接信息,上下文列直接显示从日志中检测到的完整问题
40 | '''
41 |
42 | parser = argparse.ArgumentParser(description=description, formatter_class=argparse.RawDescriptionHelpFormatter)
43 |
44 | # 扫描目标
45 | parser.add_argument('target', help='扫描目标:文件路径、目录路径或URL(支持http/https协议)')
46 |
47 | # 扫描配置
48 | parser.add_argument('-d', '--depth', type=int, default=3,
49 | help='递归扫描深度(默认:3,0表示仅扫描当前文件/目录)')
50 | parser.add_argument('-m', '--mode', choices=['fast', 'standard', 'deep'], default='deep',
51 | help='''扫描模式:
52 | fast(基础):仅检测基本的暗链与明显可疑元素,快速
53 | standard(高级):增加JS/HTML/CSS分析与隐藏元素检测
54 | deep(完整):执行全部检测模块,适合深度审计''')
55 | parser.add_argument('-t', '--threads', type=int, default=8,
56 | help='并发线程数(默认:8,范围1-100)')
57 | parser.add_argument('-o', '--output', help='报告输出目录(默认:./reports)')
58 | parser.add_argument('-f', '--format', choices=['txt', 'html', 'json', 'csv'], default='txt',
59 | help='''报告格式(默认:txt):
60 | txt:简洁的文本报告,适合快速查看和日志记录
61 | html:详细的网页报告,包含样式和表格,上下文列直接显示问题链接
62 | json:结构化数据,适合程序处理和自动化集成
63 | csv:表格数据,适合导入电子表格软件进行进一步分析''')
64 |
65 | # 高级配置
66 | parser.add_argument('--timeout', type=int, default=30,
67 | help='请求超时时间(秒,默认:30)。注意:工具会根据目标类型(内网/公网)自动优化超时设置')
68 | parser.add_argument('--proxy', help='''代理设置,格式:
69 | http://username:password@host:port(有认证)或
70 | http://host:port(无认证)''')
71 | parser.add_argument('--keyword-file', help='''自定义关键字文件路径(CSV格式)
72 | 格式示例:关键字,类别,风险权重
73 | 类别可选:gambling(博彩), porn(色情), malware(恶意软件), phishing(钓鱼), other(其他)
74 | 风险权重范围:1-10,10为最高风险''')
75 | parser.add_argument('--exclude', nargs='+', help='排除的文件或目录(支持通配符,如 "*.log" "node_modules/")')
76 | parser.add_argument('--no-color', action='store_true', help='禁用彩色输出')
77 | parser.add_argument('--verbose', action='store_true', default=False, help='显示详细日志信息,包括检测过程和调试内容')
78 |
79 | # 无头浏览器选项
80 | parser.add_argument('--headless', action='store_true', help='启用无头浏览器扫描 (增强动态内容检测)')
81 | parser.add_argument('--browser-type', choices=['chrome'], default='chrome', help='无头浏览器类型 (默认: chrome)')
82 | parser.add_argument('--js-wait', type=int, default=3, help='JavaScript执行等待时间 (秒, 默认: 3)')
83 | parser.add_argument('--headless-timeout', type=int, default=60, help='无头浏览器超时时间 (秒, 默认: 60)')
84 |     parser.add_argument('--headless-binary', help='Chrome二进制路径 (例如: C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe)')
85 |     parser.add_argument('--headless-driver', help='ChromeDriver路径 (例如: C:\\drivers\\chromedriver.exe)')
86 | parser.add_argument('--target-file', help='目标列表文件,每行一个目标')
87 |
88 | # 添加使用示例
89 | parser.epilog = '''
90 | 使用示例:
91 | # 扫描单个HTML文件
92 | python YuanZhao.py test.html
93 |
94 | # 扫描目录及其子目录(深度为2)
95 | python YuanZhao.py ./website -d 2
96 |
97 | # 扫描URL,使用高级模式,保存为HTML格式报告
98 | python YuanZhao.py https://example.com -m standard -f html
99 |
100 | # 使用自定义关键字文件,禁用彩色输出
101 | python YuanZhao.py ./website --keyword-file custom_keywords.txt --no-color
102 |
103 | # 完整扫描公网网站并生成HTML报告(优化后格式,在上下文列显示完整问题链接)
104 | python YuanZhao.py https://example.com -m deep -d 1 -t 8 --timeout 30 -f html --verbose
105 |
106 | # 扫描特定新闻页面并在可疑链接详情中显示问题信息
107 | python YuanZhao.py https://example.com/news.php -m deep -d 1 -t 8 --timeout 30 -f html --verbose
108 |
109 | # 对内网网站进行深度扫描,使用较长超时时间
110 | python YuanZhao.py http://192.168.1.100 -d 4 -m deep --timeout 60 -f html -o intranet_reports
111 |
112 | # 扫描并排除特定文件类型
113 | python YuanZhao.py ./website --exclude "*.log" "temp/*" "node_modules/"
114 |
115 | # 使用无头浏览器增强扫描动态内容
116 | python YuanZhao.py https://example.com --headless --js-wait 5
117 | '''
118 |
119 | return parser.parse_args()
120 |
121 | def validate_arguments(args):
122 | """
123 | 验证命令行参数
124 | """
125 | # 验证目标是否存在(如果是文件或目录)
126 | if not args.target.startswith(('http://', 'https://')):
127 | if not os.path.exists(args.target):
128 | print(f"错误:目标 '{args.target}' 不存在")
129 | return False
130 | if args.target.lower().endswith('.txt'):
131 | try:
132 | with open(args.target, 'r', encoding='utf-8') as f:
133 | lines = [line.strip() for line in f.readlines() if line.strip()]
134 | if not lines:
135 | print("错误:目标列表文件为空")
136 | return False
137 | except Exception:
138 | print("错误:无法读取目标列表文件")
139 | return False
140 |
141 | # 验证关键字文件
142 | if args.keyword_file and not os.path.exists(args.keyword_file):
143 | print(f"错误:关键字文件 '{args.keyword_file}' 不存在")
144 | return False
145 | if args.target_file and not os.path.exists(args.target_file):
146 | print(f"错误:目标列表文件 '{args.target_file}' 不存在")
147 | return False
148 |
149 | # 验证线程数
150 | if args.threads < 1 or args.threads > 100:
151 | print("错误:线程数必须在1-100之间")
152 | return False
153 |
154 | # 验证扫描深度
155 | if args.depth < 0:
156 | print("错误:扫描深度不能为负数")
157 | return False
158 |
159 | return True
160 |
161 | def main():
162 | """
163 | 主函数
164 | """
165 | # 解析参数
166 | args = parse_arguments()
167 |
168 | # 验证参数
169 | if not validate_arguments(args):
170 | sys.exit(1)
171 |
172 | # 创建报告目录
173 | report_dir = args.output or os.path.join(os.getcwd(), 'reports')
174 | os.makedirs(report_dir, exist_ok=True)
175 |
176 | # 设置日志
177 | log_level = logging.DEBUG if args.verbose else logging.INFO
178 | logger = setup_logging(log_dir=report_dir, level=log_level)
179 |
180 | # 记录开始时间
181 | start_time = datetime.now()
182 | logger.info(f"开始扫描:{args.target}")
183 | logger.info(f"扫描模式:{args.mode}")
184 |
185 | # 创建配置
186 | config = Config()
187 |
188 | # 设置配置属性
189 | # 判断目标类型
190 | if args.target.startswith(('http://', 'https://')):
191 | # 检查是否为内网链接
192 | parsed_url = urlparse(args.target)
193 | domain = parsed_url.netloc
194 | # 内网域名/IP特征
195 | if (re.match(r'^127\.0\.0\.1(:\d+)?$', domain) or
196 | re.match(r'^localhost(:\d+)?$', domain) or
197 | re.match(r'^10\.\d+\.\d+\.\d+(:\d+)?$', domain) or
198 | re.match(r'^172\.(?:1[6-9]|2\d|3[01])\.\d+\.\d+(:\d+)?$', domain) or
199 | re.match(r'^192\.168\.\d+\.\d+(:\d+)?$', domain)):
200 | config.target_type = 'internal_url'
201 | else:
202 | config.target_type = 'external_url'
203 | elif os.path.isfile(args.target):
204 | config.target_type = 'local_file'
205 | elif os.path.isdir(args.target):
206 | config.target_type = 'local_directory'
207 | else:
208 | config.target_type = 'unknown'
209 |
210 | config.target = args.target
211 | config.crawl_depth = args.depth
212 | config.depth = args.depth # 同步更新depth属性
213 |
214 | # 映射扫描模式(仅使用新名称)
215 | mode_mapping = {
216 | 'fast': 'fast',
217 | 'standard': 'standard',
218 | 'deep': 'deep'
219 | }
220 | config.scan_mode = mode_mapping.get(args.mode, 'standard')
221 | config.mode = config.scan_mode # 同步更新mode属性
222 | config._set_mode_config() # 更新模式相关配置
223 |
224 | config.threads = args.threads
225 | config.timeout = args.timeout
226 | config.proxy = args.proxy
227 | config.keywords_file = args.keyword_file
228 | config.report_type = args.format
229 | config.report_file = os.path.join(report_dir, f"scan_report_{datetime.now().strftime('%Y%m%d_%H%M%S')}.{args.format}")
230 | config.debug = args.verbose
231 | # 排除规则
232 | config.exclude = args.exclude or []
233 |
234 | # 设置无头浏览器配置
235 | config.use_headless_browser = args.headless
236 | config.headless_browser = args.browser_type
237 | config.js_wait_time = args.js_wait
238 | config.headless_timeout = args.headless_timeout
239 | config.headless_binary = args.headless_binary
240 | config.headless_driver_path = args.headless_driver
241 | if args.headless:
242 | config.headless_auto_download = True
243 |
244 | # 记录配置
245 | log_config(logger, config.get_config_dict())
246 |
247 | try:
248 | targets = []
249 | if args.target_file:
250 | with open(args.target_file, 'r', encoding='utf-8') as f:
251 | targets = [line.strip() for line in f.readlines() if line.strip()]
252 | summary_target = f"目标列表: {args.target_file} ({len(targets)} 项)"
253 | elif not args.target.startswith(('http://', 'https://')) and args.target.lower().endswith('.txt'):
254 | with open(args.target, 'r', encoding='utf-8') as f:
255 | targets = [line.strip() for line in f.readlines() if line.strip()]
256 | summary_target = f"目标列表: {args.target} ({len(targets)} 项)"
257 | else:
258 | targets = [args.target]
259 | summary_target = args.target
260 | agg = {
261 | 'total_files': 0,
262 | 'scanned_files': 0,
263 | 'scanned_urls': 0,
264 | 'total_issues': 0,
265 | 'suspicious_links': [],
266 | 'hidden_elements': [],
267 | 'keyword_matches': [],
268 | 'js_issues': [],
269 | 'css_issues': [],
270 | 'scan_time': 0
271 | }
272 | for tgt in targets:
273 | if tgt.startswith(('http://', 'https://')):
274 | parsed_url = urlparse(tgt)
275 | domain = parsed_url.netloc
276 | if (re.match(r'^127\.0\.0\.1(:\d+)?$', domain) or
277 | re.match(r'^localhost(:\d+)?$', domain) or
278 | re.match(r'^10\.\d+\.\d+\.\d+(:\d+)?$', domain) or
279 | re.match(r'^172\.(?:1[6-9]|2\d|3[01])\.\d+\.\d+(:\d+)?$', domain) or
280 | re.match(r'^192\.168\.\d+\.\d+(:\d+)?$', domain)):
281 | config.target_type = 'internal_url'
282 | else:
283 | config.target_type = 'external_url'
284 | elif os.path.isfile(tgt):
285 | config.target_type = 'local_file'
286 | elif os.path.isdir(tgt):
287 | config.target_type = 'local_directory'
288 | else:
289 | continue
290 | config.target = tgt
291 | scanner = Scanner(config)
292 | res = scanner.scan()
293 | agg['total_files'] += res.get('total_files', 0)
294 | agg['scanned_files'] += res.get('scanned_files', 0)
295 | agg['scanned_urls'] += res.get('scanned_urls', 0)
296 | agg['total_issues'] += res.get('total_issues', 0)
297 | agg['suspicious_links'].extend(res.get('suspicious_links', []))
298 | agg['hidden_elements'].extend(res.get('hidden_elements', []))
299 | agg['keyword_matches'].extend(res.get('keyword_matches', []))
300 | agg['js_issues'].extend(res.get('js_issues', []))
301 | agg['css_issues'].extend(res.get('css_issues', []))
302 | end_time = datetime.now()
303 | duration = str(end_time - start_time)
304 | config.target = summary_target
305 | reporter = Reporter(config)
306 | report_file = reporter.generate_report(agg, duration)
307 | scan_time = (end_time - start_time).total_seconds()
308 | log_summary(
309 | logger,
310 | total_files=agg.get('total_files', 0),
311 | scanned_files=agg.get('scanned_files', 0),
312 | issues_found=agg.get('total_issues', 0),
313 | scan_time=scan_time
314 | )
315 | logger.info(f"扫描完成!报告已保存至:{report_file}")
316 | print(f"\n扫描完成!报告已保存至:{report_file}")
317 |
318 | except Exception as e:
319 | logger.error(f"扫描过程中发生错误:{str(e)}", exc_info=True)
320 | print(f"错误:扫描过程中发生错误 - {str(e)}")
321 | sys.exit(1)
322 |
323 | if __name__ == '__main__':
324 | main()
325 |
326 |
--------------------------------------------------------------------------------
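YuanZhao.py classifies a URL as internal_url or external_url with hand-written regexes for loopback, 10.x, 172.16-31.x and 192.168.x addresses (once before and once inside the target loop). The snippet below is only a sketch of an equivalent check built on the standard-library ipaddress module; it is not how the script currently does it, and hostnames other than localhost would still be treated as external.

import ipaddress
from urllib.parse import urlparse

def is_internal_url(url: str) -> bool:
    # Sketch: treat loopback/private/link-local IP literals and "localhost" as internal.
    host = urlparse(url).hostname or ''
    if host == 'localhost':
        return True
    try:
        addr = ipaddress.ip_address(host)
    except ValueError:
        return False  # not an IP literal, assume external
    return addr.is_private or addr.is_loopback or addr.is_link_local

# is_internal_url('http://192.168.1.100')  -> True
# is_internal_url('https://example.com')   -> False
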
/utils/css_utils.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | """
4 | CSS处理工具模块
5 | """
6 |
7 | import re
8 | import logging
9 | from typing import List, Dict, Optional
10 |
11 | logger = logging.getLogger('YuanZhao.utils.css')
12 |
13 | def extract_css_urls(css_content: str) -> List[Dict[str, str]]:
14 | """
15 | 提取CSS中的URL
16 |
17 | Args:
18 | css_content: CSS内容
19 |
20 | Returns:
21 | URL列表
22 | """
23 | urls = []
24 |
25 | try:
26 | # 匹配CSS中的url()函数
27 | url_pattern = re.compile(r'url\s*\(\s*(["\']?)([^"\'\)]+)\1\s*\)', re.IGNORECASE)
28 | matches = url_pattern.finditer(css_content)
29 |
30 | for match in matches:
31 | url = match.group(2)
32 | start_pos = match.start(0)
33 | end_pos = match.end(0)
34 |
35 | # 获取上下文
36 | context_start = max(0, start_pos - 50)
37 | context_end = min(len(css_content), end_pos + 50)
38 | context = css_content[context_start:context_end]
39 |
40 | urls.append({
41 | 'url': url,
42 | 'original': match.group(0),
43 | 'context': context,
44 | 'position': (start_pos, end_pos)
45 | })
46 |
47 | except Exception as e:
48 | logger.error(f"提取CSS URL失败: {str(e)}")
49 |
50 | return urls
51 |
52 | def extract_import_rules(css_content: str) -> List[Dict[str, str]]:
53 | """
54 | 提取CSS中的@import规则
55 |
56 | Args:
57 | css_content: CSS内容
58 |
59 | Returns:
60 | @import规则列表
61 | """
62 | import_rules = []
63 |
64 | try:
65 | # 匹配@import规则
66 | import_pattern = re.compile(r'@import\s+(["\']?)([^"\';\n]+)\1\s*([^;\n]*)\s*;', re.IGNORECASE)
67 | matches = import_pattern.finditer(css_content)
68 |
69 | for match in matches:
70 | url = match.group(2)
71 | media = match.group(3)
72 | start_pos = match.start(0)
73 | end_pos = match.end(0)
74 |
75 | import_rules.append({
76 | 'url': url,
77 | 'media': media,
78 | 'original': match.group(0),
79 | 'position': (start_pos, end_pos)
80 | })
81 |
82 | except Exception as e:
83 | logger.error(f"提取CSS @import规则失败: {str(e)}")
84 |
85 | return import_rules
86 |
87 | def extract_selectors(css_content: str) -> List[Dict[str, str]]:
88 | """
89 | 提取CSS选择器
90 |
91 | Args:
92 | css_content: CSS内容
93 |
94 | Returns:
95 | 选择器列表
96 | """
97 | selectors = []
98 |
99 | try:
100 | # 移除注释
101 | css_content = remove_css_comments(css_content)
102 |
103 | # 匹配CSS规则
104 | rule_pattern = re.compile(r'([^{]+)\s*{[^}]*}', re.DOTALL)
105 | rules = rule_pattern.finditer(css_content)
106 |
107 | for rule in rules:
108 | selector_text = rule.group(1).strip()
109 |
110 | # 分割多个选择器
111 | for selector in selector_text.split(','):
112 | selector = selector.strip()
113 | if selector:
114 | selectors.append({
115 | 'selector': selector,
116 | 'position': (rule.start(1), rule.end(1))
117 | })
118 |
119 | except Exception as e:
120 | logger.error(f"提取CSS选择器失败: {str(e)}")
121 |
122 | return selectors
123 |
124 | def extract_css_properties(css_content: str) -> List[Dict[str, str]]:
125 | """
126 | 提取CSS属性
127 |
128 | Args:
129 | css_content: CSS内容
130 |
131 | Returns:
132 | CSS属性列表
133 | """
134 | properties = []
135 |
136 | try:
137 | # 移除注释
138 | css_content = remove_css_comments(css_content)
139 |
140 | # 匹配CSS规则体
141 | body_pattern = re.compile(r'\{([^}]*)\}', re.DOTALL)
142 | bodies = body_pattern.finditer(css_content)
143 |
144 | for body in bodies:
145 | body_content = body.group(1)
146 | body_start = body.start(1)
147 |
148 | # 匹配属性
149 | prop_pattern = re.compile(r'([^:;\s]+)\s*:\s*([^;]+);')
150 | props = prop_pattern.finditer(body_content)
151 |
152 | for prop in props:
153 | prop_name = prop.group(1).strip()
154 | prop_value = prop.group(2).strip()
155 |
156 | properties.append({
157 | 'property': prop_name,
158 | 'value': prop_value,
159 | 'position': (body_start + prop.start(1), body_start + prop.end(1))
160 | })
161 |
162 | except Exception as e:
163 | logger.error(f"提取CSS属性失败: {str(e)}")
164 |
165 | return properties
166 |
167 | def detect_hidden_elements(css_content: str) -> List[Dict[str, str]]:
168 | """
169 | 检测可能用于隐藏元素的CSS规则
170 |
171 | Args:
172 | css_content: CSS内容
173 |
174 | Returns:
175 | 隐藏规则列表
176 | """
177 | hidden_rules = []
178 |
179 | # 隐藏元素的属性模式
180 | hiding_patterns = [
181 | (r'display\s*:\s*none', 'display: none'),
182 | (r'visibility\s*:\s*hidden', 'visibility: hidden'),
183 | (r'opacity\s*:\s*0', 'opacity: 0'),
184 | (r'position\s*:\s*absolute.*left\s*:\s*[-+]?\d+(?:\.\d+)?(?:px|em|%)\s*;.*top\s*:\s*[-+]?\d+(?:\.\d+)?(?:px|em|%)\s*;.*width\s*:\s*\d+px\s*;.*height\s*:\s*\d+px', 'absolute positioned tiny element'),
185 | (r'position\s*:\s*absolute.*left\s*:\s*[-+]?\d+(?:\.\d+)?(?:px|em|%)\s*;.*top\s*:\s*[-+]?\d+(?:\.\d+)?(?:px|em|%)', 'absolute positioned'),
186 | (r'overflow\s*:\s*hidden', 'overflow: hidden'),
187 |         (r'clip\s*:\s*rect\(\s*0(?:px)?\s*,?\s*0(?:px)?\s*,?\s*0(?:px)?\s*,?\s*0(?:px)?\s*\)', 'clip: rect'),
188 | (r'font-size\s*:\s*0(?:px)?', 'font-size: 0'),
189 | (r'line-height\s*:\s*0(?:px)?', 'line-height: 0'),
190 | (r'text-indent\s*:\s*[-+]?\d+(?:\.\d+)?(?:px|em|%)', 'text-indent'),
191 | (r'color\s*:\s*transparent', 'color: transparent'),
192 | (r'background-color\s*:\s*transparent', 'background-color: transparent'),
193 | (r'height\s*:\s*0(?:px)?', 'height: 0'),
194 | (r'width\s*:\s*0(?:px)?', 'width: 0'),
195 | ]
196 |
197 | try:
198 | # 移除注释
199 | css_content = remove_css_comments(css_content)
200 |
201 | # 匹配CSS规则
202 | rule_pattern = re.compile(r'([^{]+)\s*{([^}]*)}', re.DOTALL)
203 | rules = rule_pattern.finditer(css_content)
204 |
205 | for rule in rules:
206 | selector = rule.group(1).strip()
207 | body = rule.group(2)
208 | start_pos = rule.start(0)
209 | end_pos = rule.end(0)
210 |
211 | # 检查每个隐藏模式
212 | for pattern_str, hiding_type in hiding_patterns:
213 | pattern = re.compile(pattern_str, re.DOTALL | re.IGNORECASE)
214 |
215 | if pattern.search(body):
216 | hidden_rules.append({
217 | 'type': hiding_type,
218 | 'selector': selector,
219 | 'css': body.strip(),
220 | 'original_rule': rule.group(0),
221 | 'position': (start_pos, end_pos)
222 | })
223 | break # 每个规则只记录一次
224 |
225 | except Exception as e:
226 | logger.error(f"检测隐藏元素失败: {str(e)}")
227 |
228 | return hidden_rules
229 |
230 | def detect_suspicious_selectors(css_content: str) -> List[Dict[str, str]]:
231 | """
232 | 检测可疑的CSS选择器
233 |
234 | Args:
235 | css_content: CSS内容
236 | Returns:
237 | 可疑选择器列表
238 | """
239 | suspicious_selectors = []
240 |
241 | # 可疑选择器模式
242 | suspicious_patterns = [
243 | # 随机字符串类名或ID
244 | (r'\.(\w{8,})[^\w\-]', 'long_random_class'),
245 | (r'#(\w{8,})[^\w\-]', 'long_random_id'),
246 | # 连续数字类名或ID
247 | (r'\.(\d{4,})[^\w\-]', 'numeric_class'),
248 | (r'#(\d{4,})[^\w\-]', 'numeric_id'),
249 | # 特殊字符选择器
250 | (r'[\[\*\+\~\^\$\|]', 'complex_selector'),
251 | ]
252 |
253 | try:
254 | # 移除注释
255 | css_content = remove_css_comments(css_content)
256 |
257 | # 匹配CSS规则
258 | rule_pattern = re.compile(r'([^{]+)\s*{[^}]*}', re.DOTALL)
259 | rules = rule_pattern.finditer(css_content)
260 |
261 | for rule in rules:
262 | selector_text = rule.group(1).strip()
263 |
264 | # 检查每个可疑模式
265 | for pattern_str, selector_type in suspicious_patterns:
266 | pattern = re.compile(pattern_str, re.DOTALL)
267 |
268 | if pattern.search(selector_text):
269 | suspicious_selectors.append({
270 | 'type': selector_type,
271 | 'selector': selector_text,
272 | 'position': (rule.start(1), rule.end(1))
273 | })
274 | break # 每个选择器只记录一次
275 |
276 | except Exception as e:
277 | logger.error(f"检测可疑选择器失败: {str(e)}")
278 |
279 | return suspicious_selectors
280 |
281 | def remove_css_comments(css_content: str) -> str:
282 | """
283 | 移除CSS注释
284 |
285 | Args:
286 | css_content: CSS内容
287 |
288 | Returns:
289 | 移除注释后的CSS内容
290 | """
291 | try:
292 | # 移除CSS注释
293 | css_content = re.sub(r'/\*.*?\*/', '', css_content, flags=re.DOTALL)
294 | return css_content
295 | except Exception as e:
296 | logger.error(f"移除CSS注释失败: {str(e)}")
297 | return css_content
298 |
299 |
300 | def analyze_complexity(css_content: str) -> Dict[str, int]:
301 | """
302 | 分析CSS复杂度
303 |
304 | Args:
305 | css_content: CSS内容
306 |
307 | Returns:
308 | 包含复杂度指标的字典
309 | """
310 | complexity = {
311 | 'rules_count': 0,
312 | 'selectors_count': 0,
313 | 'properties_count': 0,
314 | 'imports_count': 0,
315 | 'media_queries_count': 0
316 | }
317 |
318 | try:
319 | # 移除注释
320 | css_content = remove_css_comments(css_content)
321 |
322 | # 计算规则数量
323 | rule_pattern = re.compile(r'\{[^}]*\}', re.DOTALL)
324 | complexity['rules_count'] = len(rule_pattern.findall(css_content))
325 |
326 | # 计算选择器数量
327 | selectors = extract_selectors(css_content)
328 | complexity['selectors_count'] = len(selectors)
329 |
330 | # 计算属性数量
331 | properties = extract_css_properties(css_content)
332 | complexity['properties_count'] = len(properties)
333 |
334 | # 计算导入规则数量
335 | imports = extract_import_rules(css_content)
336 | complexity['imports_count'] = len(imports)
337 |
338 | # 计算媒体查询数量
339 | media_query_pattern = re.compile(r'@media\s+[^\{]*\{[^}]*\}', re.DOTALL)
340 | complexity['media_queries_count'] = len(media_query_pattern.findall(css_content))
341 |
342 | except Exception as e:
343 | logger.error(f"分析CSS复杂度失败: {str(e)}")
344 |
345 | return complexity
346 |
347 | def extract_css_comments(css_content: str) -> List[Dict[str, str]]:
348 | """
349 | 提取CSS注释
350 |
351 | Args:
352 | css_content: CSS内容
353 |
354 | Returns:
355 | 注释列表
356 | """
357 | comments = []
358 |
359 | try:
360 | comment_pattern = re.compile(r'/\*(.*?)\*/', re.DOTALL)
361 | matches = comment_pattern.finditer(css_content)
362 |
363 | for match in matches:
364 | comment_content = match.group(1).strip()
365 | start_pos = match.start(0)
366 | end_pos = match.end(0)
367 |
368 | comments.append({
369 | 'content': comment_content,
370 | 'position': (start_pos, end_pos)
371 | })
372 |
373 | except Exception as e:
374 | logger.error(f"提取CSS注释失败: {str(e)}")
375 |
376 | return comments
377 |
378 | def analyze_css_complexity(css_content: str) -> Dict[str, int]:
379 | """
380 | 分析CSS复杂度
381 |
382 | Args:
383 | css_content: CSS内容
384 |
385 | Returns:
386 | 复杂度指标
387 | """
388 | try:
389 | # 移除注释
390 | css_content = remove_css_comments(css_content)
391 |
392 | # 计算规则数量
393 | rule_pattern = re.compile(r'[^\s\n\r]+\s*{[^}]*}', re.DOTALL)
394 | rules = rule_pattern.findall(css_content)
395 | rule_count = len(rules)
396 |
397 | # 计算选择器数量
398 | selectors = extract_selectors(css_content)
399 | selector_count = len(selectors)
400 |
401 | # 计算属性数量
402 | properties = extract_css_properties(css_content)
403 | property_count = len(properties)
404 |
405 | # 计算URL数量
406 | urls = extract_css_urls(css_content)
407 | url_count = len(urls)
408 |
409 | return {
410 | 'rule_count': rule_count,
411 | 'selector_count': selector_count,
412 | 'property_count': property_count,
413 | 'url_count': url_count,
414 | 'file_size': len(css_content),
415 | }
416 |
417 | except Exception as e:
418 | logger.error(f"分析CSS复杂度失败: {str(e)}")
419 | return {}
420 |
421 | def find_duplicate_rules(css_content: str) -> List[Dict[str, str]]:
422 | """
423 | 查找重复的CSS规则
424 |
425 | Args:
426 | css_content: CSS内容
427 |
428 | Returns:
429 | 重复规则列表
430 | """
431 | duplicate_rules = []
432 | seen_rules = {}
433 |
434 | try:
435 | # 移除注释
436 | css_content = remove_css_comments(css_content)
437 |
438 | # 匹配CSS规则
439 | rule_pattern = re.compile(r'([^{]+)\s*{([^}]*)}', re.DOTALL)
440 | rules = rule_pattern.finditer(css_content)
441 |
442 | for rule in rules:
443 | selector = rule.group(1).strip()
444 | body = rule.group(2).strip()
445 |
446 | # 使用body作为键,查找重复
447 | if body in seen_rules:
448 | duplicate_rules.append({
449 | 'selector': selector,
450 | 'duplicate_selector': seen_rules[body],
451 | 'css_body': body
452 | })
453 | else:
454 | seen_rules[body] = selector
455 |
456 | except Exception as e:
457 | logger.error(f"查找重复规则失败: {str(e)}")
458 |
459 | return duplicate_rules
460 |
--------------------------------------------------------------------------------
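A minimal sketch of how the CSS helpers above could be driven, assuming the module is importable as utils.css_utils; the sample stylesheet mirrors test.css.

from utils.css_utils import detect_hidden_elements, extract_css_urls

css = '''
.hidden { display: none; }
.banner { background-image: url("https://cdn.example.com/images/hero.jpg"); }
'''

for rule in detect_hidden_elements(css):
    print(rule['type'], '->', rule['selector'])      # display: none -> .hidden

for item in extract_css_urls(css):
    print(item['url'])                               # https://cdn.example.com/images/hero.jpg
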
/core/detector/special_hiding_detector.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | """
4 | 特殊隐藏技术检测器模块
5 | """
6 |
7 | import re
8 | import logging
9 | from typing import List, Dict
10 |
11 | logger = logging.getLogger('YuanZhao.detector.special_hiding')
12 |
13 | class SpecialHidingDetector:
14 | """特殊隐藏技术检测器"""
15 |
16 | def __init__(self, config):
17 | self.config = config
18 | self._init_patterns()
19 |
20 | def _init_patterns(self):
21 | """初始化正则表达式模式"""
22 | # 零宽字符模式
23 | self.zero_width_chars = [
24 | '\u200B', # 零宽空格
25 | '\u200C', # 零宽不连字
26 | '\u200D', # 零宽连字
27 | '\u2060', # 字连接符
28 | '\uFEFF', # 字节顺序标记
29 | ]
30 | self.zero_width_pattern = re.compile('|'.join(re.escape(c) for c in self.zero_width_chars))
31 |
32 | # 空白字符堆积
33 |         self.whitespace_pattern = re.compile(r'\s{10,}')
34 |
35 | # 颜色隐藏(颜色接近背景色)
36 | self.color_pattern = re.compile(
37 | r'color\s*:\s*(#\w{3,6}|rgba?\([^)]+\))',
38 | re.IGNORECASE
39 | )
40 | self.background_color_pattern = re.compile(
41 | r'background-color\s*:\s*(#\w{3,6}|rgba?\([^)]+\))',
42 | re.IGNORECASE
43 | )
44 |
45 | # 绝对定位隐藏(离屏元素)
46 | self.absolute_position_pattern = re.compile(
47 | r'position\s*:\s*absolute.*?(left|top|bottom|right)\s*:\s*(-?\d+(?:\.\d+)?(?:px|em|%)?)',
48 | re.IGNORECASE | re.DOTALL
49 | )
50 |
51 | # 字体大小隐藏
52 | self.font_size_pattern = re.compile(
53 |             r'font-size\s*:\s*(0\.\d+|0)',
54 | re.IGNORECASE
55 | )
56 |
57 | # 文本缩进隐藏
58 | self.text_indent_pattern = re.compile(
59 | r'text-indent\s*:\s*(-\d+(?:\.\d+)?(?:px|em|%))',
60 | re.IGNORECASE
61 | )
62 |
63 | # 透明度隐藏
64 | self.opacity_pattern = re.compile(
65 |             r'opacity\s*:\s*(0\.\d+|0)',
66 | re.IGNORECASE
67 | )
68 | self.visibility_pattern = re.compile(
69 | r'visibility\s*:\s*hidden',
70 | re.IGNORECASE
71 | )
72 | self.display_none_pattern = re.compile(
73 | r'display\s*:\s*none',
74 | re.IGNORECASE
75 | )
76 |
77 | # 多层嵌套隐藏
78 | self.nested_elements_pattern = re.compile(
79 | r'<(div|span|p|a)[^>]*>\s*<(div|span|p|a)[^>]*>\s*<(div|span|p|a)[^>]*>',
80 | re.IGNORECASE
81 | )
82 |
83 | # HTML实体编码隐藏
84 |         self.html_entity_pattern = re.compile(r'&#(\d+);|&#[xX]([0-9a-fA-F]+);')
85 |
86 | # 可疑的编码混合
87 | self.mixed_encoding_pattern = re.compile(
88 | r'https?://(?:[\w\-._~:/?#[\]@!$&\'()*+,;=]|%[0-9a-fA-F]{2})+',
89 | re.IGNORECASE
90 | )
91 |
92 | def detect(self, content: str, source: str) -> List[Dict]:
93 | """检测特殊隐藏技术"""
94 | results = []
95 |
96 | try:
97 | # 检测零宽字符
98 | zero_width_results = self._detect_zero_width_chars(content, source)
99 | results.extend(zero_width_results)
100 |
101 | # 检测空白字符堆积
102 | whitespace_results = self._detect_whitespace(content, source)
103 | results.extend(whitespace_results)
104 |
105 | # 检测颜色隐藏
106 | color_results = self._detect_color_hiding(content, source)
107 | results.extend(color_results)
108 |
109 | # 检测绝对定位隐藏
110 | position_results = self._detect_position_hiding(content, source)
111 | results.extend(position_results)
112 |
113 | # 检测字体大小隐藏
114 | font_size_results = self._detect_font_size_hiding(content, source)
115 | results.extend(font_size_results)
116 |
117 | # 检测文本缩进隐藏
118 | indent_results = self._detect_text_indent_hiding(content, source)
119 | results.extend(indent_results)
120 |
121 | # 检测透明度隐藏
122 | opacity_results = self._detect_opacity_hiding(content, source)
123 | results.extend(opacity_results)
124 |
125 | # 检测多层嵌套隐藏
126 | nested_results = self._detect_nested_elements(content, source)
127 | results.extend(nested_results)
128 |
129 | # 检测HTML实体编码隐藏
130 | entity_results = self._detect_html_entities(content, source)
131 | results.extend(entity_results)
132 |
133 | except Exception as e:
134 | logger.error(f"特殊隐藏技术检测失败: {str(e)}", exc_info=True)
135 |
136 | return results
137 |
138 | def _detect_zero_width_chars(self, content: str, source: str) -> List[Dict]:
139 | """检测零宽字符"""
140 | results = []
141 |
142 | matches = list(self.zero_width_pattern.finditer(content))
143 | if matches:
144 | # 收集所有零宽字符的上下文
145 | context = self._get_context(content, matches[0].start(), matches[-1].end(), 100)
146 |
147 | # 解码隐藏内容(如果可能)
148 | hidden_content = self._extract_hidden_content(content, self.zero_width_chars)
149 |
150 | results.append({
151 | 'link': f'零宽字符隐藏 ({len(matches)}个字符)',
152 | 'source': source,
153 | 'type': 'zero_width_hiding',
154 | 'detection_method': 'regex',
155 | 'risk_level': '高',
156 | 'context': context,
157 | 'hidden_content': hidden_content if hidden_content else None
158 | })
159 |
160 | return results
161 |
162 | def _detect_whitespace(self, content: str, source: str) -> List[Dict]:
163 | """检测空白字符堆积"""
164 | results = []
165 |
166 | for match in self.whitespace_pattern.finditer(content):
167 | # 检查是否在HTML标签之间或注释中
168 | context = self._get_context(content, match.start(), match.end(), 50)
169 |
170 | # 只有在标签之间大量空白才认为可疑
171 | if '<' not in context and '>' not in context:
172 | results.append({
173 |                     'link': f'空白字符堆积 ({len(match.group(0))}个字符)',
174 | 'source': source,
175 | 'type': 'whitespace_hiding',
176 | 'detection_method': 'regex',
177 | 'risk_level': '中',
178 | 'context': context
179 | })
180 |
181 | return results
182 |
183 | def _detect_color_hiding(self, content: str, source: str) -> List[Dict]:
184 | """检测颜色隐藏"""
185 | results = []
186 |
187 | # 找到所有颜色定义
188 | for color_match in self.color_pattern.finditer(content):
189 | color = color_match.group(1)
190 |
191 | # 在同一段落中查找背景颜色
192 | start_pos = max(0, color_match.start() - 200)
193 | end_pos = min(len(content), color_match.end() + 200)
194 | segment = content[start_pos:end_pos]
195 |
196 | bg_match = self.background_color_pattern.search(segment)
197 | if bg_match:
198 | bg_color = bg_match.group(1)
199 |
200 | # 如果颜色非常接近背景色,标记为可疑
201 | if self._colors_are_similar(color, bg_color):
202 | results.append({
203 | 'link': f'颜色隐藏 (文字:{color}, 背景:{bg_color})',
204 | 'source': source,
205 | 'type': 'color_hiding',
206 | 'detection_method': 'regex',
207 | 'risk_level': '高',
208 | 'context': self._get_context(content, color_match.start(), color_match.end())
209 | })
210 |
211 | return results
212 |
213 | def _detect_position_hiding(self, content: str, source: str) -> List[Dict]:
214 | """检测绝对定位隐藏"""
215 | results = []
216 |
217 | for match in self.absolute_position_pattern.finditer(content):
218 | direction = match.group(1).lower()
219 | value = match.group(2)
220 |
221 | # 提取数值部分
222 | num_value = float(re.search(r'([-\d.]+)', value).group(1))
223 |
224 | # 如果位置在屏幕外(非常大的负值或正值)
225 | if abs(num_value) > 1000:
226 | results.append({
227 | 'link': f'绝对定位隐藏 ({direction}:{value})',
228 | 'source': source,
229 | 'type': 'position_hiding',
230 | 'detection_method': 'regex',
231 | 'risk_level': '高',
232 | 'context': self._get_context(content, match.start(), match.end())
233 | })
234 |
235 | return results
236 |
237 | def _detect_font_size_hiding(self, content: str, source: str) -> List[Dict]:
238 | """检测字体大小隐藏"""
239 | results = []
240 |
241 | for match in self.font_size_pattern.finditer(content):
242 | size = match.group(1)
243 |
244 | results.append({
245 | 'link': f'字体大小隐藏 (size:{size})',
246 | 'source': source,
247 | 'type': 'font_size_hiding',
248 | 'detection_method': 'regex',
249 | 'risk_level': '高',
250 | 'context': self._get_context(content, match.start(), match.end())
251 | })
252 |
253 | return results
254 |
255 | def _detect_text_indent_hiding(self, content: str, source: str) -> List[Dict]:
256 | """检测文本缩进隐藏"""
257 | results = []
258 |
259 | for match in self.text_indent_pattern.finditer(content):
260 | indent = match.group(1)
261 |
262 | # 提取数值部分
263 | num_value = float(re.search(r'([-\d.]+)', indent).group(1))
264 |
265 | # 如果缩进很大(负值),可能是隐藏文本
266 | if num_value < -50:
267 | results.append({
268 | 'link': f'文本缩进隐藏 (indent:{indent})',
269 | 'source': source,
270 | 'type': 'text_indent_hiding',
271 | 'detection_method': 'regex',
272 | 'risk_level': '高',
273 | 'context': self._get_context(content, match.start(), match.end())
274 | })
275 |
276 | return results
277 |
278 | def _detect_opacity_hiding(self, content: str, source: str) -> List[Dict]:
279 | """检测透明度隐藏"""
280 | results = []
281 |
282 | # 检测opacity
283 | for match in self.opacity_pattern.finditer(content):
284 | opacity = match.group(1)
285 | results.append({
286 | 'link': f'透明度隐藏 (opacity:{opacity})',
287 | 'source': source,
288 | 'type': 'opacity_hiding',
289 | 'detection_method': 'regex',
290 | 'risk_level': '高',
291 | 'context': self._get_context(content, match.start(), match.end())
292 | })
293 |
294 | # 检测visibility:hidden
295 | for match in self.visibility_pattern.finditer(content):
296 | results.append({
297 | 'link': '可见性隐藏 (visibility:hidden)',
298 | 'source': source,
299 | 'type': 'visibility_hiding',
300 | 'detection_method': 'regex',
301 | 'risk_level': '高',
302 | 'context': self._get_context(content, match.start(), match.end())
303 | })
304 |
305 | # 检测display:none
306 | for match in self.display_none_pattern.finditer(content):
307 | results.append({
308 | 'link': '显示隐藏 (display:none)',
309 | 'source': source,
310 | 'type': 'display_hiding',
311 | 'detection_method': 'regex',
312 | 'risk_level': '高',
313 | 'context': self._get_context(content, match.start(), match.end())
314 | })
315 |
316 | return results
317 |
318 | def _detect_nested_elements(self, content: str, source: str) -> List[Dict]:
319 | """检测多层嵌套隐藏"""
320 | results = []
321 |
322 | for match in self.nested_elements_pattern.finditer(content):
323 | results.append({
324 | 'link': '多层嵌套隐藏',
325 | 'source': source,
326 | 'type': 'nested_hiding',
327 | 'detection_method': 'regex',
328 | 'risk_level': '中',
329 | 'context': self._get_context(content, match.start(), match.end())
330 | })
331 |
332 | return results
333 |
334 | def _detect_html_entities(self, content: str, source: str) -> List[Dict]:
335 | """检测HTML实体编码隐藏"""
336 | results = []
337 |
338 | # 计算HTML实体的密度
339 | entity_matches = list(self.html_entity_pattern.finditer(content))
340 |
341 | # 如果在较短的文本中有大量实体编码,可能是隐藏内容
342 | if len(entity_matches) > 10:
343 | # 尝试解码一些实体看看是否包含可疑内容
344 | sample = content[max(0, entity_matches[0].start() - 20):entity_matches[min(5, len(entity_matches)-1)].end() + 20]
345 |
346 | results.append({
347 | 'link': f'HTML实体编码隐藏 ({len(entity_matches)}个实体)',
348 | 'source': source,
349 | 'type': 'entity_hiding',
350 | 'detection_method': 'regex',
351 | 'risk_level': '中',
352 | 'context': sample
353 | })
354 |
355 | return results
356 |
357 | def _colors_are_similar(self, color1: str, color2: str) -> bool:
358 | """检查两个颜色是否相似"""
359 | # 这是一个简化的实现,实际应用中可能需要更复杂的颜色比较
360 | # 在这里我们只是检查是否完全相同或都是深色/浅色
361 |
362 | # 转换为小写以便比较
363 | color1 = color1.lower()
364 | color2 = color2.lower()
365 |
366 | # 如果完全相同,肯定是相似的
367 | if color1 == color2:
368 | return True
369 |
370 | # 检查是否都是深色(简化判断)
371 | dark_colors = ['#000', '#000000', 'black', 'rgb(0,0,0)']
372 | if color1 in dark_colors and color2 in dark_colors:
373 | return True
374 |
375 | # 检查是否都是白色
376 | white_colors = ['#fff', '#ffffff', 'white', 'rgb(255,255,255)']
377 | if color1 in white_colors and color2 in white_colors:
378 | return True
379 |
380 | return False
381 |
382 | def _extract_hidden_content(self, content: str, markers: List[str]) -> str:
383 | """从内容中提取使用特定标记隐藏的内容"""
384 | # 这个方法可以进一步扩展来提取使用零宽字符编码的隐藏内容
385 | # 目前只是一个简单的实现
386 |
387 | # 移除所有标记字符,看看是否有剩余的有意义内容
388 | clean_content = content
389 | for marker in markers:
390 | clean_content = clean_content.replace(marker, '')
391 |
392 | # 如果清理后的内容与原内容不同,返回清理后的内容(限制长度)
393 | if clean_content != content:
394 | return clean_content.strip()[:200]
395 |
396 | return None
397 |
398 | def _get_context(self, content: str, start: int, end: int, context_size: int = 50) -> str:
399 | """获取匹配内容的上下文"""
400 | start_context = max(0, start - context_size)
401 | end_context = min(len(content), end + context_size)
402 |
403 | context = content[start_context:end_context]
404 | context = context.replace('\n', ' ').replace('\r', ' ')
405 |
406 | # 移除零宽字符以便显示
407 | for char in self.zero_width_chars:
408 | context = context.replace(char, '')
409 |
410 | return context
411 |
--------------------------------------------------------------------------------
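SpecialHidingDetector only stores the config object passed to its constructor, so a bare placeholder is enough to exercise detect() in isolation; the HTML fragment below is an assumed example in the spirit of test_dark_link.html.

from types import SimpleNamespace
from core.detector.special_hiding_detector import SpecialHidingDetector

detector = SpecialHidingDetector(SimpleNamespace())
html = '<div style="position:absolute; left:-9999px;">bet365</div>'

for finding in detector.detect(html, 'test_dark_link.html'):
    # e.g. position_hiding 高 绝对定位隐藏 (left:-9999px)
    print(finding['type'], finding['risk_level'], finding['link'])
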
/utils/js_utils.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | """
4 | JavaScript处理工具模块
5 | """
6 |
7 | import re
8 | import logging
9 | from typing import List, Dict, Any
10 |
11 | logger = logging.getLogger('YuanZhao.utils.js')
12 |
13 | # 常见的可疑JavaScript模式
14 | SUSPICIOUS_PATTERNS = [
15 | # 文档写入相关
16 | r'document\.write\s*\(',
17 | r'document\.writeln\s*\(',
18 | r'document\.createElement\s*\(\s*["\']script["\']\s*\)',
19 |
20 | # DOM操作相关
21 | r'appendChild\s*\(',
22 | r'insertBefore\s*\(',
23 | r'innerHTML\s*=',
24 | r'outerHTML\s*=',
25 |
26 | # 编码解码相关
27 | r'decodeURIComponent\s*\(',
28 | r'decodeURI\s*\(',
29 | r'eval\s*\(',
30 | r'Function\s*\(',
31 | r'fromCharCode\s*\(',
32 |
33 | # URL相关
34 | r'location\.href\s*=',
35 | r'window\.location\s*=',
36 | r'location\.replace\s*\(',
37 | r'location\.assign\s*\(',
38 |
39 | # 定时器相关
40 | r'setTimeout\s*\(',
41 | r'setInterval\s*\(',
42 |
43 | # AJAX相关
44 | r'XMLHttpRequest',
45 | r'fetch\s*\(',
46 | r'axios',
47 |
48 | # 混淆相关
49 | r'\+\s*"', # 字符串拼接
50 | r'["\']\s*\+\s*["\']', # 空字符串拼接
51 | r'\[\d+\]', # 数字索引访问
52 | ]
53 |
54 | def extract_suspicious_patterns(js_content: str) -> List[Dict[str, str]]:
55 | """
56 | 提取可疑的JavaScript模式
57 |
58 | Args:
59 | js_content: JavaScript代码
60 |
61 | Returns:
62 | 可疑模式列表
63 | """
64 | suspicious_matches = []
65 |
66 | try:
67 | for pattern_str in SUSPICIOUS_PATTERNS:
68 | pattern = re.compile(pattern_str, re.IGNORECASE)
69 | matches = pattern.finditer(js_content)
70 |
71 | for match in matches:
72 | code_segment = match.group(0)
73 | start_pos = match.start(0)
74 | end_pos = match.end(0)
75 |
76 | # 获取上下文
77 | context = get_code_context(js_content, start_pos, end_pos)
78 |
79 | suspicious_matches.append({
80 | 'pattern': pattern_str,
81 | 'code_segment': code_segment,
82 | 'context': context,
83 | 'position': (start_pos, end_pos)
84 | })
85 |
86 | except Exception as e:
87 | logger.error(f"提取可疑模式失败: {str(e)}")
88 |
89 | return suspicious_matches
90 |
91 | def get_code_context(js_content: str, start_pos: int, end_pos: int, context_lines: int = 3) -> str:
92 | """
93 | 获取代码上下文
94 |
95 | Args:
96 | js_content: 完整代码
97 | start_pos: 开始位置
98 | end_pos: 结束位置
99 | context_lines: 上下文行数
100 |
101 | Returns:
102 | 包含上下文的代码段
103 | """
104 | try:
105 | # 获取行号
106 | lines = js_content.split('\n')
107 | current_line = 0
108 | char_count = 0
109 |
110 | for i, line in enumerate(lines):
111 | char_count += len(line) + 1 # +1 for newline
112 | if char_count > start_pos:
113 | current_line = i
114 | break
115 |
116 | # 获取上下文行
117 | start_line = max(0, current_line - context_lines)
118 | end_line = min(len(lines), current_line + context_lines + 1)
119 |
120 |         snippet_lines = lines[start_line:end_line]  # 避免与参数 context_lines 同名
121 | 
122 |         return '\n'.join(snippet_lines)
123 |
124 | except Exception as e:
125 | logger.error(f"获取代码上下文失败: {str(e)}")
126 | # 回退到简单的字符上下文
127 | context_start = max(0, start_pos - 100)
128 | context_end = min(len(js_content), end_pos + 100)
129 | return js_content[context_start:context_end]
130 |
131 | def detect_dynamic_urls(js_content: str) -> List[Dict[str, str]]:
132 | """
133 | 检测动态生成的URL
134 |
135 | Args:
136 | js_content: JavaScript代码
137 |
138 | Returns:
139 | 动态URL列表
140 | """
141 | dynamic_urls = []
142 |
143 | # 检测常见的URL赋值模式
144 | url_patterns = [
145 | re.compile(r'(?:href|src|url)\s*=\s*([^;\n]+);', re.DOTALL),
146 | re.compile(r'(?:location\.href|window\.location)\s*=\s*([^;\n]+);', re.DOTALL),
147 | re.compile(r'fetch\s*\(\s*([^)]+)\s*\)', re.DOTALL),
148 | re.compile(r'\.open\s*\(\s*["\'](get|post|put|delete)["\']\s*,\s*([^)]+)\s*\)', re.DOTALL),
149 | ]
150 |
151 | try:
152 | for pattern in url_patterns:
153 | matches = pattern.finditer(js_content)
154 |
155 | for match in matches:
156 | code_segment = match.group(0)
157 | start_pos = match.start(0)
158 | end_pos = match.end(0)
159 |
160 | # 判断是否包含变量或表达式
161 | if any(ch in code_segment for ch in ['+', '\'', '"', '`', '[', ']', '(', ')']):
162 | # 优先尝试从表达式中提取规范化URL常量
163 | url_const = None
164 | m_http = re.search(r'["\'`]\s*(https?://[^"\'`\s]+)\s*["\'`]', code_segment)
165 | if m_http:
166 | url_const = m_http.group(1)
167 | m_proto = re.search(r'["\'`]\s*(//[^"\'`\s]+)\s*["\'`]', code_segment)
168 | if (not url_const) and m_proto:
169 | url_const = 'https:' + m_proto.group(1)
170 | dynamic_urls.append({
171 | 'url': url_const if url_const else None,
172 | 'expression': code_segment,
173 | 'reason': '动态构建的URL',
174 | 'context': get_code_context(js_content, start_pos, end_pos),
175 | 'position': (start_pos, end_pos)
176 | })
177 |
178 | except Exception as e:
179 | logger.error(f"检测动态URL失败: {str(e)}")
180 |
181 | return dynamic_urls
182 |
183 | def detect_obfuscated_code(js_content: str) -> List[Dict[str, str]]:
184 | """
185 | 检测混淆的JavaScript代码
186 |
187 | Args:
188 | js_content: JavaScript代码
189 |
190 | Returns:
191 | 混淆代码列表
192 | """
193 | obfuscated_segments = []
194 |
195 | # 检测常见的混淆模式
196 | obfuscation_patterns = [
197 | # 大量的字符串拼接
198 | (r'("[^"\\]*(?:\\.[^"\\]*)*"\s*\+\s*){3,}', 'multiple_string_concatenation'),
199 | # 长的十六进制字符串
200 | (r'(\\x[0-9a-fA-F]{2}){10,}', 'hex_encoding'),
201 | # Unicode编码
202 | (r'(\\u[0-9a-fA-F]{4}){5,}', 'unicode_encoding'),
203 | # 数组混淆
204 | (r'(\[\s*\d+\s*\]\s*\+){3,}', 'array_obfuscation'),
205 | # eval + 字符串
206 | (r'eval\s*\(\s*["\'](?:[^"\'\\]|\\.)*["\']\s*\)', 'eval_with_string'),
207 | # 大量的变量替换
208 | (r'(var|let|const)\s+[a-z]\s*=\s*[^;]+;\s*[a-z]\s*\+\s*=[^;]+;', 'variable_replacement'),
209 | ]
210 |
211 | try:
212 | for pattern_str, obfuscation_type in obfuscation_patterns:
213 | pattern = re.compile(pattern_str, re.DOTALL)
214 | matches = pattern.finditer(js_content)
215 |
216 | for match in matches:
217 | code_segment = match.group(0)
218 | start_pos = match.start(0)
219 | end_pos = match.end(0)
220 |
221 | obfuscated_segments.append({
222 | 'type': obfuscation_type,
223 | 'code_segment': code_segment,
224 | 'context': get_code_context(js_content, start_pos, end_pos),
225 | 'position': (start_pos, end_pos)
226 | })
227 |
228 | except Exception as e:
229 | logger.error(f"检测混淆代码失败: {str(e)}")
230 |
231 | return obfuscated_segments
232 |
233 | def extract_function_calls(js_content: str, function_name: str) -> List[Dict[str, str]]:
234 | """
235 | 提取特定函数调用
236 |
237 | Args:
238 | js_content: JavaScript代码
239 | function_name: 函数名
240 |
241 | Returns:
242 | 函数调用列表
243 | """
244 | function_calls = []
245 |
246 | try:
247 | # 构建函数调用的正则表达式
248 |         pattern_str = rf'{re.escape(function_name)}\s*\(\s*([^)]*)\s*\)'  # re.escape 转义函数名中的正则特殊字符
249 | pattern = re.compile(pattern_str, re.DOTALL)
250 | matches = pattern.finditer(js_content)
251 |
252 | for match in matches:
253 | full_call = match.group(0)
254 | arguments = match.group(1)
255 | start_pos = match.start(0)
256 | end_pos = match.end(0)
257 |
258 | function_calls.append({
259 | 'function': function_name,
260 | 'arguments': arguments,
261 | 'full_call': full_call,
262 | 'context': get_code_context(js_content, start_pos, end_pos),
263 | 'position': (start_pos, end_pos)
264 | })
265 |
266 | except Exception as e:
267 | logger.error(f"提取函数调用失败: {str(e)}")
268 |
269 | return function_calls
270 |
271 | def detect_document_modification(js_content: str) -> List[Dict[str, str]]:
272 | """
273 | 检测文档修改操作
274 |
275 | Args:
276 | js_content: JavaScript代码
277 |
278 | Returns:
279 | 文档修改操作列表
280 | """
281 | modifications = []
282 |
283 | # 文档修改相关的模式
284 | modification_patterns = [
285 | (r'document\.write\s*\(', 'document.write'),
286 | (r'document\.writeln\s*\(', 'document.writeln'),
287 | (r'innerHTML\s*=', 'innerHTML assignment'),
288 | (r'outerHTML\s*=', 'outerHTML assignment'),
289 | (r'appendChild\s*\(', 'appendChild'),
290 | (r'insertBefore\s*\(', 'insertBefore'),
291 | (r'insertAdjacentHTML\s*\(', 'insertAdjacentHTML'),
292 | (r'createElement\s*\(', 'createElement'),
293 | ]
294 |
295 | try:
296 | for pattern_str, modification_type in modification_patterns:
297 | pattern = re.compile(pattern_str, re.IGNORECASE)
298 | matches = pattern.finditer(js_content)
299 |
300 | for match in matches:
301 | code_segment = match.group(0)
302 | start_pos = match.start(0)
303 | end_pos = match.end(0)
304 |
305 | target = modification_type
306 | value = code_segment
307 | modifications.append({
308 | 'action': 'modify_document',
309 | 'target': target,
310 | 'value': value,
311 | 'description': modification_type,
312 | 'context': get_code_context(js_content, start_pos, end_pos),
313 | 'position': (start_pos, end_pos)
314 | })
315 |
316 | except Exception as e:
317 | logger.error(f"检测文档修改失败: {str(e)}")
318 |
319 | return modifications
320 |
321 | def extract_variable_assignments(js_content: str, variable_name: str) -> List[Dict[str, str]]:
322 | """
323 | 提取变量赋值
324 |
325 | Args:
326 | js_content: JavaScript代码
327 | variable_name: 变量名
328 |
329 | Returns:
330 | 变量赋值列表
331 | """
332 | assignments = []
333 |
334 | try:
335 | # 构建变量赋值的正则表达式
336 |         pattern_str = rf'(?:var|let|const)?\s*{re.escape(variable_name)}\s*=\s*([^;\n]+)'  # re.escape 转义变量名中的正则特殊字符
337 | pattern = re.compile(pattern_str, re.DOTALL)
338 | matches = pattern.finditer(js_content)
339 |
340 | for match in matches:
341 | full_assignment = match.group(0)
342 | value = match.group(1)
343 | start_pos = match.start(0)
344 | end_pos = match.end(0)
345 |
346 | assignments.append({
347 | 'variable': variable_name,
348 | 'value': value,
349 | 'full_assignment': full_assignment,
350 | 'context': get_code_context(js_content, start_pos, end_pos),
351 | 'position': (start_pos, end_pos)
352 | })
353 |
354 | except Exception as e:
355 | logger.error(f"提取变量赋值失败: {str(e)}")
356 |
357 | return assignments
358 |
359 | def extract_comments(js_content: str) -> List[Dict[str, Any]]:
360 | """
361 | 提取JavaScript注释
362 |
363 | Args:
364 | js_content: JavaScript代码
365 |
366 | Returns:
367 | 注释列表
368 | """
369 | comments = []
370 |
371 | try:
372 | # 匹配单行注释
373 | single_line_pattern = re.compile(r'//(.*?)$', re.MULTILINE)
374 | single_line_matches = single_line_pattern.finditer(js_content)
375 |
376 | for match in single_line_matches:
377 | comment_content = match.group(1).strip()
378 | start_pos = match.start(0)
379 | end_pos = match.end(0)
380 |
381 | comments.append({
382 | 'type': 'single_line',
383 | 'content': comment_content,
384 | 'position': (start_pos, end_pos)
385 | })
386 |
387 | # 匹配多行注释
388 | multi_line_pattern = re.compile(r'/\*(.*?)\*/', re.DOTALL)
389 | multi_line_matches = multi_line_pattern.finditer(js_content)
390 |
391 | for match in multi_line_matches:
392 | comment_content = match.group(1).strip()
393 | start_pos = match.start(0)
394 | end_pos = match.end(0)
395 |
396 | comments.append({
397 | 'type': 'multi_line',
398 | 'content': comment_content,
399 | 'position': (start_pos, end_pos)
400 | })
401 |
402 | except Exception as e:
403 | logger.error(f"提取JavaScript注释失败: {str(e)}")
404 |
405 | return comments
406 |
407 | def strip_comments(js_content: str) -> str:
408 | """
409 | 移除JavaScript注释
410 | """
411 | try:
412 | s = js_content
413 | out = []
414 | i = 0
415 | n = len(s)
416 | in_sq = False
417 | in_dq = False
418 | in_bt = False
419 | while i < n:
420 | ch = s[i]
421 | if not in_sq and not in_dq and not in_bt and ch == '/' and i + 1 < n:
422 | nxt = s[i+1]
423 | if nxt == '/':
424 | j = i + 2
425 | while j < n and s[j] not in '\n\r':
426 | j += 1
427 | i = j
428 | continue
429 | if nxt == '*':
430 | j = i + 2
431 | while j + 1 < n and not (s[j] == '*' and s[j+1] == '/'):
432 | j += 1
433 | i = j + 2 if j + 1 < n else n
434 | continue
435 | out.append(ch)
436 | if ch == "'" and not in_dq and not in_bt:
437 | esc = i > 0 and s[i-1] == '\\'
438 | if not esc:
439 | in_sq = not in_sq
440 | elif ch == '"' and not in_sq and not in_bt:
441 | esc = i > 0 and s[i-1] == '\\'
442 | if not esc:
443 | in_dq = not in_dq
444 | elif ch == '`' and not in_sq and not in_dq:
445 | in_bt = not in_bt
446 | i += 1
447 | return ''.join(out)
448 | except Exception as e:
449 | logger.error(f"移除JavaScript注释失败: {str(e)}")
450 | return js_content
451 |
452 | # 兼容性函数,为了支持js_detector.py中的导入
453 | def identify_obfuscated_code(js_content: str) -> Dict[str, Any]:
454 | """
455 | 识别混淆代码并返回聚合信息
456 | """
457 | segments = detect_obfuscated_code(js_content)
458 | is_obf = len(segments) > 0
459 | patterns = [seg.get('type', '') for seg in segments]
460 | sample = segments[0].get('code_segment', '') if segments else ''
461 | return {
462 | 'is_obfuscated': is_obf,
463 | 'detected_patterns': patterns,
464 | 'sample': sample
465 | }
466 |
467 | ## 兼容别名已移除,请使用 detect_document_modification
468 |
469 | ## 兼容别名已移除,请使用 strip_comments
470 |
471 |
--------------------------------------------------------------------------------
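A small sketch combining the helpers above: strip comments first, then scan for suspicious constructs. The import path utils.js_utils and the sample snippet are assumptions for illustration.

from utils.js_utils import strip_comments, extract_suspicious_patterns

js = '''
// loader
var u = "https://bad.example.com/x.js";
document.write('<scr' + 'ipt src="' + u + '"></scr' + 'ipt>');
'''

cleaned = strip_comments(js)
for hit in extract_suspicious_patterns(cleaned):
    print(hit['pattern'], '=>', hit['code_segment'])
# expected hits include document\.write\s*\( and the quote-plus-quote concatenation pattern
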
/utils/network_utils.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | """
4 | 网络处理工具模块
5 | """
6 |
7 | import os
8 | import re
9 | import logging
10 | import requests
11 | import ssl
12 | from typing import Dict, List, Tuple, Optional, Any
13 | from urllib.parse import urlparse, urljoin
14 |
15 | logger = logging.getLogger('YuanZhao.utils.network')
16 |
17 | # 常见URL模式正则表达式
18 | URL_PATTERNS = [
19 | # 标准URL
20 | re.compile(r'https?://[\w\-\.]+(?:\.[\w\-]+)+[\w\-\._~:/?#[\]@!\$&\'\(\)\*\+,;=.]+', re.IGNORECASE),
21 | # 协议相对URL
22 | re.compile(r'//[\w\-\.]+(?:\.[\w\-]+)+[\w\-\._~:/?#[\]@!\$&\'\(\)\*\+,;=.]+', re.IGNORECASE),
23 | # 仅域名
24 | re.compile(r'[a-zA-Z0-9][-a-zA-Z0-9]{0,62}(\.[a-zA-Z0-9][-a-zA-Z0-9]{0,62})+\.?', re.IGNORECASE),
25 | # IP地址形式
26 | re.compile(r'\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b(?::\d{1,5})?', re.IGNORECASE),
27 | # JavaScript伪协议
28 | re.compile(r'javascript:[^\s"\'>]+', re.IGNORECASE),
29 | # data URI
30 | re.compile(r'data:[^;]+;base64,[^\s"\'>]+', re.IGNORECASE),
31 | # 相对路径
32 | re.compile(r'/[^\s"\'>]+', re.IGNORECASE),
33 | ]
34 |
35 |
36 | def normalize_url(url: str, base_url: Optional[str] = None) -> str:
37 | """
38 | 规范化URL
39 |
40 | Args:
41 | url: 原始URL
42 | base_url: 基础URL,用于解析相对路径
43 |
44 | Returns:
45 | 规范化后的URL
46 | """
47 | try:
48 | # 处理双斜杠开头的URL:优先https,或继承base_url协议
49 | if url.startswith('//'):
50 | if base_url:
51 | base_parsed = urlparse(base_url)
52 | scheme = base_parsed.scheme or 'https'
53 | return f'{scheme}:{url}'
54 | return f'https:{url}'
55 |
56 | # 处理相对路径
57 | if base_url and not (url.startswith('http://') or url.startswith('https://')):
58 | return urljoin(base_url, url)
59 |
60 | # 对于纯域名,默认添加https://
61 | parsed = urlparse(url)
62 | if not parsed.scheme:
63 | return f'https://{url}'
64 |
65 | return url
66 |
67 | except Exception as e:
68 | logger.error(f"规范化URL失败: {url}, 错误: {str(e)}")
69 | return url
70 |
71 | def get_url_type(url: str) -> str:
72 | """
73 | 获取URL类型
74 |
75 | Args:
76 | url: URL字符串
77 |
78 | Returns:
79 | URL类型
80 | """
81 | if url.startswith('http://') or url.startswith('https://'):
82 | return 'absolute'
83 | elif url.startswith('//'):
84 | return 'protocol-relative'
85 | elif url.startswith('/'):
86 | return 'root-relative'
87 | else:
88 | return 'relative'
89 |
90 | def check_url_reachability(url: str, timeout: int = 5, headers: Optional[Dict] = None) -> Tuple[bool, Optional[str]]:
91 | """
92 | 检查URL是否可达
93 |
94 | Args:
95 | url: 要检查的URL
96 | timeout: 超时时间(秒)
97 | headers: 请求头
98 |
99 | Returns:
100 | (是否可达, 状态码或错误信息)
101 | """
102 | try:
103 | if headers is None:
104 | headers = {
105 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
106 | }
107 |
108 | response = requests.head(url, timeout=timeout, headers=headers, allow_redirects=True)
109 | return response.status_code < 400, str(response.status_code)
110 |
111 | except requests.exceptions.RequestException as e:
112 | logger.warning(f"URL检查失败: {url}, 错误: {str(e)}")
113 | return False, str(e)
114 |
115 | def validate_url(url: str) -> bool:
116 | """
117 | 验证URL格式是否有效
118 |
119 | Args:
120 | url: 要验证的URL
121 |
122 | Returns:
123 | URL是否有效
124 | """
125 | try:
126 | result = urlparse(url)
127 |
128 | # 对于绝对URL,需要有scheme和netloc
129 | if url.startswith('http://') or url.startswith('https://'):
130 | return all([result.scheme, result.netloc])
131 |
132 | # 对于相对URL,返回True
133 | return True
134 |
135 | except Exception as e:
136 | logger.error(f"URL验证失败: {url}, 错误: {str(e)}")
137 | return False
138 |
139 | def get_domain(url: str) -> Optional[str]:
140 | """
141 | 从URL中提取域名
142 |
143 | Args:
144 | url: URL字符串
145 |
146 | Returns:
147 | 域名
148 | """
149 | try:
150 | parsed = urlparse(url)
151 | return parsed.netloc
152 | except Exception as e:
153 | logger.error(f"提取域名失败: {url}, 错误: {str(e)}")
154 | return None
155 |
156 | def is_external_link(url: str, base_domain: Optional[str] = None) -> bool:
157 | """
158 | 判断是否为外部链接
159 |
160 | Args:
161 | url: 要检查的URL
162 | base_domain: 基础域名
163 |
164 | Returns:
165 | 是否为外部链接
166 | """
167 | url_domain = get_domain(url)
168 | if not url_domain:
169 | return False
170 | if not base_domain:
171 | # 未提供基础域名时,尽量避免误报:只有显式协议的绝对链接视为外部
172 | return url.startswith(('http://', 'https://'))
173 | # 检查是否为同一域名或子域名
174 | # 同域或子域视为内部,其余为外部
175 | return not (url_domain == base_domain or url_domain.endswith(f'.{base_domain}'))
176 |
177 | # 兼容性函数,用于判断字符串是否为URL
178 | def is_url(text: str) -> bool:
179 | """
180 | 判断字符串是否为URL
181 |
182 | Args:
183 | text: 要检查的文本
184 |
185 | Returns:
186 | 是否为URL
187 | """
188 | try:
189 | # 首先检查是否为本地文件,如果是,直接返回False
190 | if os.path.isfile(text) or os.path.isdir(text):
191 | logger.debug(f"{text} 是本地文件或目录,不视为URL")
192 | return False
193 |
194 | # 检查是否以http://或https://开头
195 | if text.startswith(('http://', 'https://')):
196 | return True
197 |
198 | # 过滤典型代码符号,避免误判为URL
199 | if re.search(r"^(document|window|parent|this)\.[A-Za-z_]", text):
200 | return False
201 | if re.search(r"^[A-Za-z_][A-Za-z0-9_]*\s*\(", text):
202 | if not re.search(r"https?://", text):
203 |                 # 若有函数调用前缀,但内容中存在引号包裹的URL片段,则仍视为URL
204 |                 quoted = re.findall(r'"([^"]+)"|\'([^\']+)\'', text)
205 |                 candidates = [q[0] or q[1] for q in quoted]
206 |                 if not any(p.search(seg) for p in URL_PATTERNS for seg in candidates):
207 | return False
208 |
209 | # 检查是否通过URL格式验证
210 | if not validate_url(text):
211 | return False
212 |
213 | # 检查是否匹配至少一个URL模式
214 | for pattern in URL_PATTERNS:
215 | if pattern.search(text):
216 | return True
217 |
218 | return False
219 | except Exception as e:
220 | logger.error(f"URL检查失败: {text}, 错误: {str(e)}")
221 | return False
222 |
223 | # 兼容性函数,validate_url的别名
224 | def is_valid_url(url: str) -> bool:
225 | """
226 | 验证URL格式是否有效(validate_url的别名)
227 |
228 | Args:
229 | url: 要验证的URL
230 |
231 | Returns:
232 | URL是否有效
233 | """
234 | return validate_url(url)
235 |
236 | def get_url_context(text: str, position: Tuple[int, int], context_length: int = 50) -> str:
237 | """
238 | 获取URL在文本中的上下文
239 |
240 | Args:
241 | text: 原始文本
242 | position: URL在文本中的位置 (start, end)
243 | context_length: 上下文长度
244 |
245 | Returns:
246 | 包含上下文的文本
247 | """
248 | start_pos, end_pos = position
249 |
250 | # 计算上下文的起始和结束位置
251 | context_start = max(0, start_pos - context_length)
252 | context_end = min(len(text), end_pos + context_length)
253 |
254 | # 提取上下文
255 | context = text[context_start:context_end]
256 |
257 | # 添加省略号
258 | prefix = '...' if context_start > 0 else ''
259 | suffix = '...' if context_end < len(text) else ''
260 |
261 | return f"{prefix}{context}{suffix}"
262 |
263 | def build_request_session(proxy: Optional[str] = None, timeout: int = 10) -> requests.Session:
264 | """
265 | 构建请求会话
266 |
267 | Args:
268 | proxy: 代理设置
269 | timeout: 超时时间
270 |
271 | Returns:
272 | 请求会话对象
273 | """
274 | session = requests.Session()
275 |
276 | # 设置默认请求头
277 | session.headers.update({
278 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
279 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
280 | 'Accept-Language': 'zh-CN,zh;q=0.9',
281 | })
282 |
283 | # 设置代理
284 | if proxy:
285 | proxies = {
286 | 'http': proxy,
287 | 'https': proxy
288 | }
289 | session.proxies.update(proxies)
290 | logger.info(f"设置代理: {proxy}")
291 |
292 | # 配置HTTPS适配器,启用兼容旧式TLS重协商
293 | class TLSAdapter(requests.adapters.HTTPAdapter):
294 | def init_poolmanager(self, *args, **kwargs):
295 | ctx = ssl.create_default_context()
296 | try:
297 | ctx.options |= getattr(ssl, 'OP_LEGACY_SERVER_CONNECT', 0)
298 | except Exception:
299 | pass
300 | kwargs['ssl_context'] = ctx
301 | return super().init_poolmanager(*args, **kwargs)
302 | def proxy_manager_for(self, *args, **kwargs):
303 | ctx = ssl.create_default_context()
304 | try:
305 | ctx.options |= getattr(ssl, 'OP_LEGACY_SERVER_CONNECT', 0)
306 | except Exception:
307 | pass
308 | kwargs['ssl_context'] = ctx
309 | return super().proxy_manager_for(*args, **kwargs)
310 | try:
311 | session.mount('https://', TLSAdapter(max_retries=3))
312 | except Exception:
313 | pass
314 | # 超时需在请求时传递
315 |
316 | return session
317 |
318 | def fetch_url_content(url: str, session: Optional[requests.Session] = None, **kwargs) -> Optional[Tuple[str, dict]]:
319 | """
320 | 获取URL内容或本地文件内容
321 |
322 | Args:
323 | url: 要获取的URL或本地文件路径
324 | session: 请求会话对象
325 | **kwargs: 其他请求参数
326 |
327 | Returns:
328 | 元组 (内容字符串, 头部信息字典),失败时返回None
329 | """
330 | try:
331 | # 检查是否为本地文件路径
332 | if not url.startswith(('http://', 'https://')):
333 | # 尝试作为本地文件读取
334 | if os.path.isfile(url):
335 | logger.info(f"读取本地文件: {url}")
336 | with open(url, 'r', encoding='utf-8') as f:
337 | content = f.read()
338 | # 返回内容和模拟的头部信息
339 | return content, {'Content-Type': 'text/html'}
340 | else:
341 | logger.error(f"本地文件不存在: {url}")
342 | return None
343 |
344 | # 添加标准浏览器请求头以避免被反爬机制拦截
345 | default_headers = {
346 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
347 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
348 | 'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
349 | 'Accept-Encoding': 'gzip, deflate, br',
350 | 'Connection': 'keep-alive',
351 | 'Upgrade-Insecure-Requests': '1',
352 | 'Cache-Control': 'max-age=0'
353 | }
354 |
355 | # 合并默认请求头和传入的请求头
356 | headers = default_headers.copy()
357 | if 'headers' in kwargs:
358 | headers.update(kwargs['headers'])
359 | kwargs['headers'] = headers
360 |
361 | # 增加重试机制
362 |         timeout = kwargs.pop('timeout', 10)
363 | if session:
364 | response = session.get(url, timeout=timeout, **kwargs)
365 | else:
366 | # 创建临时会话以设置重试策略
367 | temp_session = requests.Session()
368 | adapter = requests.adapters.HTTPAdapter(max_retries=3)
369 | temp_session.mount('http://', adapter)
370 | try:
371 | class TLSAdapter(requests.adapters.HTTPAdapter):
372 | def init_poolmanager(self, *args, **kwargs):
373 | ctx = ssl.create_default_context()
374 | try:
375 | ctx.options |= getattr(ssl, 'OP_LEGACY_SERVER_CONNECT', 0)
376 | except Exception:
377 | pass
378 | kwargs['ssl_context'] = ctx
379 | return super().init_poolmanager(*args, **kwargs)
380 | temp_session.mount('https://', TLSAdapter(max_retries=3))
381 | except Exception:
382 | temp_session.mount('https://', adapter)
383 | response = temp_session.get(url, timeout=timeout, **kwargs)
384 |
385 | response.raise_for_status()
386 |
387 | # 尝试自动检测编码,并在失败时回退到原始字节解码
388 | enc = response.apparent_encoding or response.encoding or 'utf-8'
389 | try:
390 | response.encoding = enc
391 | text = response.text
392 | except Exception:
393 | try:
394 | text = response.content.decode(enc, errors='replace')
395 | except Exception:
396 | text = response.content.decode('utf-8', errors='replace')
397 |
398 | return text, dict(response.headers)
399 |
400 | except requests.exceptions.RequestException as e:
401 | logger.error(f"获取URL内容失败: {url}, 错误: {str(e)}")
402 | return None
403 | except Exception as e:
404 | logger.error(f"读取内容失败: {url}, 错误: {str(e)}")
405 | return None
406 |
407 | # 兼容性函数,为了支持html_detector.py中的导入
408 | def extract_domain(url: str) -> Optional[str]:
409 | """
410 | 从URL中提取域名(get_domain的别名)
411 |
412 | Args:
413 | url: URL字符串
414 |
415 | Returns:
416 | 域名
417 | """
418 | return get_domain(url)
419 |
420 | def analyze_url_risk(url: str) -> Dict[str, Any]:
421 | """
422 | 评估URL风险等级
423 | Returns: {risk_level: int, reason: str}
424 | """
425 | try:
426 | risk = 0
427 | reasons = []
428 | parsed = urlparse(url)
429 | scheme = parsed.scheme.lower()
430 | domain = parsed.netloc.lower()
431 | # 协议风险
432 | if scheme == 'javascript':
433 | risk += 5
434 | reasons.append('JavaScript协议')
435 | elif scheme == 'data':
436 | risk += 4
437 | reasons.append('Data URI')
438 | elif scheme in ('http', 'https'):
439 | risk += 1
440 | # 端口风险
441 | if parsed.port and parsed.port not in [80, 443, 8080, 8443]:
442 | risk += 2
443 | reasons.append('非标准端口')
444 | # 可疑后缀与短链服务
445 | suspicious_tlds = ['pro', 'xyz', 'pw', 'top', 'loan', 'win', 'bid', 'online']
446 | short_link_domains = ['bit.ly', 'goo.gl', 'tinyurl.com', 't.co', 'ow.ly', 'is.gd', 'adf.ly']
447 | if any(domain.endswith('.' + tld) for tld in suspicious_tlds):
448 | risk += 2
449 | reasons.append('高风险域名后缀')
450 | if any(domain.endswith(sl) or domain == sl for sl in short_link_domains):
451 | risk += 3
452 | reasons.append('短链接服务')
453 | # 路径随机性
454 | if re.search(r'/[a-zA-Z0-9]{8,}\.(?:js|php)$', parsed.path):
455 | risk += 1
456 | reasons.append('可疑随机路径')
457 | return {'risk_level': min(risk, 10), 'reason': ', '.join(reasons) or '普通URL'}
458 | except Exception as e:
459 | logger.error(f"URL风险评估失败: {url}, 错误: {str(e)}")
460 | return {'risk_level': 0, 'reason': '评估失败'}
461 |
462 | def extract_urls(text: str, context_type: Optional[str] = None) -> List[Dict[str, Any]]:
463 | """
464 | 从文本中提取所有URL
465 |
466 | Args:
467 | text: 要提取URL的文本
468 |         context_type: URL所在上下文类型(可选)
469 | Returns:
470 | 包含URL和上下文的字典列表
471 | """
472 | results = []
473 | urls_set = set() # 用于去重
474 |
475 | # 增加URL模式匹配
476 | url_patterns = [
477 | re.compile(r'(https?://[\w._~:/?#[\]@!$&\'()*+,-;=]+)', re.IGNORECASE),
478 | re.compile(r'(/[-\w./?%&=]+)', re.IGNORECASE),
479 | re.compile(r'([a-zA-Z0-9][a-zA-Z0-9-]{0,61}[a-zA-Z0-9]\.[a-zA-Z]{2,}(?:/[^\s<>"]*)?)', re.IGNORECASE),
480 | re.compile(r'(javascript:[\w./?%&=;(),\'"`-]+)', re.IGNORECASE),
481 | re.compile(r'(data:[^;]+;base64,[^\s<>"]+)', re.IGNORECASE),
482 | ]
483 |
484 | logger.info(f"开始提取URL,文本长度: {len(text)}")
485 |
486 | for i, pattern in enumerate(url_patterns):
487 | matches = pattern.finditer(text)
488 | match_count = 0
489 |
490 | for match in matches:
491 | match_count += 1
492 | url = match.group(1)
493 | start = max(0, match.start() - 50)
494 | end = min(len(text), match.end() + 50)
495 | context = text[start:end]
496 |
497 | # 清理URL
498 | url = url.strip('"\'')
499 |
500 | # 跳过空URL
501 | if not url or len(url) < 3:
502 | continue
503 |
504 | # 跳过纯数字或不包含有效字符的URL
505 | if re.match(r'^\d+$', url):
506 | continue
507 |
508 | # 基本过滤:非http且非根相对路径、非伪协议时需校验TLD
509 | if not url.lower().startswith(('http://','https://','javascript:','data:')) and not url.startswith('/'):
510 | domain_part = url.split('/', 1)[0]
511 | tld = domain_part.rsplit('.', 1)[-1].lower() if '.' in domain_part else ''
512 | allowed_tlds = {
513 | 'com','org','net','cn','cc','io','me','xyz','tk','ga','gq','ml','cf','edu','gov','mil','biz','info'
514 | }
515 | if tld not in allowed_tlds:
516 | continue
517 | # 去重
518 | if url not in urls_set:
519 | urls_set.add(url)
520 | results.append({
521 | 'url': url,
522 | 'context': context,
523 | 'position': (match.start(), match.end()),
524 | 'context_type': context_type or 'unknown'
525 | })
526 |
527 | logger.debug(f"模式 {i} 匹配到 {match_count} 个URL")
528 |
529 | logger.debug(f"共提取到 {len(results)} 个唯一URL")
530 | return results
531 | EXTRA_PATTERNS = [
532 | # 扩展的HTTP/HTTPS URL
533 | re.compile(r'https?://[\w\-\.]+(?:\.[\w\-]+)+[\w\-\._~:/?#[\]@!\$&\'\(\)\*\+,;=.]+', re.IGNORECASE),
534 | # 没有协议的域名
535 | re.compile(r'\b[\w\-\.]+(?:\.[\w\-]+)+\b(?::\d{1,5})?/[\w\-\._~:/?#[\]@!\$&\'\(\)\*\+,;=.]*', re.IGNORECASE),
536 | # JavaScript伪协议
537 | re.compile(r'javascript:[^\s"\'>]+', re.IGNORECASE),
538 | # data URI
539 | re.compile(r'data:[^;]+;base64,[^\s"\'>]+', re.IGNORECASE),
540 | # 相对路径
541 | re.compile(r'\/[^\s"\'>]+', re.IGNORECASE),
542 | ]
543 | # 模式去重:基于正则字符串与flags,避免重复匹配与性能开销
544 | _unique_patterns = []
545 | _seen = set()
546 | for _pat in URL_PATTERNS + EXTRA_PATTERNS:
547 | _key = (_pat.pattern, _pat.flags)
548 | if _key not in _seen:
549 | _seen.add(_key)
550 | _unique_patterns.append(_pat)
551 | URL_PATTERNS = _unique_patterns
552 |
--------------------------------------------------------------------------------
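
A minimal usage sketch for the helpers in utils/network_utils.py above. This is not part of the repository: the import path assumes the scanner is run from the repository root, and the sample URL and base domain are illustrative only; the fields read here mirror the dictionaries returned by extract_urls() and analyze_url_risk().

    # Hypothetical example, not part of the repository.
    from utils.network_utils import (
        extract_urls, analyze_url_risk, is_external_link,
        build_request_session, fetch_url_content,
    )

    snippet = '<a href="https://bet365-promo.xyz/r/a1b2c3d4e5f6.js">hidden</a>'
    for item in extract_urls(snippet, context_type='html'):
        risk = analyze_url_risk(item['url'])
        print(item['url'],
              risk['risk_level'],
              risk['reason'],
              is_external_link(item['url'], base_domain='example.com'))

    # Fetch a page with the retry/TLS-tolerant session and reuse the result.
    session = build_request_session(proxy=None, timeout=10)
    fetched = fetch_url_content('https://example.com', session=session)
    if fetched:
        html, headers = fetched
        print(len(html), headers.get('Content-Type'))
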
/core/detector/headless_browser_detector.py:
--------------------------------------------------------------------------------
1 | """无头浏览器检测器模块
2 |
3 | 用于通过Chrome无头浏览器检测动态生成的暗链和隐藏内容。
4 | 支持检测JavaScript动态生成的内容、DOM操作、iframe内容等。
5 | """
6 | import logging
7 | from typing import List, Dict, Any, Optional
8 | from core.config import Config
9 |
10 | class HeadlessBrowserDetector:
11 | """无头浏览器检测器类"""
12 |
13 | def __init__(self, config: Config):
14 | """初始化无头浏览器检测器
15 |
16 | Args:
17 | config: 配置对象
18 | """
19 | self.config = config
20 | self.logger = logging.getLogger(__name__)
21 | self.driver = None
22 | self._initialize_driver()
23 |
24 | def _initialize_driver(self):
25 | """初始化Chrome无头浏览器驱动"""
26 | try:
27 | # 动态导入,避免在不使用时产生依赖问题
28 | from selenium import webdriver
29 | from selenium.webdriver.chrome.options import Options
30 | from selenium.webdriver.chrome.service import Service
31 | import os
32 | driver_path = getattr(self.config, 'headless_driver_path', None)
33 | binary_path = getattr(self.config, 'headless_binary', None)
34 |
35 | # 创建Chrome选项
36 | chrome_options = Options()
37 | if binary_path:
38 | chrome_options.binary_location = binary_path
39 | chrome_options.add_argument('--headless') # 无头模式
40 | chrome_options.add_argument('--disable-gpu') # 禁用GPU加速
41 | chrome_options.add_argument('--no-sandbox') # 禁用沙箱
42 | chrome_options.add_argument('--disable-dev-shm-usage') # 解决内存问题
43 | chrome_options.add_argument('--window-size=1920,1080') # 设置窗口大小
44 | chrome_options.add_argument('--log-level=3') # 减少日志输出
45 |
46 | # 选择驱动来源:优先本地路径;否则在允许时自动下载
47 | if driver_path and os.path.exists(driver_path):
48 | service = Service(driver_path)
49 | else:
50 | if getattr(self.config, 'headless_auto_download', False):
51 | from webdriver_manager.chrome import ChromeDriverManager
52 | service = Service(ChromeDriverManager().install())
53 | else:
54 | self.logger.error("未提供本地驱动路径且未启用自动下载,跳过无头浏览器初始化")
55 | return
56 |
57 | # 创建浏览器驱动
58 | self.driver = webdriver.Chrome(service=service, options=chrome_options)
59 |
60 | # 设置超时时间
61 | self.driver.set_page_load_timeout(self.config.headless_timeout)
62 | self.driver.set_script_timeout(self.config.headless_timeout)
63 |
64 | self.logger.info("Chrome无头浏览器初始化成功")
65 |
66 | except ImportError as e:
67 | self.logger.error(f"缺少无头浏览器相关依赖: {str(e)}")
68 | self.logger.error("请安装依赖: pip install selenium webdriver-manager")
69 | except Exception as e:
70 | self.logger.error(f"无头浏览器初始化失败: {str(e)}")
71 |
72 | def close(self):
73 | """释放浏览器驱动资源"""
74 | try:
75 | if self.driver:
76 | self.driver.quit()
77 | self.driver = None
78 | self.logger.info("已释放无头浏览器驱动")
79 | except Exception as e:
80 | self.logger.error(f"释放无头浏览器驱动失败: {str(e)}")
81 |
82 | def __del__(self):
83 | try:
84 | self.close()
85 | except Exception:
86 | pass
87 |
88 |     def detect(self, url: str, content: Optional[str] = None) -> List[Dict[str, Any]]:
89 | """使用无头浏览器检测暗链
90 |
91 | Args:
92 | url: 要检测的URL
93 | content: 可选,页面内容(如果已获取)
94 |
95 | Returns:
96 | 检测结果列表
97 | """
98 | results = []
99 |
100 | if not self.driver:
101 | self.logger.error("无头浏览器未初始化,跳过检测")
102 | return results
103 |
104 | try:
105 | from selenium.webdriver.support.ui import WebDriverWait
106 | # 加载页面
107 | self.logger.info(f"无头浏览器正在加载页面: {url}")
108 | self.driver.get(url)
109 |
110 | # 等待JavaScript执行完成
111 | try:
112 | WebDriverWait(self.driver, self.config.js_wait_time).until(
113 | lambda d: d.execute_script("return document.readyState") in ("complete", "interactive")
114 | )
115 | except Exception:
116 | pass
117 | self.logger.info(f"等待页面加载/JS执行完成 (<= {self.config.js_wait_time}秒)")
118 |
119 | # 执行各项检测
120 | self.logger.info("开始执行动态链接检测")
121 | dynamic_links = self._detect_dynamic_links()
122 | results.extend(dynamic_links)
123 |
124 | self.logger.info("开始执行DOM操作检测")
125 | dom_operations = self._detect_dom_manipulations()
126 | results.extend(dom_operations)
127 |
128 | self.logger.info("开始执行iframe内容检测")
129 | iframe_content = self._detect_iframe_content()
130 | results.extend(iframe_content)
131 |
132 | self.logger.info("开始执行隐藏元素检测")
133 | hidden_elements = self._detect_hidden_elements()
134 | results.extend(hidden_elements)
135 |
136 | self.logger.info(f"无头浏览器检测完成,发现 {len(results)} 个可疑项")
137 |
138 | except Exception as e:
139 | self.logger.error(f"无头浏览器检测过程中出错: {str(e)}")
140 |
141 | return results
142 |
143 | def _detect_dynamic_links(self) -> List[Dict[str, Any]]:
144 | """检测动态生成的链接
145 |
146 | Returns:
147 | 检测到的可疑链接列表
148 | """
149 | results = []
150 |
151 | try:
152 | from selenium.webdriver.common.by import By
153 | # 获取所有链接元素
154 | links = self.driver.find_elements(By.TAG_NAME, 'a')
155 | self.logger.info(f"发现 {len(links)} 个链接元素")
156 |
157 | for link in links:
158 | try:
159 | href = link.get_attribute('href')
160 | if href:
161 | # 分析链接风险(使用现有工具类)
162 | from utils.network_utils import analyze_url_risk
163 | risk_info = analyze_url_risk(href)
164 |
165 | if risk_info['risk_level'] > 0:
166 | text = link.text.strip()[:100] # 限制文本长度
167 | results.append({
168 | 'type': 'suspicious_url',
169 | 'url': href,
170 | 'risk_level': risk_info['risk_level'],
171 | 'context': f"动态生成链接: {text}",
172 | 'detection_method': 'headless_browser',
173 | 'element': 'a',
174 | 'risk_reason': risk_info.get('reason', '未知风险')
175 | })
176 | except Exception as e:
177 | self.logger.error(f"分析动态链接时出错: {str(e)}")
178 | except Exception as e:
179 | self.logger.error(f"获取链接元素时出错: {str(e)}")
180 |
181 | return results
182 |
183 | def _detect_dom_manipulations(self) -> List[Dict[str, Any]]:
184 | """检测可疑的DOM操作
185 |
186 | Returns:
187 | 检测到的可疑DOM操作列表
188 | """
189 | results = []
190 |
191 | # 注入JavaScript以检测可疑的DOM操作
192 | monitor_script = r"""
193 |         return (function() {
194 | const suspiciousPatterns = [];
195 |
196 | // 初始化正则表达式
197 | const eval_pattern = /eval[\s]*\(/;
198 | const doc_write_pattern = /document\.write[\s]*\(/;
199 | const innerhtml_pattern = /innerHTML[\s]*=/;
200 | const base64_pattern = /base64/i;
201 | const fromCharCode_pattern = /fromCharCode/;
202 | const escape_pattern = /escape[\s]*\(/;
203 | const unescape_pattern = /unescape[\s]*\(/;
204 |
205 | // 检测可疑的JavaScript代码模式
206 | const scriptElements = document.querySelectorAll('script');
207 | scriptElements.forEach(script => {
208 | if (script.textContent) {
209 | const content = script.textContent;
210 | if (eval_pattern.test(content) ||
211 | doc_write_pattern.test(content) ||
212 | innerhtml_pattern.test(content) ||
213 | base64_pattern.test(content) ||
214 | fromCharCode_pattern.test(content) ||
215 | escape_pattern.test(content) ||
216 | unescape_pattern.test(content)) {
217 | suspiciousPatterns.push({
218 | type: 'suspicious_script',
219 | content: content.substring(0, 200) + '...',
220 | lineCount: content.split('\n').length
221 | });
222 | }
223 | }
224 | });
225 |
226 | // 检测动态创建的元素
227 | const dynamicElements = [];
228 | document.querySelectorAll('*').forEach(element => {
229 | if (element.tagName === 'SCRIPT' && element.getAttribute('src') === null &&
230 | element.textContent.length > 50) {
231 | dynamicElements.push({tag: element.tagName, type: 'inline_script'});
232 | }
233 | if (element.tagName === 'IFRAME') {
234 | dynamicElements.push({tag: element.tagName, src: element.getAttribute('src')});
235 | }
236 | });
237 |
238 | return {suspiciousPatterns, dynamicElements};
239 | })();
240 | """
241 |
242 | try:
243 | result = self.driver.execute_script(monitor_script)
244 |
245 | # 分析可疑脚本模式
246 | for pattern in result['suspiciousPatterns']:
247 | risk_level = 8 # 较高风险
248 | results.append({
249 | 'type': 'suspicious_dom_operation',
250 | 'technique': pattern['type'],
251 | 'risk_level': risk_level,
252 | 'context': f"检测到可疑脚本模式: {pattern['content']}",
253 | 'detection_method': 'headless_browser',
254 | 'risk_reason': '包含可疑JavaScript操作函数'
255 | })
256 |
257 | # 分析动态创建的元素
258 | for element in result['dynamicElements']:
259 | if element['tag'] == 'IFRAME' and element.get('src'):
260 | from utils.network_utils import analyze_url_risk
261 | risk_info = analyze_url_risk(element['src'])
262 | if risk_info['risk_level'] > 0:
263 | results.append({
264 | 'type': 'suspicious_iframe',
265 | 'url': element['src'],
266 | 'risk_level': risk_info['risk_level'],
267 | 'context': f"动态创建的iframe",
268 | 'detection_method': 'headless_browser',
269 | 'risk_reason': risk_info.get('reason', '可疑iframe')
270 | })
271 | except Exception as e:
272 | self.logger.error(f"检测DOM操作时出错: {str(e)}")
273 |
274 | return results
275 |
276 | def _detect_iframe_content(self) -> List[Dict[str, Any]]:
277 | """检测iframe中的内容
278 |
279 | Returns:
280 | 检测到的iframe中的可疑内容列表
281 | """
282 | results = []
283 |
284 | try:
285 | from selenium.webdriver.common.by import By
286 | # 获取所有iframe
287 | iframes = self.driver.find_elements(By.TAG_NAME, 'iframe')
288 | self.logger.info(f"发现 {len(iframes)} 个iframe元素")
289 |
290 | for index, iframe in enumerate(iframes):
291 | try:
292 | iframe_src = iframe.get_attribute('src')
293 | self.logger.info(f"处理iframe {index + 1}/{len(iframes)}: {iframe_src or '无src属性'}")
294 |
295 | # 分析iframe的src属性
296 | if iframe_src:
297 | from utils.network_utils import analyze_url_risk
298 | risk_info = analyze_url_risk(iframe_src)
299 |
300 | if risk_info['risk_level'] > 0:
301 | results.append({
302 | 'type': 'suspicious_iframe',
303 | 'url': iframe_src,
304 | 'risk_level': risk_info['risk_level'],
305 | 'context': f"iframe中的可疑链接",
306 | 'detection_method': 'headless_browser',
307 | 'risk_reason': risk_info.get('reason', '可疑iframe源')
308 | })
309 |
310 | # 尝试切换到iframe上下文分析内容
311 | try:
312 | self.driver.switch_to.frame(iframe)
313 |
314 | # 获取iframe中的链接
315 | iframe_links = self.driver.find_elements(By.TAG_NAME, 'a')
316 | for link in iframe_links:
317 | href = link.get_attribute('href')
318 | if href:
319 | from utils.network_utils import analyze_url_risk
320 | risk_info = analyze_url_risk(href)
321 |
322 | if risk_info['risk_level'] > 0:
323 | results.append({
324 | 'type': 'suspicious_url',
325 | 'url': href,
326 | 'risk_level': risk_info['risk_level'],
327 | 'context': f"iframe内部的可疑链接",
328 | 'detection_method': 'headless_browser',
329 | 'risk_reason': risk_info.get('reason', 'iframe内部链接风险')
330 | })
331 | except Exception as iframe_e:
332 | self.logger.error(f"分析iframe内容时出错: {str(iframe_e)}")
333 | finally:
334 | # 确保切回主文档
335 | self.driver.switch_to.default_content()
336 |
337 | except Exception as e:
338 | self.logger.error(f"处理iframe时出错: {str(e)}")
339 |
340 | except Exception as e:
341 | self.logger.error(f"获取iframe元素时出错: {str(e)}")
342 |
343 | return results
344 |
345 | def _detect_hidden_elements(self) -> List[Dict[str, Any]]:
346 | """检测视觉上隐藏的元素
347 |
348 | Returns:
349 | 检测到的隐藏元素列表
350 | """
351 | results = []
352 |
353 | # 注入JavaScript获取隐藏元素
354 | hidden_elements_script = """
355 |         return (function() {
356 | const hiddenElements = [];
357 |
358 | // 获取所有元素
359 | const allElements = document.querySelectorAll('*');
360 |
361 | allElements.forEach(element => {
362 | const style = window.getComputedStyle(element);
363 | const rect = element.getBoundingClientRect();
364 |
365 | // 检查各种隐藏技术
366 | const isHidden =
367 | style.display === 'none' ||
368 | style.visibility === 'hidden' ||
369 | style.opacity === '0' ||
370 | rect.width <= 1 ||
371 | rect.height <= 1 ||
372 | parseInt(style.fontSize) <= 0 ||
373 | element.offsetParent === null;
374 |
375 | // 检查绝对定位隐藏
376 | const isAbsPosHidden =
377 | style.position === 'absolute' &&
378 | (parseInt(style.left) < -1000 || parseInt(style.top) < -1000 ||
379 | parseInt(style.right) < -1000 || parseInt(style.bottom) < -1000);
380 |
381 | // 检查文本颜色与背景色相同
382 | const textColor = style.color;
383 | const bgColor = style.backgroundColor || style.background;
384 | const isSameColor = textColor === bgColor && textColor !== 'rgba(0, 0, 0, 0)';
385 |
386 | // 检查是否包含链接或文本
387 | const hasLinks = element.querySelector('a') !== null;
388 | const hasText = element.textContent.trim().length > 0;
389 | const hasContent = hasLinks || hasText;
390 |
391 | if ((isHidden || isAbsPosHidden || isSameColor) && hasContent) {
392 | // 获取元素中的链接(如果有)
393 | const links = [];
394 | if (hasLinks) {
395 | const linkElements = element.querySelectorAll('a');
396 | linkElements.forEach(link => {
397 | const href = link.getAttribute('href');
398 | if (href) links.push(href);
399 | });
400 | }
401 |
402 | hiddenElements.push({
403 | tagName: element.tagName,
404 | id: element.id || '无ID',
405 | classes: element.className || '无类名',
406 | hiddenBy: isSameColor ? 'color_matching' :
407 | isAbsPosHidden ? 'absolute_position' : 'visibility',
408 | content: element.textContent.trim().substring(0, 200) + '...',
409 | hasLinks: hasLinks,
410 | links: links,
411 | textColor: textColor,
412 | bgColor: bgColor
413 | });
414 | }
415 | });
416 |
417 | return hiddenElements;
418 | })();
419 | """
420 |
421 | try:
422 | hidden_elements = self.driver.execute_script(hidden_elements_script)
423 | self.logger.info(f"发现 {len(hidden_elements)} 个隐藏元素")
424 |
425 | for elem in hidden_elements:
426 | # 计算风险等级
427 | risk_level = 8 if elem['hasLinks'] else 6
428 |
429 | # 构建风险描述
430 | context = f"隐藏元素 ({elem['tagName']}): {elem['content']}"
431 | if elem['hasLinks']:
432 | context += f" 包含 {len(elem['links'])} 个链接"
433 |
434 | result_item = {
435 | 'type': 'hidden_element',
436 | 'technique': elem['hiddenBy'],
437 | 'risk_level': risk_level,
438 | 'context': context,
439 | 'detection_method': 'headless_browser',
440 | 'risk_reason': '视觉上隐藏的元素可能包含暗链'
441 | }
442 |
443 | # 如果有链接,添加链接信息
444 | if elem['hasLinks'] and elem['links']:
445 | result_item['hidden_links'] = elem['links']
446 |
447 | results.append(result_item)
448 |
449 | # 对于包含链接的隐藏元素,分别记录每个链接
450 | if elem['hasLinks'] and elem['links']:
451 | for link in elem['links']:
452 | from utils.network_utils import analyze_url_risk
453 | risk_info = analyze_url_risk(link)
454 | results.append({
455 | 'type': 'suspicious_url',
456 | 'url': link,
457 | 'risk_level': max(risk_level, risk_info['risk_level']),
458 | 'context': f"隐藏元素中的链接: {link}",
459 | 'detection_method': 'headless_browser',
460 | 'risk_reason': f"隐藏在{elem['hiddenBy']}类型的{elem['tagName']}元素中"
461 | })
462 |
463 | except Exception as e:
464 | self.logger.error(f"检测隐藏元素时出错: {str(e)}")
465 |
466 | return results
467 |
--------------------------------------------------------------------------------
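
A minimal sketch of driving the HeadlessBrowserDetector above. The Config attributes set here (headless_auto_download, headless_timeout, js_wait_time) mirror the names read in _initialize_driver() and detect(); that Config() can be constructed without arguments, and the values chosen, are assumptions for illustration only.

    # Hypothetical example, not part of the repository.
    from core.config import Config
    from core.detector.headless_browser_detector import HeadlessBrowserDetector

    config = Config()                      # assumed no-arg constructor
    config.headless_auto_download = True   # let webdriver-manager fetch ChromeDriver
    config.headless_timeout = 30           # page-load / script timeout (seconds)
    config.js_wait_time = 10               # wait budget for document.readyState

    detector = HeadlessBrowserDetector(config)
    try:
        for finding in detector.detect('https://example.com'):
            print(finding['type'], finding['risk_level'], finding.get('url', ''))
    finally:
        detector.close()
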
/core/detector/html_detector.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | """
4 | HTML检测器模块
5 | """
6 |
7 | import re
8 | from typing import List, Dict, Any
9 | from urllib.parse import urlparse
10 |
11 | from utils.html_utils import (
12 | extract_script_tags,
13 | extract_link_tags,
14 | extract_meta_tags,
15 | extract_iframe_tags,
16 | find_hidden_elements,
17 | get_dom_structure,
18 | extract_comments
19 | )
20 | from utils.network_utils import (
21 | extract_urls,
22 | is_external_link,
23 | extract_domain
24 | )
25 | from utils.common_utils import (
26 | extract_text_between_markers,
27 | get_context
28 | )
29 |
30 | class HTMLDetector:
31 | """
32 | HTML内容检测器,用于检测HTML文件中的可疑链接和隐藏元素
33 | """
34 |
35 | def __init__(self, config):
36 | """
37 | 初始化HTML检测器
38 |
39 | Args:
40 | config: 扫描配置对象
41 | """
42 | self.config = config
43 | self.logger = config.logger
44 |
45 | # 可疑HTML模式
46 | self.suspicious_patterns = {
47 | 'suspicious_attributes': re.compile(r'\bon\w+\s*=\s*["\']?javascript:', re.IGNORECASE),
48 | 'eval_inline': re.compile(r'\beval\s*\(', re.IGNORECASE),
49 | 'document_write': re.compile(r'\bdocument\.write\s*\(', re.IGNORECASE),
50 | 'base64_decode': re.compile(r'\batob\s*\(|\bfromCharCode\s*\(', re.IGNORECASE),
51 | 'data_uri': re.compile(r'data:[^;]+;base64,', re.IGNORECASE),
52 | 'remote_iframe': re.compile(r'