├── targets_test.txt
├── requirements.txt
├── test.css
├── .gitignore
├── keywords_example.txt
├── core
│   ├── config.py
│   └── detector
│       ├── keyword_detector.py
│       ├── special_hiding_detector.py
│       ├── headless_browser_detector.py
│       ├── html_detector.py
│       └── js_detector.py
├── utils
│   ├── logging_utils.py
│   ├── file_utils.py
│   ├── html_utils.py
│   ├── common_utils.py
│   ├── css_utils.py
│   ├── js_utils.py
│   └── network_utils.py
├── README.md
├── test_dark_link.html
└── YuanZhao.py
/targets_test.txt:
--------------------------------------------------------------------------------
1 | ./test_dark_link.html
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | beautifulsoup4>=4.9.3
2 | lxml>=4.6.3
3 | requests>=2.25.1
4 | selenium>=4.0.0
5 | urllib3>=1.26.7
6 | chardet>=4.0.0
7 |
8 | # 无头浏览器依赖
9 | webdriver-manager>=3.5.0 # 自动安装和管理ChromeDriver
10 |
--------------------------------------------------------------------------------
/test.css:
--------------------------------------------------------------------------------
1 | /* normal css */
2 | @import url("https://fonts.googleapis.com/css?family=Roboto");
3 | .banner { background-image: url("https://cdn.example.com/images/hero.jpg"); }
4 | .icon { background-image: url(/assets/icon.png); }
5 | .cursor { cursor: url("https://static.example.com/cur.cur"), auto; }
6 | .hidden { display: none; }
7 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Python
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 | *.so
6 | .Python
7 |
8 | # Temporary files
9 | *.swp
10 | *.swo
11 | *~
12 | .project
13 | .settings
14 | .classpath
15 | *.launch
16 |
17 | # IDE
18 | .idea/
19 | .vscode/
20 | *.sublime-project
21 | *.sublime-workspace
22 |
23 | # OS
24 | .DS_Store
25 | Thumbs.db
26 |
27 | # Testing
28 | .tox/
29 | .coverage
30 | .coverage.*
31 | .cache
32 | nosetests.xml
33 | pytest.xml
34 |
35 | # Environment
36 | .env
37 | .env.local
38 | .env.development.local
39 | .env.test.local
40 | .env.production.local
41 |
42 | # Build
43 | /build/
44 | dist/
45 | *.egg
46 | *.egg-info/
47 | bug*.md
48 |
49 | # Reports and logs
50 | reports/
51 | logs/
52 | *.log
53 | # 调试临时文件会保存到reports目录并以时间戳命名,无需具体文件名忽略
54 | reports/*
55 |
56 | # Virtual environments
57 | venv/
58 | env/
59 | pyvenv/
60 |
61 | # OS generated files
62 | .DS_Store
63 | .DS_Store?
64 | ._*
65 | .Spotlight-V100
66 | .Trashes
67 |
--------------------------------------------------------------------------------
/keywords_example.txt:
--------------------------------------------------------------------------------
1 | bet365, gambling, 9
2 | 皇冠体育, gambling, 9
3 | 火博, gambling, 9
4 | 金年会, gambling, 9
5 | 威尼斯人, gambling, 8
6 | 澳门赌场, gambling, 8
7 | 时时彩, gambling, 10
8 | 六合彩, gambling, 10
9 | 赔率, gambling, 7
10 | 投注, gambling, 8
11 | 赌博, gambling, 10
12 | 博彩, gambling, 10
13 | 赌球, gambling, 10
14 | 彩金, gambling, 9
15 | 线上赌场, gambling, 10
16 | 真人娱乐, gambling, 8
17 | 百家乐, gambling, 9
18 | 轮盘, gambling, 8
19 | 老虎机, gambling, 9
20 | 快三, gambling, 9
21 | 彩票软件, gambling, 9
22 | 体彩预测, gambling, 8
23 | 色情, porn, 9
24 | 成人, porn, 8
25 | AV, porn, 9
26 | 黄色网站, porn, 10
27 | 性爱, porn, 9
28 | 黄色, porn, 10
29 | 肉棒, porn, 10
30 | 爆乳, porn, 10
31 | 射精, porn, 10
32 | H视频, porn, 10
33 | 裸聊, porn, 9
34 | 露骨, porn, 8
35 | 情色, porn, 9
36 | 木马, malware, 10
37 | 病毒, malware, 9
38 | 勒索软件, malware, 10
39 | 挖矿, malware, 8
40 | 黑客攻击, malware, 10
41 | DDoS, malware, 10
42 | 后门, malware, 10
43 | 远程控制, malware, 9
44 | 账号密码, phishing, 8
45 | 银行登录, phishing, 10
46 | 支付验证, phishing, 9
47 | 登录, phishing, 6
48 | 账号, phishing, 6
49 | 密码, phishing, 6
50 | 支付, phishing, 8
51 | 转账, phishing, 9
52 | 银行卡, phishing, 8
53 | 验证码, phishing, 7
54 | 高利贷, other, 10
55 | 网贷, other, 7
56 | 小额贷, other, 8
57 | 民间借贷, other, 7
58 | 校园贷, other, 10
59 | 私服, other, 7
60 | 外挂, other, 8
61 | 传奇私服, other, 9
62 | 新开私服, other, 8
63 | 破解版, other, 7
64 | 黑客, other, 8
65 | 渗透测试, other, 5
66 | 漏洞扫描, other, 6
67 | 破解软件, other, 8
68 | 注册机, other, 7
69 | 激活码, other, 6
70 | 黑客工具, other, 9
71 | .cm, other, 7
72 | .tk, other, 6
73 | .ga, other, 6
74 | .ml, other, 6
75 | .tf, other, 6
76 | .gq, other, 6
77 | display:none, other, 9
78 | visibility:hidden, other, 9
79 | opacity:0, other, 8
80 | position:absolute, other, 6
81 | z-index:-1, other, 7
82 | text-indent:-9999px, other, 8
83 | document.write, other, 7
84 | eval(, other, 9
85 | setTimeout("", other, 8
86 | location.href=, other, 7
87 | window.open(, other, 6
88 | XMLHttpRequest, other, 5
89 | fetch(, other, 5
90 | 翻墙, other, 7
91 | VPN, other, 6
92 | 暴力, other, 9
93 | 血腥, other, 8
94 | 恐怖, other, 7
95 | 毒品, other, 10
96 | 大麻, other, 10
97 | 冰毒, other, 10
98 | 摇头丸, other, 10
99 |
--------------------------------------------------------------------------------
/core/config.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | """
4 | 配置管理模块
5 | """
6 |
7 | class Config:
8 | """扫描配置类"""
9 |
10 | def __init__(self):
11 | # 扫描目标配置
12 | self.target_type = None # 'local_file', 'local_directory', 'internal_url', 'external_url'
13 | self.target = None
14 | self.crawl_depth = 1
15 | self.depth = self.crawl_depth # 兼容属性
16 |
17 | # 扫描模式配置
18 | self.scan_mode = 'standard' # 'fast', 'standard', 'deep'
19 | self.mode = self.scan_mode # 兼容属性
20 | self.threads = 4
21 | self.timeout = 30
22 | self.internal_timeout = 60 # 内网URL超时时间(秒)
23 | self.external_timeout = 30 # 公网URL超时时间(秒)
24 | self.proxy = None
25 | self.exclude = []
26 |
27 | # 关键字配置
28 | self.keywords_file = None
29 |
30 | # 报告配置
31 | self.report_type = 'txt'
32 | self.report_file = None
33 |
34 | # 调试模式
35 | self.debug = False
36 | # 调试日志读取参数
37 | self.debug_log_wait_ms = 1500
38 | self.debug_log_checks = 3
39 | self.debug_log_interval_ms = 500
40 |
41 | # 日志器
42 | import logging
43 | self.logger = logging.getLogger('YuanZhao')
44 |
45 | # 无头浏览器配置
46 | self.use_headless_browser = False # 是否启用无头浏览器
47 | self.headless_browser = 'chrome' # 无头浏览器类型
48 | self.js_wait_time = 3 # JavaScript执行等待时间(秒)
49 | self.headless_timeout = 60 # 无头浏览器超时时间(秒)
50 | self.headless_auto_download = False # 是否自动下载驱动
51 | self.headless_driver_path = None # 本地驱动路径
52 |
53 | # 文件类型配置
54 | self.html_extensions = ['.html', '.htm', '.shtml', '.xhtml', '.php', '.asp', '.aspx', '.jsp']
55 | self.css_extensions = ['.css', '.less', '.scss', '.sass']
56 | self.js_extensions = ['.js', '.jsx', '.ts', '.tsx']
57 | self.image_extensions = ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.svg', '.webp']
58 |
59 | # 扫描配置项
60 | self.scan_html = True
61 | self.scan_js = True
62 | self.scan_css = True
63 | self.scan_comments = True
64 | self.scan_meta = True
65 | self.scan_iframe = True
66 | self.scan_dom = True
67 | self.scan_encoding = True
68 | self.scan_steganography = True
69 | self.scan_special_hiding = True
70 | self.scan_keywords = True
71 |
72 | # 根据扫描模式调整配置
73 | self._set_mode_config()
74 | # 计算当前模式下需要扫描的扩展名
75 | self.file_extensions = self.get_file_extensions_to_scan()
76 |
77 | def _set_mode_config(self):
78 | """根据扫描模式设置相应的配置"""
79 | if self.scan_mode == 'fast':
80 | # 快速模式:只进行基础扫描
81 | self.scan_html = True
82 | self.scan_js = True
83 | self.scan_css = True
84 | self.scan_comments = True
85 | self.scan_meta = True
86 | self.scan_iframe = False
87 | self.scan_dom = False
88 | self.scan_encoding = False
89 | self.scan_steganography = False
90 | self.scan_special_hiding = False
91 | self.scan_keywords = True
92 |
93 | elif self.scan_mode == 'standard':
94 | # 标准模式:进行大部分扫描
95 | self.scan_html = True
96 | self.scan_js = True
97 | self.scan_css = True
98 | self.scan_comments = True
99 | self.scan_meta = True
100 | self.scan_iframe = True
101 | self.scan_dom = True
102 | self.scan_encoding = True
103 | self.scan_steganography = False
104 | self.scan_special_hiding = True
105 | self.scan_keywords = True
106 |
107 | elif self.scan_mode == 'deep':
108 | # 深度模式:进行所有扫描
109 | self.scan_html = True
110 | self.scan_js = True
111 | self.scan_css = True
112 | self.scan_comments = True
113 | self.scan_meta = True
114 | self.scan_iframe = True
115 | self.scan_dom = True
116 | self.scan_encoding = True
117 | self.scan_steganography = True
118 | self.scan_special_hiding = True
119 | self.scan_keywords = True
120 | # 同步更新扩展名列表
121 | self.file_extensions = self.get_file_extensions_to_scan()
122 |
123 | def update_mode(self, mode):
124 | """更新扫描模式"""
125 | self.scan_mode = mode
126 | self._set_mode_config()
127 |
128 | def get_file_extensions_to_scan(self):
129 | """获取需要扫描的文件扩展名列表"""
130 | extensions = []
131 |
132 | if self.scan_html:
133 | extensions.extend(self.html_extensions)
134 |
135 | if self.scan_js:
136 | extensions.extend(self.js_extensions)
137 |
138 | if self.scan_css:
139 | extensions.extend(self.css_extensions)
140 |
141 | return list(set(extensions)) # 去重
142 |
143 | def get_proxy_dict(self):
144 | """将代理字符串转换为requests使用的代理字典格式"""
145 | if not self.proxy:
146 | return None
147 |
148 | proxies = {
149 | 'http': self.proxy,
150 | 'https': self.proxy
151 | }
152 | return proxies
153 |
154 | def __str__(self):
155 | """返回配置的字符串表示"""
156 | return (
157 | f"Config("
158 | f"target_type={self.target_type}, "
159 | f"target={self.target}, "
160 | f"scan_mode={self.scan_mode}, "
161 | f"threads={self.threads}, "
162 | f"timeout={self.timeout}, "
163 | f"internal_timeout={self.internal_timeout}, "
164 | f"external_timeout={self.external_timeout}, "
165 | f"report_type={self.report_type}, "
166 | f"report_file={self.report_file})"
167 | )
168 |
169 | def get_config_dict(self):
170 | """返回配置的字典表示,用于日志记录"""
171 | return {
172 | 'target_type': self.target_type,
173 | 'target': self.target,
174 | 'crawl_depth': self.crawl_depth,
175 | 'scan_mode': self.scan_mode,
176 | 'threads': self.threads,
177 | 'timeout': self.timeout,
178 | 'internal_timeout': self.internal_timeout,
179 | 'external_timeout': self.external_timeout,
180 | 'proxy': '***' if self.proxy else None,
181 | 'keywords_file': self.keywords_file,
182 | 'report_type': self.report_type,
183 | 'report_file': self.report_file,
184 | 'debug': self.debug
185 | }
186 |
187 |
--------------------------------------------------------------------------------
/utils/logging_utils.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | """
4 | 日志处理工具模块
5 | """
6 |
7 | import os
8 | import logging
9 | import sys
10 | from datetime import datetime
11 |
12 | class Logger:
13 | """
14 | 自定义日志类
15 | """
16 | def __init__(self, name='YuanZhao', log_dir=None, level=logging.INFO, use_console=True):
17 | """
18 | 初始化日志记录器
19 |
20 | Args:
21 | name (str): 日志名称
22 | log_dir (str): 日志文件目录
23 | level (int): 日志级别
24 | use_console (bool): 是否输出到控制台
25 | """
26 | self.logger = logging.getLogger(name)
27 | self.logger.setLevel(level)
28 | self.logger.handlers.clear()
29 |
30 | # 创建格式化器
31 | formatter = logging.Formatter(
32 | '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
33 | datefmt='%Y-%m-%d %H:%M:%S'
34 | )
35 |
36 | # 控制台输出
37 | if use_console:
38 | console_handler = logging.StreamHandler(sys.stdout)
39 | console_handler.setLevel(level)
40 | console_handler.setFormatter(formatter)
41 | self.logger.addHandler(console_handler)
42 |
43 | # 文件输出
44 | if log_dir:
45 | os.makedirs(log_dir, exist_ok=True)
46 | timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
47 | log_file = os.path.join(log_dir, f'YuanZhao_{timestamp}.log')
48 | file_handler = logging.FileHandler(log_file, encoding='utf-8')
49 | file_handler.setLevel(level)
50 | file_handler.setFormatter(formatter)
51 | self.logger.addHandler(file_handler)
52 |
53 | def debug(self, message):
54 | """记录调试信息"""
55 | self.logger.debug(message)
56 |
57 | def info(self, message):
58 | """记录普通信息"""
59 | self.logger.info(message)
60 |
61 | def warning(self, message):
62 | """记录警告信息"""
63 | self.logger.warning(message)
64 |
65 | def error(self, message, exc_info=False):
66 | """记录错误信息"""
67 | self.logger.error(message, exc_info=exc_info)
68 |
69 | def critical(self, message, exc_info=False):
70 | """记录严重错误信息"""
71 | self.logger.critical(message, exc_info=exc_info)
72 |
73 | def setup_logging(log_dir=None, level=logging.INFO):
74 | """
75 | 全局日志配置
76 |
77 | Args:
78 | log_dir (str): 日志文件目录
79 | level (int): 日志级别
80 |
81 | Returns:
82 | Logger: 日志记录器实例
83 | """
84 | return Logger('YuanZhao', log_dir, level).logger
85 |
86 | def log_exception(logger, exception, message="发生异常"):
87 | """
88 | 记录异常信息
89 |
90 | Args:
91 | logger: 日志记录器
92 | exception: 异常对象
93 | message (str): 错误消息
94 | """
95 | logger.error(f"{message}: {str(exception)}", exc_info=True)
96 |
97 | def log_progress(logger, current, total, message="处理进度"):
98 | """
99 | 记录进度信息
100 |
101 | Args:
102 | logger: 日志记录器
103 | current (int): 当前进度
104 | total (int): 总进度
105 | message (str): 进度消息
106 | """
107 | if total > 0:
108 | percentage = (current / total) * 100
109 | logger.info(f"{message}: {current}/{total} ({percentage:.1f}%)")
110 |
111 | def log_scan_result(logger, file_path, issues):
112 | """
113 | 记录扫描结果
114 |
115 | Args:
116 | logger: 日志记录器
117 | file_path (str): 文件路径
118 | issues (list): 发现的问题列表
119 | """
120 | if issues:
121 | logger.warning(f"文件 {file_path} 发现 {len(issues)} 个问题")
122 | import logging as _logging
123 | if logger.level <= _logging.DEBUG:
124 | for issue in issues:
125 | logger.warning(f" - {issue}")
126 | else:
127 | # 聚合重复项,仅输出前若干项
128 | counts = {}
129 | for issue in issues:
130 | counts[issue] = counts.get(issue, 0) + 1
131 | shown = 0
132 | for text, cnt in counts.items():
133 | logger.warning(f" - {text} x{cnt}")
134 | shown += 1
135 | if shown >= 8:
136 | break
137 | if len(counts) > shown:
138 | logger.warning(f" ... 还有 {len(counts) - shown} 项未展示(非verbose模式)")
139 | else:
140 | logger.debug(f"文件 {file_path} 未发现问题")
141 |
142 | def log_keyword_match(logger, file_path, keyword, category, weight, context):
143 | """
144 | 记录关键字匹配信息
145 |
146 | Args:
147 | logger: 日志记录器
148 | file_path (str): 文件路径
149 | keyword (str): 匹配的关键字
150 | category (str): 关键字类别
151 | weight (int): 风险权重
152 | context (str): 上下文信息
153 | """
154 | logger.warning(
155 | f"关键字匹配 - 文件: {file_path}, "
156 | f"关键字: {keyword}, 类别: {category}, 风险权重: {weight}\n"
157 | f"上下文: {context}"
158 | )
159 |
160 | def log_suspicious_url(logger, file_path, url, risk_level, context):
161 | """
162 | 记录可疑URL信息
163 |
164 | Args:
165 | logger: 日志记录器
166 | file_path (str): 文件路径
167 | url (str): 可疑URL
168 | risk_level (str): 风险等级
169 | context (str): 上下文信息
170 | """
171 | logger.warning(
172 | f"可疑URL - 文件: {file_path}, "
173 | f"URL: {url}, 风险等级: {risk_level}\n"
174 | f"上下文: {context}"
175 | )
176 |
177 | def log_hidden_technique(logger, file_path, technique, risk_level, context):
178 | """
179 | 记录隐藏技术信息
180 |
181 | Args:
182 | logger: 日志记录器
183 | file_path (str): 文件路径
184 | technique (str): 隐藏技术
185 | risk_level (str): 风险等级
186 | context (str): 上下文信息
187 | """
188 | logger.warning(
189 | f"隐藏技术 - 文件: {file_path}, "
190 | f"技术: {technique}, 风险等级: {risk_level}\n"
191 | f"上下文: {context}"
192 | )
193 |
194 | def log_file_skipped(logger, file_path, reason):
195 | """
196 | 记录跳过的文件信息
197 |
198 | Args:
199 | logger: 日志记录器
200 | file_path (str): 文件路径
201 | reason (str): 跳过原因
202 | """
203 | logger.debug(f"跳过文件 {file_path}: {reason}")
204 |
205 | def log_config(logger, config_dict):
206 | """
207 | 记录配置信息
208 |
209 | Args:
210 | logger: 日志记录器
211 | config_dict (dict): 配置字典
212 | """
213 | logger.info("扫描配置:")
214 | for key, value in config_dict.items():
215 | logger.info(f" {key}: {value}")
216 |
217 | def log_summary(logger, total_files, scanned_files, issues_found, scan_time):
218 | """
219 | 记录扫描总结信息
220 |
221 | Args:
222 | logger: 日志记录器
223 | total_files (int): 文件总数
224 | scanned_files (int): 已扫描文件数
225 | issues_found (int): 发现的问题数
226 | scan_time (float): 扫描耗时(秒)
227 | """
228 | logger.info("扫描总结:")
229 | logger.info(f" 总文件数: {total_files}")
230 | logger.info(f" 已扫描文件: {scanned_files}")
231 | logger.info(f" 发现问题: {issues_found}")
232 | logger.info(f" 扫描耗时: {scan_time:.2f} 秒")
233 | try:
234 | if scan_time > 0:
235 | logger.info(f" 平均速度: {scanned_files/scan_time:.2f} 文件/秒")
236 | else:
237 | logger.info(" 平均速度: N/A (耗时为0)")
238 | except Exception:
239 | logger.info(" 平均速度: N/A")
240 |
241 | # 根据问题数量给出警告级别
242 | if issues_found > 50:
243 | logger.critical(f"发现大量问题 ({issues_found}),建议立即检查")
244 | elif issues_found > 10:
245 | logger.error(f"发现较多问题 ({issues_found}),需要关注")
246 | elif issues_found > 0:
247 | logger.warning(f"发现少量问题 ({issues_found}),建议查看")
248 | else:
249 | logger.info("未发现明显问题")
250 |
251 |
--------------------------------------------------------------------------------
/utils/file_utils.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | """
4 | 文件处理工具模块
5 | """
6 |
7 | import os
8 | import logging
9 | import chardet
10 | from typing import List
11 |
12 | logger = logging.getLogger('YuanZhao.utils.file')
13 |
14 | def read_file(file_path: str, max_size: int = 10 * 1024 * 1024) -> str:
15 | """
16 | 读取文件内容,自动检测编码
17 |
18 | Args:
19 | file_path: 文件路径
20 | max_size: 最大文件大小(默认10MB)
21 |
22 | Returns:
23 | 文件内容
24 | """
25 | try:
26 | # 检查文件大小
27 | file_size = os.path.getsize(file_path)
28 | if file_size > max_size:
29 | logger.warning(f"文件过大,将读取前{max_size/1024/1024:.1f}MB: {file_path}")
30 |
31 | # 检测文件编码
32 | with open(file_path, 'rb') as f:
33 | raw_data = f.read(min(file_size, 10000))
34 | result = chardet.detect(raw_data)
35 | encoding = result['encoding'] or 'utf-8'
36 |
37 | # 读取文件内容
38 | with open(file_path, 'r', encoding=encoding, errors='replace') as f:
39 | content = f.read(max_size)
40 |
41 | return content
42 |
43 | except Exception as e:
44 | logger.error(f"读取文件失败: {file_path}, 错误: {str(e)}")
45 | return ''
46 |
47 | def get_files_to_scan(directory: str, extensions: List[str]) -> List[str]:
48 | """
49 | 递归获取目录中所有指定扩展名的文件
50 |
51 | Args:
52 | directory: 目录路径
53 | extensions: 需要扫描的文件扩展名列表
54 |
55 | Returns:
56 | 文件路径列表
57 | """
58 | files_to_scan = []
59 |
60 | try:
61 | for root, dirs, files in os.walk(directory):
62 | # 过滤掉隐藏目录
63 | dirs[:] = [d for d in dirs if not d.startswith('.')]
64 |
65 | for file in files:
66 | # 过滤掉隐藏文件
67 | if file.startswith('.'):
68 | continue
69 |
70 | # 检查文件扩展名
71 | _, ext = os.path.splitext(file.lower())
72 | if ext in extensions:
73 | file_path = os.path.join(root, file)
74 | files_to_scan.append(file_path)
75 |
76 | logger.info(f"找到 {len(files_to_scan)} 个需要扫描的文件")
77 |
78 | except Exception as e:
79 | logger.error(f"获取文件列表失败: {str(e)}")
80 |
81 | return files_to_scan
82 |
83 | def is_binary_file(file_path: str) -> bool:
84 | """
85 | 检查文件是否为二进制文件
86 |
87 | Args:
88 | file_path: 文件路径
89 |
90 | Returns:
91 | 是否为二进制文件
92 | """
93 | try:
94 | with open(file_path, 'rb') as f:
95 | chunk = f.read(1024)
96 |
97 | # 检查是否包含null字节
98 | if b'\x00' in chunk:
99 | return True
100 |
101 | # 检查非文本字符的比例
102 | text_chars = bytearray({7, 8, 9, 10, 12, 13, 27} | set(range(0x20, 0x100)))
103 | non_text = sum(1 for byte in chunk if byte not in text_chars)
104 |
105 | # 如果超过30%的字符是非文本字符,则认为是二进制文件
106 | return non_text / len(chunk) > 0.3
107 |
108 | except Exception as e:
109 | logger.error(f"检查文件类型失败: {file_path}, 错误: {str(e)}")
110 | return False
111 |
112 | def get_file_info(file_path: str) -> dict:
113 | """
114 | 获取文件信息
115 |
116 | Args:
117 | file_path: 文件路径
118 |
119 | Returns:
120 | 文件信息字典
121 | """
122 | try:
123 | stat_info = os.stat(file_path)
124 |
125 | info = {
126 | 'path': file_path,
127 | 'size': stat_info.st_size,
128 | 'created_time': stat_info.st_ctime,
129 | 'modified_time': stat_info.st_mtime,
130 | 'is_binary': is_binary_file(file_path)
131 | }
132 |
133 | return info
134 |
135 | except Exception as e:
136 | logger.error(f"获取文件信息失败: {file_path}, 错误: {str(e)}")
137 | return {}
138 |
139 | def ensure_directory(directory: str):
140 | """
141 | 确保目录存在,如果不存在则创建
142 |
143 | Args:
144 | directory: 目录路径
145 | """
146 | try:
147 | if not os.path.exists(directory):
148 | os.makedirs(directory)
149 | logger.info(f"创建目录: {directory}")
150 | except Exception as e:
151 | logger.error(f"创建目录失败: {directory}, 错误: {str(e)}")
152 | raise
153 |
154 | def get_relative_path(file_path: str, base_directory: str) -> str:
155 | """
156 | 获取文件相对于基础目录的路径
157 |
158 | Args:
159 | file_path: 文件路径
160 | base_directory: 基础目录
161 |
162 | Returns:
163 | 相对路径
164 | """
165 | try:
166 | return os.path.relpath(file_path, base_directory)
167 | except Exception as e:
168 | logger.error(f"获取相对路径失败: {str(e)}")
169 | return file_path
170 |
171 | def filter_files_by_size(files: List[str], min_size: int = 0, max_size: int = None) -> List[str]:
172 | """
173 | 根据文件大小过滤文件列表
174 |
175 | Args:
176 | files: 文件路径列表
177 | min_size: 最小文件大小(字节)
178 | max_size: 最大文件大小(字节)
179 |
180 | Returns:
181 | 过滤后的文件列表
182 | """
183 | filtered_files = []
184 |
185 | for file_path in files:
186 | try:
187 | file_size = os.path.getsize(file_path)
188 |
189 | if file_size < min_size:
190 | continue
191 |
192 | if max_size is not None and file_size > max_size:
193 | continue
194 |
195 | filtered_files.append(file_path)
196 |
197 | except Exception as e:
198 | logger.warning(f"获取文件大小失败: {file_path}, 错误: {str(e)}")
199 |
200 | return filtered_files
201 |
202 | def _match_exclude(path: str, exclude_patterns: List[str]) -> bool:
203 | try:
204 | import fnmatch
205 | for pattern in exclude_patterns or []:
206 | if fnmatch.fnmatch(path, pattern) or (pattern.endswith('/') and path.replace('\\','/').startswith(pattern.rstrip('/'))):
207 | return True
208 | except Exception:
209 | pass
210 | return False
211 |
212 | # 兼容性函数,为了支持scanner.py中的导入(扩展签名)
213 | def get_file_list(directory: str, recursive: bool = True, depth: int = 1, extensions: List[str] = None, exclude: List[str] = None) -> List[str]:
214 | """
215 | 获取目录中的文件列表,支持递归、深度限制与排除模式
216 |
217 | Args:
218 | directory: 目录路径
219 | recursive: 是否递归
220 | depth: 递归深度(包含根层级)
221 | extensions: 需要扫描的文件扩展名列表
222 | exclude: 排除的文件或目录通配符列表
223 | Returns:
224 | 文件路径列表
225 | """
226 | results: List[str] = []
227 | try:
228 | extensions = [ext.lower() for ext in (extensions or [])]
229 | base_depth = directory.rstrip('\\/').count(os.sep)
230 | for root, dirs, files in os.walk(directory):
231 | # 处理深度
232 | current_depth = root.rstrip('\\/').count(os.sep) - base_depth
233 | if not recursive or current_depth >= depth:
234 | dirs[:] = []
235 | # 排除目录
236 | if exclude:
237 | dirs[:] = [d for d in dirs if not _match_exclude(os.path.join(root, d), exclude)]
238 | for file in files:
239 | path = os.path.join(root, file)
240 | if exclude and _match_exclude(path, exclude):
241 | continue
242 | if file.startswith('.'):
243 | continue
244 | _, ext = os.path.splitext(file.lower())
245 | if not extensions or ext in extensions:
246 | results.append(path)
247 | logger.info(f"找到 {len(results)} 个需要扫描的文件")
248 | except Exception as e:
249 | logger.error(f"获取文件列表失败: {str(e)}")
250 | return results
251 |
252 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # 渊照 - 专业暗链扫描工具
2 |
3 | 「渊照」是一款功能强大的专业暗链扫描工具,专注于检测网站、HTML文件或目录中的隐蔽链接、隐藏元素和恶意代码。该工具能够智能识别扫描目标类型(本地文件/目录、内网URL、公网URL),并自动调整扫描策略以获得最佳效果,是安全人员进行网站安全审计和应急响应的理想工具。
4 |
5 | ## 功能特性
6 |
7 | ### 智能目标识别与处理
8 | - **多类型目标支持**:自动识别和扫描本地文件、本地目录、内网URL和公网URL
9 | - **差异化扫描策略**:根据目标类型应用最优扫描策略
10 | - **递归目录扫描**:支持可配置的扫描深度
11 | - **文件过滤机制**:支持通过通配符排除特定文件或目录
12 |
13 | ### 核心扫描能力
14 | - **多层次检测机制**:HTML代码检测、JavaScript代码分析、CSS代码检测、元标签扫描、注释内容分析
15 | - **高级威胁识别**:加密/编码链接检测、可疑域名检测、随机生成域名检测、短链接服务检测、非标准端口检测、可疑查询参数检测
16 | - **特殊隐藏手法检测**:CSS隐藏技术、颜色隐藏、零宽字符隐藏、字体大小隐藏等
17 | - **关键字匹配系统**:支持CSV格式自定义关键字文件,包含关键字、类别和风险权重
18 | - **智能风险评分**:基于多维度风险评估
19 |
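上面的特殊隐藏手法检测可以结合 `utils/html_utils.py` 中的 `find_hidden_elements` 来直观理解:该函数会挑出带有 display:none、visibility:hidden、opacity:0 内联样式或 hidden 属性的元素。以下为示意用法(假设 utils 目录可作为包导入,HTML 片段仅为演示数据):

```python
# 示意:查找 HTML 中通过内联样式或 hidden 属性隐藏的元素
from utils.html_utils import find_hidden_elements

html = '<div style="display:none"><a href="http://bad-site.tk">暗链</a></div>'
for item in find_hidden_elements(html):
    # 返回的字典包含 tag、style(或 reason)、content、original_tag 等字段
    print(item['tag'], item.get('style', item.get('reason')), item['content'])
```
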
20 | ### 无头浏览器增强检测
21 | - **动态内容捕获**:使用Chrome无头浏览器执行JavaScript并捕获动态内容
22 | - **DOM操作监控**:跟踪动态DOM修改
23 | - **iframe深度分析**:渲染和分析iframe内容
24 | - **网络请求捕获**:监控HTTP请求和重定向链
25 |
26 | ### 全面的报告系统
27 | - **多种报告格式**:文本报告(txt)、HTML报告(html)、JSON报告(json)、CSV报告(csv)
28 | - **丰富的报告内容**:扫描概览、问题详情、风险评估、上下文展示
29 | - **来源类型标注**:在可疑链接中增加 `context_type` 字段(如 `html/js/css/comments`),用于区分链接的来源场景,便于后续数据分析与过滤
30 | - **来源标签与位置**:统一输出 `source_tag`(如 `debug/normal`)与定位范围 `position (start,end)`,HTML/CSV/JSON 报告保持一致
31 | - **风险排序与阈值展示**:HTML报告对“可疑链接”按风险降序展示,并默认仅展示风险≥4的项,减少噪音;关键字匹配表支持从上下文提取可点击链接
32 |
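上面提到的 `context_type`、`source_tag`、`position` 等来源字段,在报告数据中的大致形态如下。这只是一个示意片段:除这三个字段外,`url`、`risk` 等字段名与取值均为假设示例,实际以生成的报告为准。

```python
# 示意:一条“可疑链接”记录中与来源相关的字段(url、risk 为假设示例)
suspicious_link = {
    "url": "http://example-bad-site.tk",    # 假设示例
    "risk": 8,                               # 假设示例
    "context_type": "js",                    # 来源场景:html / js / css / comments
    "source_tag": "debug",                   # 来源标签:debug / normal
    "position": (1024, 1060),                # 定位范围 (start, end)
}
```
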
33 | ### 灵活的配置选项
34 | - **多种扫描模式**:fast/standard/deep
35 | - **性能优化选项**:可配置并发线程数、请求超时设置、代理服务器支持
36 | - **关键词来源**:支持从 `keywords_example.txt` 或自定义 `--keyword-file` 读取,文件允许 `#` 注释行,CSV格式:`关键字,类别,风险权重`
37 |
38 | ## 安装指南
39 |
40 | ### 环境要求
41 | - Python 3.8+
42 |
43 | ### 安装依赖
44 | ```bash
45 | pip install -r requirements.txt
46 | ```
47 |
48 | ## 使用方法
49 |
50 | ### 查看帮助信息
51 | ```bash
52 | python YuanZhao.py --help
53 | ```
54 |
55 | ### 完整使用案例命令
56 |
57 | #### 1. 本地文件扫描场景
58 | ```bash
59 | # 基本扫描 - 单个HTML文件
60 | python YuanZhao.py /path/to/file.html
61 |
62 | # 高级扫描 + HTML报告
63 | python YuanZhao.py /path/to/file.html -m standard -f html
64 |
65 | # 详细日志模式
66 | python YuanZhao.py /path/to/suspicious.html --verbose
67 |
68 | # 自定义输出目录
69 | python YuanZhao.py /path/to/file.html -o /custom/report/dir
70 |
71 | # 特定报告格式(JSON)
72 | python YuanZhao.py /path/to/file.html -f json
73 | ```
74 |
75 | #### 2. 本地目录扫描场景
76 | ```bash
77 | # 默认深度扫描目录
78 | python YuanZhao.py /path/to/website
79 |
80 | # 自定义深度扫描(仅当前目录和一级子目录)
81 | python YuanZhao.py /path/to/website -d 1
82 |
83 | # 深度递归扫描
84 | python YuanZhao.py /path/to/website -d 5
85 |
86 | # 排除特定文件/目录
87 | python YuanZhao.py /path/to/website --exclude "*.jpg" "*.png" "logs/*" "vendor/"
88 |
89 | # 调整线程数(提高性能)
90 | python YuanZhao.py /path/to/website -t 16
91 |
92 | # 完整模式 + 多格式报告
93 | python YuanZhao.py /path/to/website -m deep -f html -o security_reports --threads 12
94 | ```
95 |
96 | #### 3. 网络URL扫描场景
97 | ```bash
98 | # 基本网站扫描
99 | python YuanZhao.py https://example.com
100 |
101 | # 内网地址扫描
102 | python YuanZhao.py http://192.168.1.100
103 |
104 | # 本地开发服务器扫描
105 | python YuanZhao.py http://localhost:8080
106 |
107 | # 带路径的URL扫描
108 | python YuanZhao.py https://example.com/news/article
109 |
110 | # 设置超时时间(公网默认使用全局超时,内网未显式设置时会按较长超时)
111 | python YuanZhao.py https://example.com --timeout 60
112 |
113 | # 使用代理服务器
114 | python YuanZhao.py https://example.com --proxy http://127.0.0.1:8080
115 |
116 | # 带认证的代理
117 | python YuanZhao.py https://example.com --proxy http://username:password@proxy.example.com:8080
118 | ```
119 |
120 | #### 4. 高级功能场景
121 | ```bash
122 | # 无头浏览器扫描(动态内容)
123 | python YuanZhao.py https://dynamic-website.com --headless
124 |
125 | # 无头浏览器 + 延长等待时间
126 | python YuanZhao.py https://heavy-js-website.com --headless --js-wait 10
127 |
128 | # 无头浏览器超时时间
129 | python YuanZhao.py https://example.com --headless --headless-timeout 120
130 |
131 | # 自定义关键字检测
132 | python YuanZhao.py /path/to/target --keyword-file custom_keywords.txt
133 |
134 | # 基础模式快速扫描
135 | python YuanZhao.py https://example.com -m fast -d 1 -t 5
136 |
137 | # 全部模式深度扫描
138 | python YuanZhao.py /path/to/important-site -m deep -d 3 -f html --verbose
139 | ```
140 |
141 | #### 5. 批量目标扫描(多链接/多路径)
142 | ```bash
143 | # 方式A:指定列表文件(每行一个目标:URL/文件/目录)
144 | python YuanZhao.py --target-file e:\targets.txt -m deep -f html -o reports --verbose
145 |
146 | # 方式B:直接把 .txt 作为 target 传入(同样按列表处理)
147 | python YuanZhao.py e:\targets.txt -m deep -f html -o reports --verbose
148 |
149 | # 示例列表文件内容
150 | # https://example.com
151 | # e:\webroot
152 | # e:\webroot\index.html
153 | ```
154 |
155 | #### 6. 特定场景优化命令
156 | ```bash
157 | # 应急响应场景
158 | python YuanZhao.py /compromised/webroot -m deep -f html -o incident_response --keyword-file malware_keywords.txt --verbose
159 |
160 | # 定期安全审计
161 | python YuanZhao.py /path/to/webroot -d 3 -m standard -f json -o weekly_scan_$(date +%Y%m%d)
162 |
163 | # 新闻页面专项扫描
164 | python YuanZhao.py https://example.com/news -m deep -d 1 -t 8 --verbose
165 |
166 | # 大规模并行扫描
167 | python YuanZhao.py /large/website -d 2 -t 20 --exclude "*.zip" "*.rar" "backup/*"
168 |
169 | # 自动化集成扫描(生成JSON报告)
170 | python YuanZhao.py https://example.com -f json -o automated_scan_results --no-color
171 | ```
172 | ### 自定义关键字文件格式
173 | 关键字文件为CSV格式,每行包含三个字段:
174 | 
175 | ```
176 | 关键字,类别,风险权重
177 | poker,gambling,8
178 | casino,gambling,9
179 | malware,malware,10
180 | phishing,phishing,9
181 | ```
182 |
183 | 类别可选值:gambling (博彩)、porn (色情)、malware (恶意软件)、phishing (钓鱼)、other (其他)
184 | 风险权重范围:1-10(10为最高风险)
185 | 默认关键字文件:项目根目录 `keywords_example.txt`(若未指定 `--keyword-file` 将自动加载)。文件允许以 `#` 开头的注释行。
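
除命令行方式外,也可以按下面的示意片段在 Python 中直接加载关键字文件并对文本做匹配(假设 `core`、`core/detector` 目录可以作为包导入;接口与返回字段见 `core/detector/keyword_detector.py`):

```python
# 示意:以编程方式加载自定义关键字文件并检测文本
from core.config import Config
from core.detector.keyword_detector import KeywordDetector

config = Config()
detector = KeywordDetector(config)
detector.load_keywords('custom_keywords.txt')   # CSV:关键字,类别,风险权重

matches = detector.detect('页面文本……时时彩预测……', source='demo.html')
for m in matches:
    # detect() 返回的字典包含 keyword / category / weight / source / context / match_position
    print(m['keyword'], m['category'], m['weight'])
```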
186 |
187 | ## 主要参数说明
188 |
189 | ### 基本参数
190 | - `target`: 扫描目标(文件路径、目录路径或URL)- 必需参数
191 | - `-d, --depth`: 递归扫描深度(默认:3,0表示仅扫描当前文件/目录)
192 | - `-m, --mode`: 扫描模式(fast/standard/deep,默认:deep)
193 | - `-t, --threads`: 并发线程数(默认:8)
194 |
195 | ### 报告相关参数
196 | - `-o, --output`: 报告输出目录(默认:./reports)
197 | - `-f, --format`: 报告格式(txt/html/json/csv,默认:txt)
198 |
199 | ### 网络相关参数
200 | - `--timeout`: 请求超时时间(秒,默认:30)。公网目标默认使用此值,内网目标未显式设置 `internal_timeout` 时按较长超时(约为全局超时的两倍)。
201 | - `--proxy`: 代理设置(支持带认证与不带认证的HTTP代理),示例:`http://127.0.0.1:8080` 或 `http://user:pass@host:8080`
202 |
203 | ### 高级参数
204 | - `--keyword-file`: 自定义关键字文件路径
205 | - `--target-file`: 批量目标列表文件路径(每行一个目标:URL/文件/目录)
206 | - `--exclude`: 排除的文件或目录
207 | - `--verbose`: 显示详细日志信息
208 | - `--no-color`: 禁用彩色输出(适用于自动化脚本)
209 |
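目录扫描时的文件收集、递归深度与排除模式见 `utils/file_utils.py` 中的 `get_file_list`,下面是一个示意用法(目录与模式仅为示例):

```python
# 示意:按深度、扩展名与排除模式收集待扫描文件
from utils.file_utils import get_file_list

files = get_file_list(
    '/path/to/website',                         # 示例目录
    recursive=True,
    depth=3,                                    # 递归深度(包含根层级)
    extensions=['.html', '.htm', '.js', '.css'],
    exclude=['*.jpg', '*.png', 'logs/*', 'vendor/'],
)
print(f'共找到 {len(files)} 个待扫描文件')
```
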
210 | ### 无头浏览器参数
211 | - `--headless`: 启用无头浏览器扫描
212 | - `--browser-type`: 无头浏览器类型(支持: chrome,默认: chrome)
213 | - `--js-wait`: JavaScript执行等待时间(秒,默认: 3)
214 | - `--headless-timeout`: 无头浏览器超时时间(秒,默认: 60)
215 | - `--headless-binary`: Chrome二进制路径(例如:`C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe`)
216 | - `--headless-driver`: ChromeDriver路径(例如:`C:\\drivers\\chromedriver.exe`)
217 |
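这些命令行参数对应 `core/config.py` 中的无头浏览器配置字段,以下为示意性的等价设置(以编程方式构造 Config 仅作演示,参数与命令行选项的对应关系为合理推断):

```python
# 示意:无头浏览器相关配置字段(字段定义见 core/config.py)
from core.config import Config

config = Config()
config.use_headless_browser = True                              # 对应 --headless
config.headless_browser = 'chrome'                              # 对应 --browser-type
config.js_wait_time = 10                                        # 对应 --js-wait(秒)
config.headless_timeout = 120                                   # 对应 --headless-timeout(秒)
config.headless_driver_path = r'C:\drivers\chromedriver.exe'    # 对应 --headless-driver
```
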
218 | ## 常见问题解答
219 |
220 | **Q: 扫描结果中的误报如何处理?**
221 | A: 可通过以下方式降低噪音:
222 | - 使用自定义关键字文件调整权重
223 | - 利用报告的风险阈值(HTML默认展示风险≥4)聚焦高风险项
224 | - 依赖优化后的CSS检测逻辑与可信CDN白名单,避免将正常资源识别为可疑
225 |
226 | **Q: 如何提高大型网站的扫描效率?**
227 | A: 增加线程数、设置合理的爬取深度,或先使用基础模式(`fast`)进行初步筛选。对于公网网站,建议控制扫描范围。
228 |
229 | **Q: 为什么有些动态生成的链接没被检测到?**
230 | A: 启用无头浏览器模式`--headless`并适当增加JavaScript执行等待时间`--js-wait`。
231 |
232 | **Q: 使用无头浏览器时需要注意什么?**
233 | A: 使用无头浏览器会增加资源消耗和时间,建议适当降低线程数,为复杂页面增加等待时间,仅在必要时启用。
234 |
235 | ## 项目结构
236 |
237 | ```
238 | YuanZhao/
239 | ├── YuanZhao.py # 主程序入口
240 | ├── requirements.txt # 依赖列表
241 | ├── README.md # 项目说明
242 | ├── core/ # 核心模块
243 | │ ├── scanner.py # 扫描引擎
244 | │ ├── detector/ # 各类检测器
245 | │ ├── reporter.py # 报告生成器
246 | │ └── config.py # 配置管理
247 | ├── utils/ # 工具类
248 | └── keywords_example.txt # 关键字示例文件
249 | ```
250 |
251 | ## 许可证与免责声明
252 |
253 | 本工具仅供安全测试和应急响应使用,请确保您有足够的授权对目标进行扫描,避免对未经授权的系统进行测试。
254 |
255 | ## 开发者提示(工具接口)
256 | - CSS工具统一正式接口:extract_css_properties / remove_css_comments / extract_css_comments
258 |
259 | ## 开发者选项(日志与报告)
260 | - `debug_log_wait_ms`:调试读取日志的初始等待时间(毫秒),默认 1500
261 | - `debug_log_checks`:日志稳定性检查次数,默认 3
262 | - `debug_log_interval_ms`:每次稳定性检查的间隔(毫秒),默认 500
263 | - 提取统计日志级别:常规运行为 `debug`(匹配数与总提取数),在 `--verbose` 场景下查看更详细日志
264 | - 报告来源字段:`context_type`(html/js/css/comments)与 `source_tag`(debug/normal)用于区分来源与路径
265 | - 关键字匹配报告:支持从上下文自动提取 `http(s)` 链接并渲染为可点击链接
266 | - HTML报告“可疑链接详情”默认按风险降序并过滤低风险项(≥4显示)
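
上述开发者选项对应 `core/config.py` 中的同名字段,默认值也在该文件中定义。以下为示意性的设置方式(以编程方式构造 Config 与日志器仅作演示,接口见 `core/config.py` 与 `utils/logging_utils.py`):

```python
# 示意:调整调试日志读取参数,并以 DEBUG 级别输出日志
import logging
from core.config import Config
from utils.logging_utils import setup_logging

logger = setup_logging(log_dir='logs', level=logging.DEBUG)   # 大致相当于 --verbose 的详细日志

config = Config()
config.debug = True
config.debug_log_wait_ms = 3000       # 初始等待 3000 毫秒
config.debug_log_checks = 5           # 稳定性检查 5 次
config.debug_log_interval_ms = 800    # 每次检查间隔 800 毫秒
```
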
267 | ## Star History
268 | 
--------------------------------------------------------------------------------
/test_dark_link.html:
--------------------------------------------------------------------------------
7 | 暗链测试页面 - 无头浏览器检测测试
163 | 正常网站内容
166 | 欢迎访问我们的网站
167 | 这是一个正常的网站内容段落,介绍网站的主要功能和服务。
168 | 正常链接
191 | 赌博网站推荐:http://bet365-clone.gq
195 | 私服游戏下载:http://game-private-server.tk
199 | 黑客工具:http://hacker-tools.ml
203 | 时时彩预测:http://lottery-predict.cm
207 | 色情视频:http://adult-videos.ga
211 | 高利贷服务:http://loan-shark.tf
218 | 正常文本http://hidden-evil-site.tk继续正常文本
221 | 大量空格后的隐藏内容 http://hidden-link-using-spaces.ga
--------------------------------------------------------------------------------
/core/detector/keyword_detector.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | """
4 | 关键字检测器模块
5 | """
6 |
7 | import re
8 | import logging
9 | from typing import List, Dict, Tuple
10 | import chardet
11 |
12 | logger = logging.getLogger('YuanZhao.detector.keyword')
13 |
14 | class KeywordDetector:
15 | """关键字检测器"""
16 |
17 | def __init__(self, config):
18 | self.config = config
19 | self.keywords = [] # 存储关键字列表 [(keyword, category, weight), ...]
20 | self.keyword_patterns = [] # 编译后的正则表达式模式列表
21 |
22 | def load_keywords(self, keywords_file: str) -> bool:
23 | """从文件加载关键字"""
24 | try:
25 | # 检测文件编码
26 | with open(keywords_file, 'rb') as f:
27 | raw_data = f.read(10000)
28 | result = chardet.detect(raw_data)
29 | encoding = result['encoding'] or 'utf-8'
30 |
31 | # 读取关键字文件
32 | with open(keywords_file, 'r', encoding=encoding) as f:
33 | import csv
34 | reader = csv.reader(f)
35 | for line_num, parts in enumerate(reader, 1):
36 | # 去除空行
37 | if not parts or all((p.strip() == '' for p in parts)):
38 | continue
39 | # 忽略注释行
40 | if parts and parts[0].strip().startswith('#'):
41 | continue
42 | if len(parts) < 3:
43 | logger.warning(f"关键字文件第{line_num}行格式错误,跳过: {parts}")
44 | continue
45 | keyword = parts[0].strip()
46 | category = parts[1].strip()
47 | # 验证风险权重
48 | try:
49 | weight = int(parts[2].strip())
50 | if not 1 <= weight <= 10:
51 | logger.warning(f"关键字文件第{line_num}行风险权重超出范围(1-10),使用默认值5: {parts}")
52 | weight = 5
53 | except Exception:
54 | logger.warning(f"关键字文件第{line_num}行风险权重不是数字,使用默认值5: {parts}")
55 | weight = 5
56 | valid_categories = ['gambling', 'porn', 'malware', 'phishing', 'other']
57 | if category not in valid_categories:
58 | logger.warning(f"关键字文件第{line_num}行类别无效,使用默认类别other: {parts}")
59 | category = 'other'
60 | self.keywords.append((keyword, category, weight))
61 |
62 | # 编译正则表达式模式
63 | self._compile_keyword_patterns()
64 |
65 | logger.info(f"成功加载 {len(self.keywords)} 个关键字")
66 | return True
67 |
68 | except Exception as e:
69 | logger.error(f"加载关键字文件失败: {str(e)}", exc_info=True)
70 | # 如果加载失败,使用内置的默认关键字
71 | self._load_default_keywords()
72 | return False
73 |
74 | def _load_default_keywords(self):
75 | """默认从项目根目录读取 keywords_example.txt"""
76 | import os
77 | try:
78 | root = os.getcwd()
79 | path = os.path.join(root, 'keywords_example.txt')
80 | if os.path.exists(path):
81 | self.load_keywords(path)
82 | return
83 | logger.warning("未找到默认关键字文件 keywords_example.txt,关键字功能将受限")
84 | self.keywords = []
85 | self.keyword_patterns = []
86 | except Exception as e:
87 | logger.error(f"加载默认关键字失败: {str(e)}")
88 | self.keywords = []
89 | self.keyword_patterns = []
90 |
91 | def _compile_keyword_patterns(self):
92 | """编译关键字正则表达式模式"""
93 | self.keyword_patterns = []
94 |
95 | for keyword, category, weight in self.keywords:
96 | if keyword.isascii() and re.fullmatch(r'[A-Za-z]+', keyword) and len(keyword) <= 2:
97 | pattern = re.compile(r'\b' + re.escape(keyword) + r'\b', re.IGNORECASE)
98 | else:
99 | pattern = re.compile(re.escape(keyword), re.IGNORECASE)
100 | self.keyword_patterns.append((pattern, keyword, category, weight))
101 |
102 | def detect(self, content: str, source: str) -> List[Dict]:
103 | """检测内容中的关键字匹配"""
104 | results = []
105 |
106 | # 如果没有加载关键字,使用默认关键字
107 | if not self.keywords:
108 | self._load_default_keywords()
109 |
110 | try:
111 | # 对每个关键字模式进行匹配
112 | for pattern, original_keyword, category, weight in self.keyword_patterns:
113 | for match in pattern.finditer(content):
114 | # 获取匹配上下文
115 | context = self._get_context(content, match.start(), match.end())
116 |
117 | # 构建结果
118 | result = {
119 | 'keyword': original_keyword,
120 | 'category': self._get_category_name(category),
121 | 'weight': weight,
122 | 'source': source,
123 | 'context': context,
124 | 'match_position': match.start()
125 | }
126 |
127 | # 避免重复添加相同位置的匹配
128 | if not self._is_duplicate_match(results, result):
129 | results.append(result)
130 |
131 | # 按风险权重排序
132 | results.sort(key=lambda x: x['weight'], reverse=True)
133 |
134 | except Exception as e:
135 | logger.error(f"关键字检测失败: {str(e)}", exc_info=True)
136 |
137 | return results
138 |
139 | def _get_category_name(self, category: str) -> str:
140 | """获取类别的中文名称"""
141 | category_names = {
142 | 'gambling': '博彩',
143 | 'porn': '色情',
144 | 'malware': '恶意软件',
145 | 'phishing': '钓鱼',
146 | 'other': '其他'
147 | }
148 |
149 | return category_names.get(category, '其他')
150 |
151 | def _get_context(self, content: str, start: int, end: int, context_size: int = 50) -> str:
152 | """获取匹配内容的上下文"""
153 | start_context = max(0, start - context_size)
154 | end_context = min(len(content), end + context_size)
155 |
156 | context = content[start_context:end_context]
157 | context = context.replace('\n', ' ').replace('\r', ' ')
158 |
159 | # 截断过长的上下文
160 | if len(context) > 200:
161 | context = context[:100] + '...' + context[-100:]
162 |
163 | return context
164 |
165 | def _is_duplicate_match(self, existing_results: List[Dict], new_result: Dict) -> bool:
166 | """检查是否为重复的匹配"""
167 | # 检查是否在相同位置附近有相同关键字的匹配
168 | position = new_result['match_position']
169 | keyword = new_result['keyword']
170 | source = new_result['source']
171 |
172 | for result in existing_results:
173 | if (result['keyword'] == keyword and
174 | result['source'] == source and
175 | abs(result['match_position'] - position) < 10):
176 | return True
177 |
178 | return False
179 |
180 | def get_keyword_statistics(self) -> Dict:
181 | """获取关键字统计信息"""
182 | stats = {
183 | 'total_keywords': len(self.keywords),
184 | 'by_category': {}
185 | }
186 |
187 | # 按类别统计
188 | for _, category, _ in self.keywords:
189 | category_name = self._get_category_name(category)
190 | if category_name not in stats['by_category']:
191 | stats['by_category'][category_name] = 0
192 | stats['by_category'][category_name] += 1
193 |
194 | return stats
195 |
196 | def add_keyword(self, keyword: str, category: str = 'other', weight: int = 5):
197 | """动态添加关键字"""
198 | # 验证参数
199 | if not keyword or not keyword.strip():
200 | logger.warning("尝试添加空关键字,跳过")
201 | return False
202 |
203 | weight = max(1, min(10, weight)) # 限制在1-10范围内
204 |
205 | valid_categories = ['gambling', 'porn', 'malware', 'phishing', 'other']
206 | if category not in valid_categories:
207 | category = 'other'
208 |
209 | # 检查是否已存在
210 | for existing_keyword, _, _ in self.keywords:
211 | if existing_keyword == keyword:
212 | logger.warning(f"关键字 '{keyword}' 已存在")
213 | return False
214 |
215 | # 添加关键字
216 | self.keywords.append((keyword, category, weight))
217 |
218 | # 编译新的模式
219 | if keyword.isascii() and re.fullmatch(r'[A-Za-z]+', keyword) and len(keyword) <= 2:
220 | pattern = re.compile(r'\b' + re.escape(keyword) + r'\b', re.IGNORECASE)
221 | else:
222 | pattern = re.compile(re.escape(keyword), re.IGNORECASE)
223 | self.keyword_patterns.append((pattern, keyword, category, weight))
224 |
225 | logger.info(f"成功添加关键字: {keyword} (类别: {category}, 权重: {weight})")
226 | return True
227 |
228 | def clear_keywords(self):
229 | """清空所有关键字"""
230 | self.keywords = []
231 | self.keyword_patterns = []
232 | logger.info("已清空所有关键字")
233 |
234 |
--------------------------------------------------------------------------------
/utils/html_utils.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | """
4 | HTML处理工具模块
5 | """
6 |
7 | import re
8 | import logging
9 | from typing import List, Dict, Optional
10 | from bs4 import BeautifulSoup, Comment
11 |
12 | logger = logging.getLogger('YuanZhao.utils.html')
13 |
14 | def clean_html(html_content: str) -> str:
15 | """
16 | 清理HTML内容,去除空白字符等
17 |
18 | Args:
19 | html_content: HTML内容
20 |
21 | Returns:
22 | 清理后的HTML内容
23 | """
24 | try:
25 | # 移除多余的空白字符
26 | html_content = re.sub(r'\s+', ' ', html_content)
27 | # 移除首尾空白
28 | html_content = html_content.strip()
29 | return html_content
30 | except Exception as e:
31 | logger.error(f"清理HTML失败: {str(e)}")
32 | return html_content
33 |
34 | def extract_html_comments(html_content: str) -> List[Dict[str, str]]:
35 | """
36 | 提取HTML注释
37 |
38 | Args:
39 | html_content: HTML内容
40 |
41 | Returns:
42 | 注释列表,每项包含注释内容和位置
43 | """
44 | comments = []
45 |
46 | try:
47 | # 使用正则表达式提取注释
48 | comment_pattern = re.compile(r'<!--(.*?)-->', re.DOTALL)
49 | matches = comment_pattern.finditer(html_content)
50 |
51 | for match in matches:
52 | comment_content = match.group(1)
53 | start_pos = match.start(0)
54 | end_pos = match.end(0)
55 |
56 | comments.append({
57 | 'content': comment_content.strip(),
58 | 'position': (start_pos, end_pos)
59 | })
60 |
61 | except Exception as e:
62 | logger.error(f"提取HTML注释失败: {str(e)}")
63 |
64 | return comments
65 |
66 | def extract_script_tags(html_content: str) -> List[Dict[str, str]]:
67 | """
68 | 提取HTML中的script标签
69 |
70 | Args:
71 | html_content: HTML内容
72 |
73 | Returns:
74 | script标签列表
75 | """
76 | scripts = []
77 |
78 | try:
79 | soup = BeautifulSoup(html_content, 'lxml')
80 | script_tags = soup.find_all('script')
81 |
82 | for script in script_tags:
83 | script_info = {
84 | 'src': script.get('src', ''),
85 | 'content': script.string or '',
86 | 'type': script.get('type', ''),
87 | 'language': script.get('language', '')
88 | }
89 |
90 | # 获取script标签的原始字符串
91 | if script: # 确保script不为None
92 | script_info['original_tag'] = str(script)
93 | else:
94 | script_info['original_tag'] = ''
95 |
96 | scripts.append(script_info)
97 |
98 | except Exception as e:
99 | logger.error(f"提取script标签失败: {str(e)}")
100 |
101 | # 如果BeautifulSoup失败,尝试使用正则表达式
102 | try:
103 | script_pattern = re.compile(r'<script[^>]*>(.*?)</script>', re.DOTALL | re.IGNORECASE)
104 | matches = script_pattern.finditer(html_content)
105 |
106 | for match in matches:
107 | scripts.append({
108 | 'src': '',
109 | 'content': match.group(1) or '',
110 | 'type': '',
111 | 'language': '',
112 | 'original_tag': match.group(0)
113 | })
114 | except Exception as fallback_error:
115 | logger.error(f"正则提取script标签也失败: {str(fallback_error)}")
116 |
117 | return scripts
118 |
119 | def extract_link_tags(html_content: str) -> List[Dict[str, str]]:
120 | """
121 | 提取HTML中的link标签
122 |
123 | Args:
124 | html_content: HTML内容
125 |
126 | Returns:
127 | link标签列表
128 | """
129 | links = []
130 |
131 | try:
132 | soup = BeautifulSoup(html_content, 'lxml')
133 | link_tags = soup.find_all('link')
134 |
135 | for link in link_tags:
136 | links.append({
137 | 'href': link.get('href', ''),
138 | 'rel': link.get('rel', ''),
139 | 'type': link.get('type', ''),
140 | 'original_tag': str(link) if link else ''
141 | })
142 |
143 | except Exception as e:
144 | logger.error(f"提取link标签失败: {str(e)}")
145 |
146 | return links
147 |
148 | def extract_meta_tags(html_content: str) -> List[Dict[str, str]]:
149 | """
150 | 提取HTML中的meta标签
151 |
152 | Args:
153 | html_content: HTML内容
154 |
155 | Returns:
156 | meta标签列表
157 | """
158 | metas = []
159 |
160 | try:
161 | soup = BeautifulSoup(html_content, 'lxml')
162 | meta_tags = soup.find_all('meta')
163 |
164 | for meta in meta_tags:
165 | meta_info = {
166 | 'name': meta.get('name', ''),
167 | 'content': meta.get('content', ''),
168 | 'http-equiv': meta.get('http-equiv', ''),
169 | 'charset': meta.get('charset', ''),
170 | 'original_tag': str(meta) if meta else ''
171 | }
172 | metas.append(meta_info)
173 |
174 | except Exception as e:
175 | logger.error(f"提取meta标签失败: {str(e)}")
176 |
177 | return metas
178 |
179 | def extract_iframe_tags(html_content: str) -> List[Dict[str, str]]:
180 | """
181 | 提取HTML中的iframe标签
182 |
183 | Args:
184 | html_content: HTML内容
185 |
186 | Returns:
187 | iframe标签列表
188 | """
189 | iframes = []
190 |
191 | try:
192 | soup = BeautifulSoup(html_content, 'lxml')
193 | iframe_tags = soup.find_all('iframe')
194 |
195 | for iframe in iframe_tags:
196 | iframes.append({
197 | 'src': iframe.get('src', ''),
198 | 'width': iframe.get('width', ''),
199 | 'height': iframe.get('height', ''),
200 | 'style': iframe.get('style', ''),
201 | 'original_tag': str(iframe) if iframe else ''
202 | })
203 |
204 | except Exception as e:
205 | logger.error(f"提取iframe标签失败: {str(e)}")
206 |
207 | return iframes
208 |
209 | def extract_all_tags(html_content: str, tag_name: str) -> List[BeautifulSoup]:
210 | """
211 | 提取指定标签的所有实例
212 |
213 | Args:
214 | html_content: HTML内容
215 | tag_name: 标签名称
216 |
217 | Returns:
218 | 标签列表
219 | """
220 | tags = []
221 |
222 | try:
223 | soup = BeautifulSoup(html_content, 'lxml')
224 | tags = soup.find_all(tag_name)
225 | except Exception as e:
226 | logger.error(f"提取{tag_name}标签失败: {str(e)}")
227 |
228 | return tags
229 |
230 | def get_dom_structure(html_content: str, max_depth: int = 3) -> Dict:
231 | """
232 | 获取DOM结构概览
233 |
234 | Args:
235 | html_content: HTML内容
236 | max_depth: 最大深度
237 |
238 | Returns:
239 | DOM结构字典
240 | """
241 | try:
242 | soup = BeautifulSoup(html_content, 'lxml')
243 |
244 | def _process_element(element, depth):
245 | if depth > max_depth:
246 | return {}
247 |
248 | tag_info = {
249 | 'tag': element.name,
250 | 'attributes': {k: v for k, v in element.attrs.items()},
251 | 'children': []
252 | }
253 |
254 | for child in element.children:
255 | if hasattr(child, 'name') and child.name:
256 | tag_info['children'].append(_process_element(child, depth + 1))
257 |
258 | return tag_info
259 |
260 | return _process_element(soup.find('html') or soup, 0)
261 |
262 | except Exception as e:
263 | logger.error(f"获取DOM结构失败: {str(e)}")
264 | return {}
265 |
266 | def find_hidden_elements(html_content: str) -> List[Dict[str, str]]:
267 | """
268 | 查找可能被隐藏的元素
269 |
270 | Args:
271 | html_content: HTML内容
272 |
273 | Returns:
274 | 隐藏元素列表
275 | """
276 | hidden_elements = []
277 |
278 | try:
279 | soup = BeautifulSoup(html_content, 'lxml')
280 |
281 | # 查找可能隐藏的元素
282 | for element in soup.find_all():
283 | # 检查style属性
284 | style = element.get('style', '').lower()
285 |
286 | if any(hidden in style for hidden in ['display:none', 'visibility:hidden', 'opacity:0']):
287 | hidden_elements.append({
288 | 'tag': element.name,
289 | 'style': style,
290 | 'content': element.get_text(),
291 | 'original_tag': str(element) if element else ''
292 | })
293 |
294 | # 检查hidden属性
295 | if element.get('hidden') is not None:
296 | hidden_elements.append({
297 | 'tag': element.name,
298 | 'reason': 'hidden attribute',
299 | 'content': element.get_text(),
300 | 'original_tag': str(element) if element else ''
301 | })
302 |
303 | except Exception as e:
304 | logger.error(f"查找隐藏元素失败: {str(e)}")
305 |
306 | return hidden_elements
307 |
308 | def extract_text_from_html(html_content: str) -> str:
309 | """
310 | 从HTML中提取纯文本
311 |
312 | Args:
313 | html_content: HTML内容
314 |
315 | Returns:
316 | 提取的纯文本
317 | """
318 | try:
319 | soup = BeautifulSoup(html_content, 'lxml')
320 |
321 | # 移除script和style标签
322 | for script in soup(['script', 'style']):
323 | if script:
324 | script.decompose()
325 |
326 | # 提取文本
327 | text = soup.get_text(separator=' ', strip=True)
328 |
329 | # 清理空白字符
330 | text = re.sub(r'\s+', ' ', text)
331 |
332 | return text
333 |
334 | except Exception as e:
335 | logger.error(f"提取HTML文本失败: {str(e)}")
336 | return html_content
337 |
338 | def remove_html_tags(html_content: str, keep_whitespace: bool = False) -> str:
339 | """
340 | 移除HTML标签
341 |
342 | Args:
343 | html_content: HTML内容
344 | keep_whitespace: 是否保留空白
345 |
346 | Returns:
347 | 移除标签后的文本
348 | """
349 | try:
350 | # 使用正则表达式移除标签
351 | text = re.sub(r'<[^>]+>', '', html_content)
352 |
353 | if not keep_whitespace:
354 | # 移除多余的空白字符
355 | text = re.sub(r'\s+', ' ', text).strip()
356 |
357 | return text
358 |
359 | except Exception as e:
360 | logger.error(f"移除HTML标签失败: {str(e)}")
361 | return html_content
362 |
363 | def get_character_encoding(html_content: str) -> Optional[str]:
364 | """
365 | 获取HTML文档的字符编码
366 |
367 | Args:
368 | html_content: HTML内容
369 |
370 | Returns:
371 | 字符编码
372 | """
373 | try:
374 | # 检查meta标签中的charset
375 | charset_match = re.search(r'<meta[^>]+charset=["\']?([^"\'>\s]+)', html_content, re.IGNORECASE)
376 | if charset_match:
377 | return charset_match.group(1).lower()
378 |
379 | # 检查http-equiv中的content-type
380 | content_type_match = re.search(r'<meta[^>]+http-equiv=["\']?content-type["\']?[^>]*content=["\']?[^"\']*charset=([^"\'>\s;]+)', html_content, re.IGNORECASE)
381 | if content_type_match:
382 | return content_type_match.group(1).lower()
383 |
384 | return None
385 |
386 | except Exception as e:
387 | logger.error(f"获取字符编码失败: {str(e)}")
388 | return None
389 |
390 | # 兼容性函数,为了支持html_detector.py中的导入
391 | def extract_comments(html_content: str) -> List[Dict[str, str]]:
392 | """
393 | 提取HTML注释(extract_html_comments的别名)
394 |
395 | Args:
396 | html_content: HTML内容
397 |
398 | Returns:
399 | 注释列表
400 | """
401 | return extract_html_comments(html_content)
402 |
--------------------------------------------------------------------------------
/utils/common_utils.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | """
4 | 通用工具模块
5 | """
6 |
7 | import re
8 | import time
9 | import hashlib
10 | import logging
11 | import os
12 | from typing import List, Dict, Any, Optional, Tuple
13 |
14 | logger = logging.getLogger('YuanZhao.utils.common')
15 |
16 | def calculate_file_hash(file_path: str, hash_type: str = 'md5') -> Optional[str]:
17 | """
18 | 计算文件哈希值
19 |
20 | Args:
21 | file_path: 文件路径
22 | hash_type: 哈希算法类型 (md5, sha1, sha256)
23 |
24 | Returns:
25 | 哈希值字符串
26 | """
27 | try:
28 | hash_func = getattr(hashlib, hash_type)
29 | hash_obj = hash_func()
30 |
31 | with open(file_path, 'rb') as f:
32 | while True:
33 | data = f.read(65536) # 64KB chunks
34 | if not data:
35 | break
36 | hash_obj.update(data)
37 |
38 | return hash_obj.hexdigest()
39 |
40 | except Exception as e:
41 | logger.error(f"计算文件哈希失败: {file_path}, 错误: {str(e)}")
42 | return None
43 |
44 | def calculate_string_hash(string: str, hash_type: str = 'md5') -> Optional[str]:
45 | """
46 | 计算字符串哈希值
47 |
48 | Args:
49 | string: 输入字符串
50 | hash_type: 哈希算法类型
51 |
52 | Returns:
53 | 哈希值字符串
54 | """
55 | try:
56 | hash_func = getattr(hashlib, hash_type)
57 | return hash_func(string.encode('utf-8')).hexdigest()
58 | except Exception as e:
59 | logger.error(f"计算字符串哈希失败: {str(e)}")
60 | return None
61 |
62 | def clean_text(text: str) -> str:
63 | """
64 | 清理文本,去除控制字符和多余空白
65 |
66 | Args:
67 | text: 输入文本
68 |
69 | Returns:
70 | 清理后的文本
71 | """
72 | try:
73 | # 移除控制字符,保留换行和制表符
74 | text = ''.join(char for char in text if char.isprintable() or char in '\n\t')
75 | # 清理多余空白
76 | text = re.sub(r'\s+', ' ', text)
77 | return text.strip()
78 | except Exception as e:
79 | logger.error(f"清理文本失败: {str(e)}")
80 | return text
81 |
82 | def extract_text_between(text: str, start_marker: str, end_marker: str) -> List[str]:
83 | """
84 | 提取两个标记之间的文本
85 |
86 | Args:
87 | text: 原始文本
88 | start_marker: 开始标记
89 | end_marker: 结束标记
90 |
91 | Returns:
92 | 提取的文本列表
93 | """
94 | try:
95 | pattern = re.compile(re.escape(start_marker) + '(.*?)' + re.escape(end_marker), re.DOTALL)
96 | return pattern.findall(text)
97 | except Exception as e:
98 | logger.error(f"提取文本失败: {str(e)}")
99 | return []
100 |
101 | def detect_encoding(text: str) -> Optional[str]:
102 | """
103 | 检测文本编码(传入为 str 时返回默认编码)
104 | """
105 | try:
106 | # 对已解码的 str 返回 utf-8,避免误导性“探测”
107 | return 'utf-8'
108 | except Exception as e:
109 | logger.error(f"检测编码失败: {str(e)}")
110 | return None
111 |
112 | def safe_decode(bytes_data: bytes, default_encoding: str = 'utf-8') -> str:
113 | """
114 | 安全解码字节数据
115 |
116 | Args:
117 | bytes_data: 字节数据
118 | default_encoding: 默认编码
119 |
120 | Returns:
121 | 解码后的字符串
122 | """
123 | try:
124 | # 尝试多种编码
125 | encodings = [default_encoding, 'gbk', 'gb2312', 'iso-8859-1']
126 |
127 | for encoding in encodings:
128 | try:
129 | return bytes_data.decode(encoding)
130 | except UnicodeDecodeError:
131 | continue
132 |
133 | # 如果都失败,使用replace模式
134 | return bytes_data.decode(default_encoding, errors='replace')
135 |
136 | except Exception as e:
137 | logger.error(f"安全解码失败: {str(e)}")
138 | return str(bytes_data)
139 |
140 | def format_size(size_bytes: int) -> str:
141 | """
142 | 格式化文件大小
143 |
144 | Args:
145 | size_bytes: 字节大小
146 |
147 | Returns:
148 | 格式化的大小字符串
149 | """
150 | try:
151 | for unit in ['B', 'KB', 'MB', 'GB', 'TB']:
152 | if size_bytes < 1024.0:
153 | return f"{size_bytes:.2f} {unit}"
154 | size_bytes /= 1024.0
155 | return f"{size_bytes:.2f} PB"
156 | except Exception as e:
157 | logger.error(f"格式化大小失败: {str(e)}")
158 | return f"{size_bytes} B"
159 |
160 | def format_time(seconds: float) -> str:
161 | """
162 | 格式化时间
163 |
164 | Args:
165 | seconds: 秒数
166 |
167 | Returns:
168 | 格式化的时间字符串
169 | """
170 | try:
171 | if seconds < 1:
172 | return f"{seconds * 1000:.2f} ms"
173 | elif seconds < 60:
174 | return f"{seconds:.2f} s"
175 | elif seconds < 3600:
176 | minutes, seconds = divmod(seconds, 60)
177 | return f"{int(minutes)} m {seconds:.2f} s"
178 | else:
179 | hours, remainder = divmod(seconds, 3600)
180 | minutes, seconds = divmod(remainder, 60)
181 | return f"{int(hours)} h {int(minutes)} m {seconds:.2f} s"
182 | except Exception as e:
183 | logger.error(f"格式化时间失败: {str(e)}")
184 | return f"{seconds} s"
185 |
186 | def get_file_extension(file_path: str) -> str:
187 | """
188 | 获取文件扩展名
189 |
190 | Args:
191 | file_path: 文件路径
192 |
193 | Returns:
194 | 扩展名(小写)
195 | """
196 | try:
197 | _, ext = os.path.splitext(file_path.lower())
198 | return ext
199 | except Exception as e:
200 | logger.error(f"获取文件扩展名失败: {str(e)}")
201 | return ''
202 |
203 | def validate_ip_address(ip: str) -> bool:
204 | """
205 | 验证IP地址格式
206 |
207 | Args:
208 | ip: IP地址字符串
209 |
210 | Returns:
211 | 是否为有效IP地址
212 | """
213 | try:
214 | # IPv4地址验证
215 | pattern = re.compile(r'^((25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$')
216 | return bool(pattern.match(ip))
217 | except Exception as e:
218 | logger.error(f"验证IP地址失败: {str(e)}")
219 | return False
220 |
221 | def count_occurrences(text: str, keyword: str, case_sensitive: bool = False) -> int:
222 | """
223 | 统计关键字出现次数
224 |
225 | Args:
226 | text: 文本内容
227 | keyword: 关键字
228 | case_sensitive: 是否区分大小写
229 |
230 | Returns:
231 | 出现次数
232 | """
233 | try:
234 | if not case_sensitive:
235 | text = text.lower()
236 | keyword = keyword.lower()
237 |
238 | return text.count(keyword)
239 | except Exception as e:
240 | logger.error(f"统计关键字失败: {str(e)}")
241 | return 0
242 |
243 | def is_valid_email(email: str) -> bool:
244 | """
245 | 验证邮箱格式
246 |
247 | Args:
248 | email: 邮箱地址
249 |
250 | Returns:
251 | 是否为有效邮箱
252 | """
253 | try:
254 | pattern = re.compile(r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$')
255 | return bool(pattern.match(email))
256 | except Exception as e:
257 | logger.error(f"验证邮箱失败: {str(e)}")
258 | return False
259 |
260 | def sanitize_filename(filename: str) -> str:
261 | """
262 | 清理文件名,移除特殊字符
263 |
264 | Args:
265 | filename: 原始文件名
266 |
267 | Returns:
268 | 清理后的文件名
269 | """
270 | try:
271 | # 移除或替换特殊字符
272 | sanitized = re.sub(r'[\\/:*?"<>|]', '_', filename)
273 | # 移除控制字符
274 | sanitized = ''.join(char for char in sanitized if char.isprintable() or char.isspace())
275 | # 限制长度
276 | max_length = 200
277 | if len(sanitized) > max_length:
278 | name, ext = os.path.splitext(sanitized)
279 | sanitized = name[:max_length - len(ext)] + ext
280 | return sanitized.strip() or 'unnamed'
281 | except Exception as e:
282 | logger.error(f"清理文件名失败: {str(e)}")
283 | return 'unnamed'
284 |
285 | def merge_dicts(dict1: Dict, dict2: Dict, deep: bool = True) -> Dict:
286 | """
287 | 合并两个字典
288 |
289 | Args:
290 | dict1: 第一个字典
291 | dict2: 第二个字典
292 | deep: 是否深度合并
293 |
294 | Returns:
295 | 合并后的字典
296 | """
297 | try:
298 | result = dict1.copy()
299 |
300 | if deep:
301 | for key, value in dict2.items():
302 | if key in result and isinstance(result[key], dict) and isinstance(value, dict):
303 | result[key] = merge_dicts(result[key], value, deep=True)
304 | else:
305 | result[key] = value
306 | else:
307 | result.update(dict2)
308 |
309 | return result
310 | except Exception as e:
311 | logger.error(f"合并字典失败: {str(e)}")
312 | return dict1
313 |
314 | def remove_duplicates_preserve_order(items: List) -> List:
315 | """
316 | 移除列表中的重复项,保留原始顺序
317 |
318 | Args:
319 | items: 输入列表
320 |
321 | Returns:
322 | 去重后的列表
323 | """
324 | try:
325 | seen = set()
326 | return [item for item in items if not (item in seen or seen.add(item))]
327 | except Exception as e:
328 | logger.error(f"去重失败: {str(e)}")
329 | return items
330 |
331 | def truncate_text(text: str, max_length: int, suffix: str = '...') -> str:
332 | """
333 | 截断文本
334 |
335 | Args:
336 | text: 输入文本
337 | max_length: 最大长度
338 | suffix: 后缀
339 |
340 | Returns:
341 | 截断后的文本
342 | """
343 | try:
344 | if len(text) <= max_length:
345 | return text
346 | return text[:max_length - len(suffix)] + suffix
347 | except Exception as e:
348 | logger.error(f"截断文本失败: {str(e)}")
349 | return text
350 |
351 | def retry(func, max_retries: int = 3, delay: float = 1.0, exceptions: tuple = (Exception,)) -> Any:
352 | """
353 |     重试包装器:为给定函数增加重试逻辑,返回包装后的函数
354 |
355 | Args:
356 | func: 要重试的函数
357 | max_retries: 最大重试次数
358 | delay: 重试间隔(秒)
359 | exceptions: 捕获的异常类型
360 |
361 | Returns:
362 |         包装后的函数;调用该包装函数时返回原函数的执行结果
363 | """
364 | def wrapper(*args, **kwargs):
365 | last_exception = None
366 |
367 | for attempt in range(max_retries):
368 | try:
369 | return func(*args, **kwargs)
370 | except exceptions as e:
371 | last_exception = e
372 | if attempt < max_retries - 1:
373 | logger.warning(f"尝试 {attempt + 1}/{max_retries} 失败: {str(e)}, {delay}秒后重试...")
374 | time.sleep(delay)
375 |
376 | logger.error(f"所有尝试都失败了: {str(last_exception)}")
377 | raise last_exception
378 |
379 | return wrapper
380 |
381 | # 移除末尾的导入语句
382 |
383 | # 兼容性函数,为了支持html_detector.py中的导入
384 | def extract_text_between_markers(text: str, start_marker: str, end_marker: str) -> List[str]:
385 | """
386 | 提取两个标记之间的文本(extract_text_between的别名)
387 |
388 | Args:
389 | text: 原始文本
390 | start_marker: 开始标记
391 | end_marker: 结束标记
392 |
393 | Returns:
394 | 提取的文本列表
395 | """
396 | return extract_text_between(text, start_marker, end_marker)
397 |
398 | def get_context(text: str, position: int, context_length: int = 50) -> str:
399 | """
400 | 获取文本中指定位置的上下文
401 |
402 | Args:
403 | text: 原始文本
404 | position: 目标位置
405 | context_length: 上下文长度
406 |
407 | Returns:
408 | 包含上下文的文本
409 | """
410 | try:
411 | # 计算上下文的起始和结束位置
412 | context_start = max(0, position - context_length)
413 | context_end = min(len(text), position + context_length)
414 |
415 | # 提取上下文
416 | context = text[context_start:context_end]
417 |
418 | # 添加省略号
419 | prefix = '...' if context_start > 0 else ''
420 | suffix = '...' if context_end < len(text) else ''
421 |
422 | return f"{prefix}{context}{suffix}"
423 | except Exception as e:
424 | logger.error(f"获取上下文失败: {str(e)}")
425 | return text
426 |
427 | def calculate_entropy(text: str) -> float:
428 | """
429 | 计算文本的熵值
430 |
431 | Args:
432 | text: 输入文本
433 |
434 | Returns:
435 | 熵值
436 | """
437 | try:
438 | import math
439 |
440 | # 计算字符频率
441 | frequency = {}
442 | for char in text:
443 | if char in frequency:
444 | frequency[char] += 1
445 | else:
446 | frequency[char] = 1
447 |
448 | # 计算熵
449 | entropy = 0.0
450 | total_chars = len(text)
451 |
452 | for count in frequency.values():
453 | probability = count / total_chars
454 | entropy -= probability * math.log2(probability)
455 |
456 | return entropy
457 | except Exception as e:
458 | logger.error(f"计算熵值失败: {str(e)}")
459 | return 0.0
460 |
461 |
--------------------------------------------------------------------------------
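Usage sketch for the general-purpose helpers above. The import path utils.common_utils and the sample string are assumptions for illustration only; note that retry() returns a wrapper function rather than acting as a decorator.

from pathlib import Path
from utils.common_utils import calculate_entropy, retry, truncate_text

# High entropy hints at encoded or obfuscated payloads.
sample = "aHR0cHM6Ly9leGFtcGxlLmNvbS9oaWRkZW4="
print(f"entropy={calculate_entropy(sample):.2f}")

# retry() wraps a callable with retry logic and returns the wrapper.
read_targets = retry(lambda: Path("targets_test.txt").read_text(encoding="utf-8"),
                     max_retries=2, delay=0.5, exceptions=(OSError,))
print(truncate_text(read_targets(), 60))
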
/YuanZhao.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | """
5 | 渊照 - 暗链扫描工具
6 | """
7 |
8 | import os
9 | import sys
10 | import argparse
11 | import logging
12 | import re
13 | from datetime import datetime
14 | from urllib.parse import urlparse
15 |
16 | # 添加项目根目录到Python路径
17 | sys.path.append(os.path.dirname(os.path.abspath(__file__)))
18 |
19 | from utils.logging_utils import setup_logging, log_config, log_summary
20 | from core.config import Config
21 | from core.scanner import Scanner
22 | from core.reporter import Reporter
23 |
24 | def parse_arguments():
25 | """
26 | 解析命令行参数
27 | """
28 | description = '''渊照 - 专业暗链扫描工具
29 |
30 | 用于智能检测网站、HTML文件或目录中的可疑暗链、隐藏元素和恶意代码。
31 | 支持自动识别扫描目标类型(本地文件/目录、内网URL、公网URL),并应用最优扫描策略。
32 | 提供多种扫描模式和报告格式,具备强大的检测能力和灵活的配置选项。
33 |
34 | 主要功能:
35 | - 基础扫描:HTML代码、JavaScript代码、CSS代码、元标签、注释扫描
36 | - 高级扫描:加密/编码链接检测、隐写术检测、DOM操作检测、iframe检测
37 | - 特殊隐藏手法检测:颜色隐藏、绝对定位隐藏、零宽字符隐藏、字体大小隐藏等
38 | - 关键字匹配:支持自定义关键字文件,按类别组织关键字,进行多语言匹配
39 | - 优化的HTML报告:清晰展示可疑链接信息,上下文列直接显示从日志中检测到的完整问题
40 | '''
41 |
42 | parser = argparse.ArgumentParser(description=description, formatter_class=argparse.RawDescriptionHelpFormatter)
43 |
44 | # 扫描目标
45 | parser.add_argument('target', help='扫描目标:文件路径、目录路径或URL(支持http/https协议)')
46 |
47 | # 扫描配置
48 | parser.add_argument('-d', '--depth', type=int, default=3,
49 | help='递归扫描深度(默认:3,0表示仅扫描当前文件/目录)')
50 | parser.add_argument('-m', '--mode', choices=['fast', 'standard', 'deep'], default='deep',
51 | help='''扫描模式:
52 | fast(基础):仅检测基本的暗链与明显可疑元素,快速
53 | standard(高级):增加JS/HTML/CSS分析与隐藏元素检测
54 | deep(完整):执行全部检测模块,适合深度审计''')
55 | parser.add_argument('-t', '--threads', type=int, default=8,
56 | help='并发线程数(默认:8,范围1-100)')
57 | parser.add_argument('-o', '--output', help='报告输出目录(默认:./reports)')
58 | parser.add_argument('-f', '--format', choices=['txt', 'html', 'json', 'csv'], default='txt',
59 | help='''报告格式(默认:txt):
60 | txt:简洁的文本报告,适合快速查看和日志记录
61 | html:详细的网页报告,包含样式和表格,上下文列直接显示问题链接
62 | json:结构化数据,适合程序处理和自动化集成
63 | csv:表格数据,适合导入电子表格软件进行进一步分析''')
64 |
65 | # 高级配置
66 | parser.add_argument('--timeout', type=int, default=30,
67 | help='请求超时时间(秒,默认:30)。注意:工具会根据目标类型(内网/公网)自动优化超时设置')
68 | parser.add_argument('--proxy', help='''代理设置,格式:
69 | http://username:password@host:port(有认证)或
70 | http://host:port(无认证)''')
71 | parser.add_argument('--keyword-file', help='''自定义关键字文件路径(CSV格式)
72 | 格式示例:关键字,类别,风险权重
73 | 类别可选:gambling(博彩), porn(色情), malware(恶意软件), phishing(钓鱼), other(其他)
74 | 风险权重范围:1-10,10为最高风险''')
75 | parser.add_argument('--exclude', nargs='+', help='排除的文件或目录(支持通配符,如 "*.log" "node_modules/")')
76 | parser.add_argument('--no-color', action='store_true', help='禁用彩色输出')
77 | parser.add_argument('--verbose', action='store_true', default=False, help='显示详细日志信息,包括检测过程和调试内容')
78 |
79 | # 无头浏览器选项
80 | parser.add_argument('--headless', action='store_true', help='启用无头浏览器扫描 (增强动态内容检测)')
81 | parser.add_argument('--browser-type', choices=['chrome'], default='chrome', help='无头浏览器类型 (默认: chrome)')
82 | parser.add_argument('--js-wait', type=int, default=3, help='JavaScript执行等待时间 (秒, 默认: 3)')
83 | parser.add_argument('--headless-timeout', type=int, default=60, help='无头浏览器超时时间 (秒, 默认: 60)')
84 |     parser.add_argument('--headless-binary', help='Chrome二进制路径 (例如: C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe)')
85 |     parser.add_argument('--headless-driver', help='ChromeDriver路径 (例如: C:\\drivers\\chromedriver.exe)')
86 | parser.add_argument('--target-file', help='目标列表文件,每行一个目标')
87 |
88 | # 添加使用示例
89 | parser.epilog = '''
90 | 使用示例:
91 | # 扫描单个HTML文件
92 | python YuanZhao.py test.html
93 |
94 | # 扫描目录及其子目录(深度为2)
95 | python YuanZhao.py ./website -d 2
96 |
97 | # 扫描URL,使用高级模式,保存为HTML格式报告
98 | python YuanZhao.py https://example.com -m standard -f html
99 |
100 | # 使用自定义关键字文件,禁用彩色输出
101 | python YuanZhao.py ./website --keyword-file custom_keywords.txt --no-color
102 |
103 | # 完整扫描公网网站并生成HTML报告(优化后格式,在上下文列显示完整问题链接)
104 | python YuanZhao.py https://example.com -m deep -d 1 -t 8 --timeout 30 -f html --verbose
105 |
106 | # 扫描特定新闻页面并在可疑链接详情中显示问题信息
107 | python YuanZhao.py https://example.com/news.php -m deep -d 1 -t 8 --timeout 30 -f html --verbose
108 |
109 | # 对内网网站进行深度扫描,使用较长超时时间
110 | python YuanZhao.py http://192.168.1.100 -d 4 -m deep --timeout 60 -f html -o intranet_reports
111 |
112 | # 扫描并排除特定文件类型
113 | python YuanZhao.py ./website --exclude "*.log" "temp/*" "node_modules/"
114 |
115 | # 使用无头浏览器增强扫描动态内容
116 | python YuanZhao.py https://example.com --headless --js-wait 5
117 | '''
118 |
119 | return parser.parse_args()
120 |
121 | def validate_arguments(args):
122 | """
123 | 验证命令行参数
124 | """
125 | # 验证目标是否存在(如果是文件或目录)
126 | if not args.target.startswith(('http://', 'https://')):
127 | if not os.path.exists(args.target):
128 | print(f"错误:目标 '{args.target}' 不存在")
129 | return False
130 | if args.target.lower().endswith('.txt'):
131 | try:
132 | with open(args.target, 'r', encoding='utf-8') as f:
133 | lines = [line.strip() for line in f.readlines() if line.strip()]
134 | if not lines:
135 | print("错误:目标列表文件为空")
136 | return False
137 | except Exception:
138 | print("错误:无法读取目标列表文件")
139 | return False
140 |
141 | # 验证关键字文件
142 | if args.keyword_file and not os.path.exists(args.keyword_file):
143 | print(f"错误:关键字文件 '{args.keyword_file}' 不存在")
144 | return False
145 | if args.target_file and not os.path.exists(args.target_file):
146 | print(f"错误:目标列表文件 '{args.target_file}' 不存在")
147 | return False
148 |
149 | # 验证线程数
150 | if args.threads < 1 or args.threads > 100:
151 | print("错误:线程数必须在1-100之间")
152 | return False
153 |
154 | # 验证扫描深度
155 | if args.depth < 0:
156 | print("错误:扫描深度不能为负数")
157 | return False
158 |
159 | return True
160 |
161 | def main():
162 | """
163 | 主函数
164 | """
165 | # 解析参数
166 | args = parse_arguments()
167 |
168 | # 验证参数
169 | if not validate_arguments(args):
170 | sys.exit(1)
171 |
172 | # 创建报告目录
173 | report_dir = args.output or os.path.join(os.getcwd(), 'reports')
174 | os.makedirs(report_dir, exist_ok=True)
175 |
176 | # 设置日志
177 | log_level = logging.DEBUG if args.verbose else logging.INFO
178 | logger = setup_logging(log_dir=report_dir, level=log_level)
179 |
180 | # 记录开始时间
181 | start_time = datetime.now()
182 | logger.info(f"开始扫描:{args.target}")
183 | logger.info(f"扫描模式:{args.mode}")
184 |
185 | # 创建配置
186 | config = Config()
187 |
188 | # 设置配置属性
189 | # 判断目标类型
190 | if args.target.startswith(('http://', 'https://')):
191 | # 检查是否为内网链接
192 | parsed_url = urlparse(args.target)
193 | domain = parsed_url.netloc
194 | # 内网域名/IP特征
195 | if (re.match(r'^127\.0\.0\.1(:\d+)?$', domain) or
196 | re.match(r'^localhost(:\d+)?$', domain) or
197 | re.match(r'^10\.\d+\.\d+\.\d+(:\d+)?$', domain) or
198 | re.match(r'^172\.(?:1[6-9]|2\d|3[01])\.\d+\.\d+(:\d+)?$', domain) or
199 | re.match(r'^192\.168\.\d+\.\d+(:\d+)?$', domain)):
200 | config.target_type = 'internal_url'
201 | else:
202 | config.target_type = 'external_url'
203 | elif os.path.isfile(args.target):
204 | config.target_type = 'local_file'
205 | elif os.path.isdir(args.target):
206 | config.target_type = 'local_directory'
207 | else:
208 | config.target_type = 'unknown'
209 |
210 | config.target = args.target
211 | config.crawl_depth = args.depth
212 | config.depth = args.depth # 同步更新depth属性
213 |
214 | # 映射扫描模式(仅使用新名称)
215 | mode_mapping = {
216 | 'fast': 'fast',
217 | 'standard': 'standard',
218 | 'deep': 'deep'
219 | }
220 | config.scan_mode = mode_mapping.get(args.mode, 'standard')
221 | config.mode = config.scan_mode # 同步更新mode属性
222 | config._set_mode_config() # 更新模式相关配置
223 |
224 | config.threads = args.threads
225 | config.timeout = args.timeout
226 | config.proxy = args.proxy
227 | config.keywords_file = args.keyword_file
228 | config.report_type = args.format
229 | config.report_file = os.path.join(report_dir, f"scan_report_{datetime.now().strftime('%Y%m%d_%H%M%S')}.{args.format}")
230 | config.debug = args.verbose
231 | # 排除规则
232 | config.exclude = args.exclude or []
233 |
234 | # 设置无头浏览器配置
235 | config.use_headless_browser = args.headless
236 | config.headless_browser = args.browser_type
237 | config.js_wait_time = args.js_wait
238 | config.headless_timeout = args.headless_timeout
239 | config.headless_binary = args.headless_binary
240 | config.headless_driver_path = args.headless_driver
241 | if args.headless:
242 | config.headless_auto_download = True
243 |
244 | # 记录配置
245 | log_config(logger, config.get_config_dict())
246 |
247 | try:
248 | targets = []
249 | if args.target_file:
250 | with open(args.target_file, 'r', encoding='utf-8') as f:
251 | targets = [line.strip() for line in f.readlines() if line.strip()]
252 | summary_target = f"目标列表: {args.target_file} ({len(targets)} 项)"
253 | elif not args.target.startswith(('http://', 'https://')) and args.target.lower().endswith('.txt'):
254 | with open(args.target, 'r', encoding='utf-8') as f:
255 | targets = [line.strip() for line in f.readlines() if line.strip()]
256 | summary_target = f"目标列表: {args.target} ({len(targets)} 项)"
257 | else:
258 | targets = [args.target]
259 | summary_target = args.target
260 | agg = {
261 | 'total_files': 0,
262 | 'scanned_files': 0,
263 | 'scanned_urls': 0,
264 | 'total_issues': 0,
265 | 'suspicious_links': [],
266 | 'hidden_elements': [],
267 | 'keyword_matches': [],
268 | 'js_issues': [],
269 | 'css_issues': [],
270 | 'scan_time': 0
271 | }
272 | for tgt in targets:
273 | if tgt.startswith(('http://', 'https://')):
274 | parsed_url = urlparse(tgt)
275 | domain = parsed_url.netloc
276 | if (re.match(r'^127\.0\.0\.1(:\d+)?$', domain) or
277 | re.match(r'^localhost(:\d+)?$', domain) or
278 | re.match(r'^10\.\d+\.\d+\.\d+(:\d+)?$', domain) or
279 | re.match(r'^172\.(?:1[6-9]|2\d|3[01])\.\d+\.\d+(:\d+)?$', domain) or
280 | re.match(r'^192\.168\.\d+\.\d+(:\d+)?$', domain)):
281 | config.target_type = 'internal_url'
282 | else:
283 | config.target_type = 'external_url'
284 | elif os.path.isfile(tgt):
285 | config.target_type = 'local_file'
286 | elif os.path.isdir(tgt):
287 | config.target_type = 'local_directory'
288 | else:
289 | continue
290 | config.target = tgt
291 | scanner = Scanner(config)
292 | res = scanner.scan()
293 | agg['total_files'] += res.get('total_files', 0)
294 | agg['scanned_files'] += res.get('scanned_files', 0)
295 | agg['scanned_urls'] += res.get('scanned_urls', 0)
296 | agg['total_issues'] += res.get('total_issues', 0)
297 | agg['suspicious_links'].extend(res.get('suspicious_links', []))
298 | agg['hidden_elements'].extend(res.get('hidden_elements', []))
299 | agg['keyword_matches'].extend(res.get('keyword_matches', []))
300 | agg['js_issues'].extend(res.get('js_issues', []))
301 | agg['css_issues'].extend(res.get('css_issues', []))
302 | end_time = datetime.now()
303 | duration = str(end_time - start_time)
304 | config.target = summary_target
305 | reporter = Reporter(config)
306 | report_file = reporter.generate_report(agg, duration)
307 | scan_time = (end_time - start_time).total_seconds()
308 | log_summary(
309 | logger,
310 | total_files=agg.get('total_files', 0),
311 | scanned_files=agg.get('scanned_files', 0),
312 | issues_found=agg.get('total_issues', 0),
313 | scan_time=scan_time
314 | )
315 | logger.info(f"扫描完成!报告已保存至:{report_file}")
316 | print(f"\n扫描完成!报告已保存至:{report_file}")
317 |
318 | except Exception as e:
319 | logger.error(f"扫描过程中发生错误:{str(e)}", exc_info=True)
320 | print(f"错误:扫描过程中发生错误 - {str(e)}")
321 | sys.exit(1)
322 |
323 | if __name__ == '__main__':
324 | main()
325 |
326 |
--------------------------------------------------------------------------------
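YuanZhao.py classifies a URL as internal_url or external_url with hand-written regexes for loopback, 10.x, 172.16-31.x and 192.168.x addresses (once before and once inside the target loop). The snippet below is only a sketch of an equivalent check built on the standard-library ipaddress module; it is not how the script currently does it, and hostnames other than localhost would still be treated as external.

import ipaddress
from urllib.parse import urlparse

def is_internal_url(url: str) -> bool:
    # Sketch: treat loopback/private/link-local IP literals and "localhost" as internal.
    host = urlparse(url).hostname or ''
    if host == 'localhost':
        return True
    try:
        addr = ipaddress.ip_address(host)
    except ValueError:
        return False  # not an IP literal, assume external
    return addr.is_private or addr.is_loopback or addr.is_link_local

# is_internal_url('http://192.168.1.100')  -> True
# is_internal_url('https://example.com')   -> False
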
/utils/css_utils.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | """
4 | CSS处理工具模块
5 | """
6 |
7 | import re
8 | import logging
9 | from typing import List, Dict, Optional
10 |
11 | logger = logging.getLogger('YuanZhao.utils.css')
12 |
13 | def extract_css_urls(css_content: str) -> List[Dict[str, str]]:
14 | """
15 | 提取CSS中的URL
16 |
17 | Args:
18 | css_content: CSS内容
19 |
20 | Returns:
21 | URL列表
22 | """
23 | urls = []
24 |
25 | try:
26 | # 匹配CSS中的url()函数
27 | url_pattern = re.compile(r'url\s*\(\s*(["\']?)([^"\'\)]+)\1\s*\)', re.IGNORECASE)
28 | matches = url_pattern.finditer(css_content)
29 |
30 | for match in matches:
31 | url = match.group(2)
32 | start_pos = match.start(0)
33 | end_pos = match.end(0)
34 |
35 | # 获取上下文
36 | context_start = max(0, start_pos - 50)
37 | context_end = min(len(css_content), end_pos + 50)
38 | context = css_content[context_start:context_end]
39 |
40 | urls.append({
41 | 'url': url,
42 | 'original': match.group(0),
43 | 'context': context,
44 | 'position': (start_pos, end_pos)
45 | })
46 |
47 | except Exception as e:
48 | logger.error(f"提取CSS URL失败: {str(e)}")
49 |
50 | return urls
51 |
52 | def extract_import_rules(css_content: str) -> List[Dict[str, str]]:
53 | """
54 | 提取CSS中的@import规则
55 |
56 | Args:
57 | css_content: CSS内容
58 |
59 | Returns:
60 | @import规则列表
61 | """
62 | import_rules = []
63 |
64 | try:
65 | # 匹配@import规则
66 | import_pattern = re.compile(r'@import\s+(["\']?)([^"\';\n]+)\1\s*([^;\n]*)\s*;', re.IGNORECASE)
67 | matches = import_pattern.finditer(css_content)
68 |
69 | for match in matches:
70 | url = match.group(2)
71 | media = match.group(3)
72 | start_pos = match.start(0)
73 | end_pos = match.end(0)
74 |
75 | import_rules.append({
76 | 'url': url,
77 | 'media': media,
78 | 'original': match.group(0),
79 | 'position': (start_pos, end_pos)
80 | })
81 |
82 | except Exception as e:
83 | logger.error(f"提取CSS @import规则失败: {str(e)}")
84 |
85 | return import_rules
86 |
87 | def extract_selectors(css_content: str) -> List[Dict[str, str]]:
88 | """
89 | 提取CSS选择器
90 |
91 | Args:
92 | css_content: CSS内容
93 |
94 | Returns:
95 | 选择器列表
96 | """
97 | selectors = []
98 |
99 | try:
100 | # 移除注释
101 | css_content = remove_css_comments(css_content)
102 |
103 | # 匹配CSS规则
104 | rule_pattern = re.compile(r'([^{]+)\s*{[^}]*}', re.DOTALL)
105 | rules = rule_pattern.finditer(css_content)
106 |
107 | for rule in rules:
108 | selector_text = rule.group(1).strip()
109 |
110 | # 分割多个选择器
111 | for selector in selector_text.split(','):
112 | selector = selector.strip()
113 | if selector:
114 | selectors.append({
115 | 'selector': selector,
116 | 'position': (rule.start(1), rule.end(1))
117 | })
118 |
119 | except Exception as e:
120 | logger.error(f"提取CSS选择器失败: {str(e)}")
121 |
122 | return selectors
123 |
124 | def extract_css_properties(css_content: str) -> List[Dict[str, str]]:
125 | """
126 | 提取CSS属性
127 |
128 | Args:
129 | css_content: CSS内容
130 |
131 | Returns:
132 | CSS属性列表
133 | """
134 | properties = []
135 |
136 | try:
137 | # 移除注释
138 | css_content = remove_css_comments(css_content)
139 |
140 | # 匹配CSS规则体
141 | body_pattern = re.compile(r'\{([^}]*)\}', re.DOTALL)
142 | bodies = body_pattern.finditer(css_content)
143 |
144 | for body in bodies:
145 | body_content = body.group(1)
146 | body_start = body.start(1)
147 |
148 | # 匹配属性
149 | prop_pattern = re.compile(r'([^:;\s]+)\s*:\s*([^;]+);')
150 | props = prop_pattern.finditer(body_content)
151 |
152 | for prop in props:
153 | prop_name = prop.group(1).strip()
154 | prop_value = prop.group(2).strip()
155 |
156 | properties.append({
157 | 'property': prop_name,
158 | 'value': prop_value,
159 | 'position': (body_start + prop.start(1), body_start + prop.end(1))
160 | })
161 |
162 | except Exception as e:
163 | logger.error(f"提取CSS属性失败: {str(e)}")
164 |
165 | return properties
166 |
167 | def detect_hidden_elements(css_content: str) -> List[Dict[str, str]]:
168 | """
169 | 检测可能用于隐藏元素的CSS规则
170 |
171 | Args:
172 | css_content: CSS内容
173 |
174 | Returns:
175 | 隐藏规则列表
176 | """
177 | hidden_rules = []
178 |
179 | # 隐藏元素的属性模式
180 | hiding_patterns = [
181 | (r'display\s*:\s*none', 'display: none'),
182 | (r'visibility\s*:\s*hidden', 'visibility: hidden'),
183 | (r'opacity\s*:\s*0', 'opacity: 0'),
184 | (r'position\s*:\s*absolute.*left\s*:\s*[-+]?\d+(?:\.\d+)?(?:px|em|%)\s*;.*top\s*:\s*[-+]?\d+(?:\.\d+)?(?:px|em|%)\s*;.*width\s*:\s*\d+px\s*;.*height\s*:\s*\d+px', 'absolute positioned tiny element'),
185 | (r'position\s*:\s*absolute.*left\s*:\s*[-+]?\d+(?:\.\d+)?(?:px|em|%)\s*;.*top\s*:\s*[-+]?\d+(?:\.\d+)?(?:px|em|%)', 'absolute positioned'),
186 | (r'overflow\s*:\s*hidden', 'overflow: hidden'),
187 |         (r'clip\s*:\s*rect\(\s*0(?:px)?\s*,?\s*0(?:px)?\s*,?\s*0(?:px)?\s*,?\s*0(?:px)?\s*\)', 'clip: rect'),
188 | (r'font-size\s*:\s*0(?:px)?', 'font-size: 0'),
189 | (r'line-height\s*:\s*0(?:px)?', 'line-height: 0'),
190 | (r'text-indent\s*:\s*[-+]?\d+(?:\.\d+)?(?:px|em|%)', 'text-indent'),
191 | (r'color\s*:\s*transparent', 'color: transparent'),
192 | (r'background-color\s*:\s*transparent', 'background-color: transparent'),
193 | (r'height\s*:\s*0(?:px)?', 'height: 0'),
194 | (r'width\s*:\s*0(?:px)?', 'width: 0'),
195 | ]
196 |
197 | try:
198 | # 移除注释
199 | css_content = remove_css_comments(css_content)
200 |
201 | # 匹配CSS规则
202 | rule_pattern = re.compile(r'([^{]+)\s*{([^}]*)}', re.DOTALL)
203 | rules = rule_pattern.finditer(css_content)
204 |
205 | for rule in rules:
206 | selector = rule.group(1).strip()
207 | body = rule.group(2)
208 | start_pos = rule.start(0)
209 | end_pos = rule.end(0)
210 |
211 | # 检查每个隐藏模式
212 | for pattern_str, hiding_type in hiding_patterns:
213 | pattern = re.compile(pattern_str, re.DOTALL | re.IGNORECASE)
214 |
215 | if pattern.search(body):
216 | hidden_rules.append({
217 | 'type': hiding_type,
218 | 'selector': selector,
219 | 'css': body.strip(),
220 | 'original_rule': rule.group(0),
221 | 'position': (start_pos, end_pos)
222 | })
223 | break # 每个规则只记录一次
224 |
225 | except Exception as e:
226 | logger.error(f"检测隐藏元素失败: {str(e)}")
227 |
228 | return hidden_rules
229 |
230 | def detect_suspicious_selectors(css_content: str) -> List[Dict[str, str]]:
231 | """
232 | 检测可疑的CSS选择器
233 |
234 | Args:
235 | css_content: CSS内容
236 | Returns:
237 | 可疑选择器列表
238 | """
239 | suspicious_selectors = []
240 |
241 | # 可疑选择器模式
242 | suspicious_patterns = [
243 | # 随机字符串类名或ID
244 | (r'\.(\w{8,})[^\w\-]', 'long_random_class'),
245 | (r'#(\w{8,})[^\w\-]', 'long_random_id'),
246 | # 连续数字类名或ID
247 | (r'\.(\d{4,})[^\w\-]', 'numeric_class'),
248 | (r'#(\d{4,})[^\w\-]', 'numeric_id'),
249 | # 特殊字符选择器
250 | (r'[\[\*\+\~\^\$\|]', 'complex_selector'),
251 | ]
252 |
253 | try:
254 | # 移除注释
255 | css_content = remove_css_comments(css_content)
256 |
257 | # 匹配CSS规则
258 | rule_pattern = re.compile(r'([^{]+)\s*{[^}]*}', re.DOTALL)
259 | rules = rule_pattern.finditer(css_content)
260 |
261 | for rule in rules:
262 | selector_text = rule.group(1).strip()
263 |
264 | # 检查每个可疑模式
265 | for pattern_str, selector_type in suspicious_patterns:
266 | pattern = re.compile(pattern_str, re.DOTALL)
267 |
268 | if pattern.search(selector_text):
269 | suspicious_selectors.append({
270 | 'type': selector_type,
271 | 'selector': selector_text,
272 | 'position': (rule.start(1), rule.end(1))
273 | })
274 | break # 每个选择器只记录一次
275 |
276 | except Exception as e:
277 | logger.error(f"检测可疑选择器失败: {str(e)}")
278 |
279 | return suspicious_selectors
280 |
281 | def remove_css_comments(css_content: str) -> str:
282 | """
283 | 移除CSS注释
284 |
285 | Args:
286 | css_content: CSS内容
287 |
288 | Returns:
289 | 移除注释后的CSS内容
290 | """
291 | try:
292 | # 移除CSS注释
293 | css_content = re.sub(r'/\*.*?\*/', '', css_content, flags=re.DOTALL)
294 | return css_content
295 | except Exception as e:
296 | logger.error(f"移除CSS注释失败: {str(e)}")
297 | return css_content
298 |
299 |
300 | def analyze_complexity(css_content: str) -> Dict[str, int]:
301 | """
302 | 分析CSS复杂度
303 |
304 | Args:
305 | css_content: CSS内容
306 |
307 | Returns:
308 | 包含复杂度指标的字典
309 | """
310 | complexity = {
311 | 'rules_count': 0,
312 | 'selectors_count': 0,
313 | 'properties_count': 0,
314 | 'imports_count': 0,
315 | 'media_queries_count': 0
316 | }
317 |
318 | try:
319 | # 移除注释
320 | css_content = remove_css_comments(css_content)
321 |
322 | # 计算规则数量
323 | rule_pattern = re.compile(r'\{[^}]*\}', re.DOTALL)
324 | complexity['rules_count'] = len(rule_pattern.findall(css_content))
325 |
326 | # 计算选择器数量
327 | selectors = extract_selectors(css_content)
328 | complexity['selectors_count'] = len(selectors)
329 |
330 | # 计算属性数量
331 | properties = extract_css_properties(css_content)
332 | complexity['properties_count'] = len(properties)
333 |
334 | # 计算导入规则数量
335 | imports = extract_import_rules(css_content)
336 | complexity['imports_count'] = len(imports)
337 |
338 | # 计算媒体查询数量
339 | media_query_pattern = re.compile(r'@media\s+[^\{]*\{[^}]*\}', re.DOTALL)
340 | complexity['media_queries_count'] = len(media_query_pattern.findall(css_content))
341 |
342 | except Exception as e:
343 | logger.error(f"分析CSS复杂度失败: {str(e)}")
344 |
345 | return complexity
346 |
347 | def extract_css_comments(css_content: str) -> List[Dict[str, str]]:
348 | """
349 | 提取CSS注释
350 |
351 | Args:
352 | css_content: CSS内容
353 |
354 | Returns:
355 | 注释列表
356 | """
357 | comments = []
358 |
359 | try:
360 | comment_pattern = re.compile(r'/\*(.*?)\*/', re.DOTALL)
361 | matches = comment_pattern.finditer(css_content)
362 |
363 | for match in matches:
364 | comment_content = match.group(1).strip()
365 | start_pos = match.start(0)
366 | end_pos = match.end(0)
367 |
368 | comments.append({
369 | 'content': comment_content,
370 | 'position': (start_pos, end_pos)
371 | })
372 |
373 | except Exception as e:
374 | logger.error(f"提取CSS注释失败: {str(e)}")
375 |
376 | return comments
377 |
378 | def analyze_css_complexity(css_content: str) -> Dict[str, int]:
379 | """
380 | 分析CSS复杂度
381 |
382 | Args:
383 | css_content: CSS内容
384 |
385 | Returns:
386 | 复杂度指标
387 | """
388 | try:
389 | # 移除注释
390 | css_content = remove_css_comments(css_content)
391 |
392 | # 计算规则数量
393 | rule_pattern = re.compile(r'[^\s\n\r]+\s*{[^}]*}', re.DOTALL)
394 | rules = rule_pattern.findall(css_content)
395 | rule_count = len(rules)
396 |
397 | # 计算选择器数量
398 | selectors = extract_selectors(css_content)
399 | selector_count = len(selectors)
400 |
401 | # 计算属性数量
402 | properties = extract_css_properties(css_content)
403 | property_count = len(properties)
404 |
405 | # 计算URL数量
406 | urls = extract_css_urls(css_content)
407 | url_count = len(urls)
408 |
409 | return {
410 | 'rule_count': rule_count,
411 | 'selector_count': selector_count,
412 | 'property_count': property_count,
413 | 'url_count': url_count,
414 | 'file_size': len(css_content),
415 | }
416 |
417 | except Exception as e:
418 | logger.error(f"分析CSS复杂度失败: {str(e)}")
419 | return {}
420 |
421 | def find_duplicate_rules(css_content: str) -> List[Dict[str, str]]:
422 | """
423 | 查找重复的CSS规则
424 |
425 | Args:
426 | css_content: CSS内容
427 |
428 | Returns:
429 | 重复规则列表
430 | """
431 | duplicate_rules = []
432 | seen_rules = {}
433 |
434 | try:
435 | # 移除注释
436 | css_content = remove_css_comments(css_content)
437 |
438 | # 匹配CSS规则
439 | rule_pattern = re.compile(r'([^{]+)\s*{([^}]*)}', re.DOTALL)
440 | rules = rule_pattern.finditer(css_content)
441 |
442 | for rule in rules:
443 | selector = rule.group(1).strip()
444 | body = rule.group(2).strip()
445 |
446 | # 使用body作为键,查找重复
447 | if body in seen_rules:
448 | duplicate_rules.append({
449 | 'selector': selector,
450 | 'duplicate_selector': seen_rules[body],
451 | 'css_body': body
452 | })
453 | else:
454 | seen_rules[body] = selector
455 |
456 | except Exception as e:
457 | logger.error(f"查找重复规则失败: {str(e)}")
458 |
459 | return duplicate_rules
460 |
--------------------------------------------------------------------------------
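A minimal sketch of how the CSS helpers above could be driven, assuming the module is importable as utils.css_utils; the sample stylesheet mirrors test.css.

from utils.css_utils import detect_hidden_elements, extract_css_urls

css = '''
.hidden { display: none; }
.banner { background-image: url("https://cdn.example.com/images/hero.jpg"); }
'''

for rule in detect_hidden_elements(css):
    print(rule['type'], '->', rule['selector'])      # display: none -> .hidden

for item in extract_css_urls(css):
    print(item['url'])                               # https://cdn.example.com/images/hero.jpg
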
/core/detector/special_hiding_detector.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | """
4 | 特殊隐藏技术检测器模块
5 | """
6 |
7 | import re
8 | import logging
9 | from typing import List, Dict
10 |
11 | logger = logging.getLogger('YuanZhao.detector.special_hiding')
12 |
13 | class SpecialHidingDetector:
14 | """特殊隐藏技术检测器"""
15 |
16 | def __init__(self, config):
17 | self.config = config
18 | self._init_patterns()
19 |
20 | def _init_patterns(self):
21 | """初始化正则表达式模式"""
22 | # 零宽字符模式
23 | self.zero_width_chars = [
24 | '\u200B', # 零宽空格
25 | '\u200C', # 零宽不连字
26 | '\u200D', # 零宽连字
27 | '\u2060', # 字连接符
28 | '\uFEFF', # 字节顺序标记
29 | ]
30 | self.zero_width_pattern = re.compile('|'.join(re.escape(c) for c in self.zero_width_chars))
31 |
32 | # 空白字符堆积
33 |         self.whitespace_pattern = re.compile(r'\s{10,}')
34 |
35 | # 颜色隐藏(颜色接近背景色)
36 | self.color_pattern = re.compile(
37 | r'color\s*:\s*(#\w{3,6}|rgba?\([^)]+\))',
38 | re.IGNORECASE
39 | )
40 | self.background_color_pattern = re.compile(
41 | r'background-color\s*:\s*(#\w{3,6}|rgba?\([^)]+\))',
42 | re.IGNORECASE
43 | )
44 |
45 | # 绝对定位隐藏(离屏元素)
46 | self.absolute_position_pattern = re.compile(
47 | r'position\s*:\s*absolute.*?(left|top|bottom|right)\s*:\s*(-?\d+(?:\.\d+)?(?:px|em|%)?)',
48 | re.IGNORECASE | re.DOTALL
49 | )
50 |
51 | # 字体大小隐藏
52 | self.font_size_pattern = re.compile(
53 |             r'font-size\s*:\s*(0\.\d+|0)',
54 | re.IGNORECASE
55 | )
56 |
57 | # 文本缩进隐藏
58 | self.text_indent_pattern = re.compile(
59 | r'text-indent\s*:\s*(-\d+(?:\.\d+)?(?:px|em|%))',
60 | re.IGNORECASE
61 | )
62 |
63 | # 透明度隐藏
64 | self.opacity_pattern = re.compile(
65 |             r'opacity\s*:\s*(0\.\d+|0)',
66 | re.IGNORECASE
67 | )
68 | self.visibility_pattern = re.compile(
69 | r'visibility\s*:\s*hidden',
70 | re.IGNORECASE
71 | )
72 | self.display_none_pattern = re.compile(
73 | r'display\s*:\s*none',
74 | re.IGNORECASE
75 | )
76 |
77 | # 多层嵌套隐藏
78 | self.nested_elements_pattern = re.compile(
79 | r'<(div|span|p|a)[^>]*>\s*<(div|span|p|a)[^>]*>\s*<(div|span|p|a)[^>]*>',
80 | re.IGNORECASE
81 | )
82 |
83 | # HTML实体编码隐藏
84 |         self.html_entity_pattern = re.compile(r'&#(\d+);|&#[xX]([0-9a-fA-F]+);')
85 |
86 | # 可疑的编码混合
87 | self.mixed_encoding_pattern = re.compile(
88 | r'https?://(?:[\w\-._~:/?#[\]@!$&\'()*+,;=]|%[0-9a-fA-F]{2})+',
89 | re.IGNORECASE
90 | )
91 |
92 | def detect(self, content: str, source: str) -> List[Dict]:
93 | """检测特殊隐藏技术"""
94 | results = []
95 |
96 | try:
97 | # 检测零宽字符
98 | zero_width_results = self._detect_zero_width_chars(content, source)
99 | results.extend(zero_width_results)
100 |
101 | # 检测空白字符堆积
102 | whitespace_results = self._detect_whitespace(content, source)
103 | results.extend(whitespace_results)
104 |
105 | # 检测颜色隐藏
106 | color_results = self._detect_color_hiding(content, source)
107 | results.extend(color_results)
108 |
109 | # 检测绝对定位隐藏
110 | position_results = self._detect_position_hiding(content, source)
111 | results.extend(position_results)
112 |
113 | # 检测字体大小隐藏
114 | font_size_results = self._detect_font_size_hiding(content, source)
115 | results.extend(font_size_results)
116 |
117 | # 检测文本缩进隐藏
118 | indent_results = self._detect_text_indent_hiding(content, source)
119 | results.extend(indent_results)
120 |
121 | # 检测透明度隐藏
122 | opacity_results = self._detect_opacity_hiding(content, source)
123 | results.extend(opacity_results)
124 |
125 | # 检测多层嵌套隐藏
126 | nested_results = self._detect_nested_elements(content, source)
127 | results.extend(nested_results)
128 |
129 | # 检测HTML实体编码隐藏
130 | entity_results = self._detect_html_entities(content, source)
131 | results.extend(entity_results)
132 |
133 | except Exception as e:
134 | logger.error(f"特殊隐藏技术检测失败: {str(e)}", exc_info=True)
135 |
136 | return results
137 |
138 | def _detect_zero_width_chars(self, content: str, source: str) -> List[Dict]:
139 | """检测零宽字符"""
140 | results = []
141 |
142 | matches = list(self.zero_width_pattern.finditer(content))
143 | if matches:
144 | # 收集所有零宽字符的上下文
145 | context = self._get_context(content, matches[0].start(), matches[-1].end(), 100)
146 |
147 | # 解码隐藏内容(如果可能)
148 | hidden_content = self._extract_hidden_content(content, self.zero_width_chars)
149 |
150 | results.append({
151 | 'link': f'零宽字符隐藏 ({len(matches)}个字符)',
152 | 'source': source,
153 | 'type': 'zero_width_hiding',
154 | 'detection_method': 'regex',
155 | 'risk_level': '高',
156 | 'context': context,
157 | 'hidden_content': hidden_content if hidden_content else None
158 | })
159 |
160 | return results
161 |
162 | def _detect_whitespace(self, content: str, source: str) -> List[Dict]:
163 | """检测空白字符堆积"""
164 | results = []
165 |
166 | for match in self.whitespace_pattern.finditer(content):
167 | # 检查是否在HTML标签之间或注释中
168 | context = self._get_context(content, match.start(), match.end(), 50)
169 |
170 | # 只有在标签之间大量空白才认为可疑
171 | if '<' not in context and '>' not in context:
172 | results.append({
173 |                     'link': f'空白字符堆积 ({len(match.group(0))}个字符)',
174 | 'source': source,
175 | 'type': 'whitespace_hiding',
176 | 'detection_method': 'regex',
177 | 'risk_level': '中',
178 | 'context': context
179 | })
180 |
181 | return results
182 |
183 | def _detect_color_hiding(self, content: str, source: str) -> List[Dict]:
184 | """检测颜色隐藏"""
185 | results = []
186 |
187 | # 找到所有颜色定义
188 | for color_match in self.color_pattern.finditer(content):
189 | color = color_match.group(1)
190 |
191 | # 在同一段落中查找背景颜色
192 | start_pos = max(0, color_match.start() - 200)
193 | end_pos = min(len(content), color_match.end() + 200)
194 | segment = content[start_pos:end_pos]
195 |
196 | bg_match = self.background_color_pattern.search(segment)
197 | if bg_match:
198 | bg_color = bg_match.group(1)
199 |
200 | # 如果颜色非常接近背景色,标记为可疑
201 | if self._colors_are_similar(color, bg_color):
202 | results.append({
203 | 'link': f'颜色隐藏 (文字:{color}, 背景:{bg_color})',
204 | 'source': source,
205 | 'type': 'color_hiding',
206 | 'detection_method': 'regex',
207 | 'risk_level': '高',
208 | 'context': self._get_context(content, color_match.start(), color_match.end())
209 | })
210 |
211 | return results
212 |
213 | def _detect_position_hiding(self, content: str, source: str) -> List[Dict]:
214 | """检测绝对定位隐藏"""
215 | results = []
216 |
217 | for match in self.absolute_position_pattern.finditer(content):
218 | direction = match.group(1).lower()
219 | value = match.group(2)
220 |
221 | # 提取数值部分
222 | num_value = float(re.search(r'([-\d.]+)', value).group(1))
223 |
224 | # 如果位置在屏幕外(非常大的负值或正值)
225 | if abs(num_value) > 1000:
226 | results.append({
227 | 'link': f'绝对定位隐藏 ({direction}:{value})',
228 | 'source': source,
229 | 'type': 'position_hiding',
230 | 'detection_method': 'regex',
231 | 'risk_level': '高',
232 | 'context': self._get_context(content, match.start(), match.end())
233 | })
234 |
235 | return results
236 |
237 | def _detect_font_size_hiding(self, content: str, source: str) -> List[Dict]:
238 | """检测字体大小隐藏"""
239 | results = []
240 |
241 | for match in self.font_size_pattern.finditer(content):
242 | size = match.group(1)
243 |
244 | results.append({
245 | 'link': f'字体大小隐藏 (size:{size})',
246 | 'source': source,
247 | 'type': 'font_size_hiding',
248 | 'detection_method': 'regex',
249 | 'risk_level': '高',
250 | 'context': self._get_context(content, match.start(), match.end())
251 | })
252 |
253 | return results
254 |
255 | def _detect_text_indent_hiding(self, content: str, source: str) -> List[Dict]:
256 | """检测文本缩进隐藏"""
257 | results = []
258 |
259 | for match in self.text_indent_pattern.finditer(content):
260 | indent = match.group(1)
261 |
262 | # 提取数值部分
263 | num_value = float(re.search(r'([-\d.]+)', indent).group(1))
264 |
265 | # 如果缩进很大(负值),可能是隐藏文本
266 | if num_value < -50:
267 | results.append({
268 | 'link': f'文本缩进隐藏 (indent:{indent})',
269 | 'source': source,
270 | 'type': 'text_indent_hiding',
271 | 'detection_method': 'regex',
272 | 'risk_level': '高',
273 | 'context': self._get_context(content, match.start(), match.end())
274 | })
275 |
276 | return results
277 |
278 | def _detect_opacity_hiding(self, content: str, source: str) -> List[Dict]:
279 | """检测透明度隐藏"""
280 | results = []
281 |
282 | # 检测opacity
283 | for match in self.opacity_pattern.finditer(content):
284 | opacity = match.group(1)
285 | results.append({
286 | 'link': f'透明度隐藏 (opacity:{opacity})',
287 | 'source': source,
288 | 'type': 'opacity_hiding',
289 | 'detection_method': 'regex',
290 | 'risk_level': '高',
291 | 'context': self._get_context(content, match.start(), match.end())
292 | })
293 |
294 | # 检测visibility:hidden
295 | for match in self.visibility_pattern.finditer(content):
296 | results.append({
297 | 'link': '可见性隐藏 (visibility:hidden)',
298 | 'source': source,
299 | 'type': 'visibility_hiding',
300 | 'detection_method': 'regex',
301 | 'risk_level': '高',
302 | 'context': self._get_context(content, match.start(), match.end())
303 | })
304 |
305 | # 检测display:none
306 | for match in self.display_none_pattern.finditer(content):
307 | results.append({
308 | 'link': '显示隐藏 (display:none)',
309 | 'source': source,
310 | 'type': 'display_hiding',
311 | 'detection_method': 'regex',
312 | 'risk_level': '高',
313 | 'context': self._get_context(content, match.start(), match.end())
314 | })
315 |
316 | return results
317 |
318 | def _detect_nested_elements(self, content: str, source: str) -> List[Dict]:
319 | """检测多层嵌套隐藏"""
320 | results = []
321 |
322 | for match in self.nested_elements_pattern.finditer(content):
323 | results.append({
324 | 'link': '多层嵌套隐藏',
325 | 'source': source,
326 | 'type': 'nested_hiding',
327 | 'detection_method': 'regex',
328 | 'risk_level': '中',
329 | 'context': self._get_context(content, match.start(), match.end())
330 | })
331 |
332 | return results
333 |
334 | def _detect_html_entities(self, content: str, source: str) -> List[Dict]:
335 | """检测HTML实体编码隐藏"""
336 | results = []
337 |
338 | # 计算HTML实体的密度
339 | entity_matches = list(self.html_entity_pattern.finditer(content))
340 |
341 | # 如果在较短的文本中有大量实体编码,可能是隐藏内容
342 | if len(entity_matches) > 10:
343 | # 尝试解码一些实体看看是否包含可疑内容
344 | sample = content[max(0, entity_matches[0].start() - 20):entity_matches[min(5, len(entity_matches)-1)].end() + 20]
345 |
346 | results.append({
347 | 'link': f'HTML实体编码隐藏 ({len(entity_matches)}个实体)',
348 | 'source': source,
349 | 'type': 'entity_hiding',
350 | 'detection_method': 'regex',
351 | 'risk_level': '中',
352 | 'context': sample
353 | })
354 |
355 | return results
356 |
357 | def _colors_are_similar(self, color1: str, color2: str) -> bool:
358 | """检查两个颜色是否相似"""
359 | # 这是一个简化的实现,实际应用中可能需要更复杂的颜色比较
360 | # 在这里我们只是检查是否完全相同或都是深色/浅色
361 |
362 | # 转换为小写以便比较
363 | color1 = color1.lower()
364 | color2 = color2.lower()
365 |
366 | # 如果完全相同,肯定是相似的
367 | if color1 == color2:
368 | return True
369 |
370 | # 检查是否都是深色(简化判断)
371 | dark_colors = ['#000', '#000000', 'black', 'rgb(0,0,0)']
372 | if color1 in dark_colors and color2 in dark_colors:
373 | return True
374 |
375 | # 检查是否都是白色
376 | white_colors = ['#fff', '#ffffff', 'white', 'rgb(255,255,255)']
377 | if color1 in white_colors and color2 in white_colors:
378 | return True
379 |
380 | return False
381 |
382 | def _extract_hidden_content(self, content: str, markers: List[str]) -> str:
383 | """从内容中提取使用特定标记隐藏的内容"""
384 | # 这个方法可以进一步扩展来提取使用零宽字符编码的隐藏内容
385 | # 目前只是一个简单的实现
386 |
387 | # 移除所有标记字符,看看是否有剩余的有意义内容
388 | clean_content = content
389 | for marker in markers:
390 | clean_content = clean_content.replace(marker, '')
391 |
392 | # 如果清理后的内容与原内容不同,返回清理后的内容(限制长度)
393 | if clean_content != content:
394 | return clean_content.strip()[:200]
395 |
396 | return None
397 |
398 | def _get_context(self, content: str, start: int, end: int, context_size: int = 50) -> str:
399 | """获取匹配内容的上下文"""
400 | start_context = max(0, start - context_size)
401 | end_context = min(len(content), end + context_size)
402 |
403 | context = content[start_context:end_context]
404 | context = context.replace('\n', ' ').replace('\r', ' ')
405 |
406 | # 移除零宽字符以便显示
407 | for char in self.zero_width_chars:
408 | context = context.replace(char, '')
409 |
410 | return context
411 |
--------------------------------------------------------------------------------
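SpecialHidingDetector only stores the config object passed to its constructor, so a bare placeholder is enough to exercise detect() in isolation; the HTML fragment below is an assumed example in the spirit of test_dark_link.html.

from types import SimpleNamespace
from core.detector.special_hiding_detector import SpecialHidingDetector

detector = SpecialHidingDetector(SimpleNamespace())
html = '<div style="position:absolute; left:-9999px;">bet365</div>'

for finding in detector.detect(html, 'test_dark_link.html'):
    # e.g. position_hiding 高 绝对定位隐藏 (left:-9999px)
    print(finding['type'], finding['risk_level'], finding['link'])
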
/utils/js_utils.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | """
4 | JavaScript处理工具模块
5 | """
6 |
7 | import re
8 | import logging
9 | from typing import List, Dict, Any
10 |
11 | logger = logging.getLogger('YuanZhao.utils.js')
12 |
13 | # 常见的可疑JavaScript模式
14 | SUSPICIOUS_PATTERNS = [
15 | # 文档写入相关
16 | r'document\.write\s*\(',
17 | r'document\.writeln\s*\(',
18 | r'document\.createElement\s*\(\s*["\']script["\']\s*\)',
19 |
20 | # DOM操作相关
21 | r'appendChild\s*\(',
22 | r'insertBefore\s*\(',
23 | r'innerHTML\s*=',
24 | r'outerHTML\s*=',
25 |
26 | # 编码解码相关
27 | r'decodeURIComponent\s*\(',
28 | r'decodeURI\s*\(',
29 | r'eval\s*\(',
30 | r'Function\s*\(',
31 | r'fromCharCode\s*\(',
32 |
33 | # URL相关
34 | r'location\.href\s*=',
35 | r'window\.location\s*=',
36 | r'location\.replace\s*\(',
37 | r'location\.assign\s*\(',
38 |
39 | # 定时器相关
40 | r'setTimeout\s*\(',
41 | r'setInterval\s*\(',
42 |
43 | # AJAX相关
44 | r'XMLHttpRequest',
45 | r'fetch\s*\(',
46 | r'axios',
47 |
48 | # 混淆相关
49 | r'\+\s*"', # 字符串拼接
50 | r'["\']\s*\+\s*["\']', # 空字符串拼接
51 | r'\[\d+\]', # 数字索引访问
52 | ]
53 |
54 | def extract_suspicious_patterns(js_content: str) -> List[Dict[str, str]]:
55 | """
56 | 提取可疑的JavaScript模式
57 |
58 | Args:
59 | js_content: JavaScript代码
60 |
61 | Returns:
62 | 可疑模式列表
63 | """
64 | suspicious_matches = []
65 |
66 | try:
67 | for pattern_str in SUSPICIOUS_PATTERNS:
68 | pattern = re.compile(pattern_str, re.IGNORECASE)
69 | matches = pattern.finditer(js_content)
70 |
71 | for match in matches:
72 | code_segment = match.group(0)
73 | start_pos = match.start(0)
74 | end_pos = match.end(0)
75 |
76 | # 获取上下文
77 | context = get_code_context(js_content, start_pos, end_pos)
78 |
79 | suspicious_matches.append({
80 | 'pattern': pattern_str,
81 | 'code_segment': code_segment,
82 | 'context': context,
83 | 'position': (start_pos, end_pos)
84 | })
85 |
86 | except Exception as e:
87 | logger.error(f"提取可疑模式失败: {str(e)}")
88 |
89 | return suspicious_matches
90 |
91 | def get_code_context(js_content: str, start_pos: int, end_pos: int, context_lines: int = 3) -> str:
92 | """
93 | 获取代码上下文
94 |
95 | Args:
96 | js_content: 完整代码
97 | start_pos: 开始位置
98 | end_pos: 结束位置
99 | context_lines: 上下文行数
100 |
101 | Returns:
102 | 包含上下文的代码段
103 | """
104 | try:
105 | # 获取行号
106 | lines = js_content.split('\n')
107 | current_line = 0
108 | char_count = 0
109 |
110 | for i, line in enumerate(lines):
111 | char_count += len(line) + 1 # +1 for newline
112 | if char_count > start_pos:
113 | current_line = i
114 | break
115 |
116 | # 获取上下文行
117 | start_line = max(0, current_line - context_lines)
118 | end_line = min(len(lines), current_line + context_lines + 1)
119 |
120 |         snippet_lines = lines[start_line:end_line]  # 避免与参数 context_lines 同名
121 | 
122 |         return '\n'.join(snippet_lines)
123 |
124 | except Exception as e:
125 | logger.error(f"获取代码上下文失败: {str(e)}")
126 | # 回退到简单的字符上下文
127 | context_start = max(0, start_pos - 100)
128 | context_end = min(len(js_content), end_pos + 100)
129 | return js_content[context_start:context_end]
130 |
131 | def detect_dynamic_urls(js_content: str) -> List[Dict[str, str]]:
132 | """
133 | 检测动态生成的URL
134 |
135 | Args:
136 | js_content: JavaScript代码
137 |
138 | Returns:
139 | 动态URL列表
140 | """
141 | dynamic_urls = []
142 |
143 | # 检测常见的URL赋值模式
144 | url_patterns = [
145 | re.compile(r'(?:href|src|url)\s*=\s*([^;\n]+);', re.DOTALL),
146 | re.compile(r'(?:location\.href|window\.location)\s*=\s*([^;\n]+);', re.DOTALL),
147 | re.compile(r'fetch\s*\(\s*([^)]+)\s*\)', re.DOTALL),
148 | re.compile(r'\.open\s*\(\s*["\'](get|post|put|delete)["\']\s*,\s*([^)]+)\s*\)', re.DOTALL),
149 | ]
150 |
151 | try:
152 | for pattern in url_patterns:
153 | matches = pattern.finditer(js_content)
154 |
155 | for match in matches:
156 | code_segment = match.group(0)
157 | start_pos = match.start(0)
158 | end_pos = match.end(0)
159 |
160 | # 判断是否包含变量或表达式
161 | if any(ch in code_segment for ch in ['+', '\'', '"', '`', '[', ']', '(', ')']):
162 | # 优先尝试从表达式中提取规范化URL常量
163 | url_const = None
164 | m_http = re.search(r'["\'`]\s*(https?://[^"\'`\s]+)\s*["\'`]', code_segment)
165 | if m_http:
166 | url_const = m_http.group(1)
167 | m_proto = re.search(r'["\'`]\s*(//[^"\'`\s]+)\s*["\'`]', code_segment)
168 | if (not url_const) and m_proto:
169 | url_const = 'https:' + m_proto.group(1)
170 | dynamic_urls.append({
171 | 'url': url_const if url_const else None,
172 | 'expression': code_segment,
173 | 'reason': '动态构建的URL',
174 | 'context': get_code_context(js_content, start_pos, end_pos),
175 | 'position': (start_pos, end_pos)
176 | })
177 |
178 | except Exception as e:
179 | logger.error(f"检测动态URL失败: {str(e)}")
180 |
181 | return dynamic_urls
182 |
183 | def detect_obfuscated_code(js_content: str) -> List[Dict[str, str]]:
184 | """
185 | 检测混淆的JavaScript代码
186 |
187 | Args:
188 | js_content: JavaScript代码
189 |
190 | Returns:
191 | 混淆代码列表
192 | """
193 | obfuscated_segments = []
194 |
195 | # 检测常见的混淆模式
196 | obfuscation_patterns = [
197 | # 大量的字符串拼接
198 | (r'("[^"\\]*(?:\\.[^"\\]*)*"\s*\+\s*){3,}', 'multiple_string_concatenation'),
199 | # 长的十六进制字符串
200 | (r'(\\x[0-9a-fA-F]{2}){10,}', 'hex_encoding'),
201 | # Unicode编码
202 | (r'(\\u[0-9a-fA-F]{4}){5,}', 'unicode_encoding'),
203 | # 数组混淆
204 | (r'(\[\s*\d+\s*\]\s*\+){3,}', 'array_obfuscation'),
205 | # eval + 字符串
206 | (r'eval\s*\(\s*["\'](?:[^"\'\\]|\\.)*["\']\s*\)', 'eval_with_string'),
207 | # 大量的变量替换
208 | (r'(var|let|const)\s+[a-z]\s*=\s*[^;]+;\s*[a-z]\s*\+\s*=[^;]+;', 'variable_replacement'),
209 | ]
210 |
211 | try:
212 | for pattern_str, obfuscation_type in obfuscation_patterns:
213 | pattern = re.compile(pattern_str, re.DOTALL)
214 | matches = pattern.finditer(js_content)
215 |
216 | for match in matches:
217 | code_segment = match.group(0)
218 | start_pos = match.start(0)
219 | end_pos = match.end(0)
220 |
221 | obfuscated_segments.append({
222 | 'type': obfuscation_type,
223 | 'code_segment': code_segment,
224 | 'context': get_code_context(js_content, start_pos, end_pos),
225 | 'position': (start_pos, end_pos)
226 | })
227 |
228 | except Exception as e:
229 | logger.error(f"检测混淆代码失败: {str(e)}")
230 |
231 | return obfuscated_segments
232 |
233 | def extract_function_calls(js_content: str, function_name: str) -> List[Dict[str, str]]:
234 | """
235 | 提取特定函数调用
236 |
237 | Args:
238 | js_content: JavaScript代码
239 | function_name: 函数名
240 |
241 | Returns:
242 | 函数调用列表
243 | """
244 | function_calls = []
245 |
246 | try:
247 | # 构建函数调用的正则表达式
248 |         pattern_str = rf'{re.escape(function_name)}\s*\(\s*([^)]*)\s*\)'  # re.escape 转义函数名中的正则特殊字符
249 | pattern = re.compile(pattern_str, re.DOTALL)
250 | matches = pattern.finditer(js_content)
251 |
252 | for match in matches:
253 | full_call = match.group(0)
254 | arguments = match.group(1)
255 | start_pos = match.start(0)
256 | end_pos = match.end(0)
257 |
258 | function_calls.append({
259 | 'function': function_name,
260 | 'arguments': arguments,
261 | 'full_call': full_call,
262 | 'context': get_code_context(js_content, start_pos, end_pos),
263 | 'position': (start_pos, end_pos)
264 | })
265 |
266 | except Exception as e:
267 | logger.error(f"提取函数调用失败: {str(e)}")
268 |
269 | return function_calls
270 |
271 | def detect_document_modification(js_content: str) -> List[Dict[str, str]]:
272 | """
273 | 检测文档修改操作
274 |
275 | Args:
276 | js_content: JavaScript代码
277 |
278 | Returns:
279 | 文档修改操作列表
280 | """
281 | modifications = []
282 |
283 | # 文档修改相关的模式
284 | modification_patterns = [
285 | (r'document\.write\s*\(', 'document.write'),
286 | (r'document\.writeln\s*\(', 'document.writeln'),
287 | (r'innerHTML\s*=', 'innerHTML assignment'),
288 | (r'outerHTML\s*=', 'outerHTML assignment'),
289 | (r'appendChild\s*\(', 'appendChild'),
290 | (r'insertBefore\s*\(', 'insertBefore'),
291 | (r'insertAdjacentHTML\s*\(', 'insertAdjacentHTML'),
292 | (r'createElement\s*\(', 'createElement'),
293 | ]
294 |
295 | try:
296 | for pattern_str, modification_type in modification_patterns:
297 | pattern = re.compile(pattern_str, re.IGNORECASE)
298 | matches = pattern.finditer(js_content)
299 |
300 | for match in matches:
301 | code_segment = match.group(0)
302 | start_pos = match.start(0)
303 | end_pos = match.end(0)
304 |
305 | target = modification_type
306 | value = code_segment
307 | modifications.append({
308 | 'action': 'modify_document',
309 | 'target': target,
310 | 'value': value,
311 | 'description': modification_type,
312 | 'context': get_code_context(js_content, start_pos, end_pos),
313 | 'position': (start_pos, end_pos)
314 | })
315 |
316 | except Exception as e:
317 | logger.error(f"检测文档修改失败: {str(e)}")
318 |
319 | return modifications
320 |
321 | def extract_variable_assignments(js_content: str, variable_name: str) -> List[Dict[str, str]]:
322 | """
323 | 提取变量赋值
324 |
325 | Args:
326 | js_content: JavaScript代码
327 | variable_name: 变量名
328 |
329 | Returns:
330 | 变量赋值列表
331 | """
332 | assignments = []
333 |
334 | try:
335 | # 构建变量赋值的正则表达式
336 |         pattern_str = rf'(?:var|let|const)?\s*{re.escape(variable_name)}\s*=\s*([^;\n]+)'  # re.escape 转义变量名中的正则特殊字符
337 | pattern = re.compile(pattern_str, re.DOTALL)
338 | matches = pattern.finditer(js_content)
339 |
340 | for match in matches:
341 | full_assignment = match.group(0)
342 | value = match.group(1)
343 | start_pos = match.start(0)
344 | end_pos = match.end(0)
345 |
346 | assignments.append({
347 | 'variable': variable_name,
348 | 'value': value,
349 | 'full_assignment': full_assignment,
350 | 'context': get_code_context(js_content, start_pos, end_pos),
351 | 'position': (start_pos, end_pos)
352 | })
353 |
354 | except Exception as e:
355 | logger.error(f"提取变量赋值失败: {str(e)}")
356 |
357 | return assignments
358 |
359 | def extract_comments(js_content: str) -> List[Dict[str, Any]]:
360 | """
361 | 提取JavaScript注释
362 |
363 | Args:
364 | js_content: JavaScript代码
365 |
366 | Returns:
367 | 注释列表
368 | """
369 | comments = []
370 |
371 | try:
372 | # 匹配单行注释
373 | single_line_pattern = re.compile(r'//(.*?)$', re.MULTILINE)
374 | single_line_matches = single_line_pattern.finditer(js_content)
375 |
376 | for match in single_line_matches:
377 | comment_content = match.group(1).strip()
378 | start_pos = match.start(0)
379 | end_pos = match.end(0)
380 |
381 | comments.append({
382 | 'type': 'single_line',
383 | 'content': comment_content,
384 | 'position': (start_pos, end_pos)
385 | })
386 |
387 | # 匹配多行注释
388 | multi_line_pattern = re.compile(r'/\*(.*?)\*/', re.DOTALL)
389 | multi_line_matches = multi_line_pattern.finditer(js_content)
390 |
391 | for match in multi_line_matches:
392 | comment_content = match.group(1).strip()
393 | start_pos = match.start(0)
394 | end_pos = match.end(0)
395 |
396 | comments.append({
397 | 'type': 'multi_line',
398 | 'content': comment_content,
399 | 'position': (start_pos, end_pos)
400 | })
401 |
402 | except Exception as e:
403 | logger.error(f"提取JavaScript注释失败: {str(e)}")
404 |
405 | return comments
406 |
407 | def strip_comments(js_content: str) -> str:
408 | """
409 | 移除JavaScript注释
410 | """
411 | try:
412 | s = js_content
413 | out = []
414 | i = 0
415 | n = len(s)
416 | in_sq = False
417 | in_dq = False
418 | in_bt = False
419 | while i < n:
420 | ch = s[i]
421 | if not in_sq and not in_dq and not in_bt and ch == '/' and i + 1 < n:
422 | nxt = s[i+1]
423 | if nxt == '/':
424 | j = i + 2
425 | while j < n and s[j] not in '\n\r':
426 | j += 1
427 | i = j
428 | continue
429 | if nxt == '*':
430 | j = i + 2
431 | while j + 1 < n and not (s[j] == '*' and s[j+1] == '/'):
432 | j += 1
433 | i = j + 2 if j + 1 < n else n
434 | continue
435 | out.append(ch)
436 | if ch == "'" and not in_dq and not in_bt:
437 | esc = i > 0 and s[i-1] == '\\'
438 | if not esc:
439 | in_sq = not in_sq
440 | elif ch == '"' and not in_sq and not in_bt:
441 | esc = i > 0 and s[i-1] == '\\'
442 | if not esc:
443 | in_dq = not in_dq
444 | elif ch == '`' and not in_sq and not in_dq:
445 | in_bt = not in_bt
446 | i += 1
447 | return ''.join(out)
448 | except Exception as e:
449 | logger.error(f"移除JavaScript注释失败: {str(e)}")
450 | return js_content
451 |
452 | # 兼容性函数,为了支持js_detector.py中的导入
453 | def identify_obfuscated_code(js_content: str) -> Dict[str, Any]:
454 | """
455 | 识别混淆代码并返回聚合信息
456 | """
457 | segments = detect_obfuscated_code(js_content)
458 | is_obf = len(segments) > 0
459 | patterns = [seg.get('type', '') for seg in segments]
460 | sample = segments[0].get('code_segment', '') if segments else ''
461 | return {
462 | 'is_obfuscated': is_obf,
463 | 'detected_patterns': patterns,
464 | 'sample': sample
465 | }
466 |
467 | ## 兼容别名已移除,请使用 detect_document_modification
468 |
469 | ## 兼容别名已移除,请使用 strip_comments
470 |
471 |
--------------------------------------------------------------------------------
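A small sketch combining the helpers above: strip comments first, then scan for suspicious constructs. The import path utils.js_utils and the sample snippet are assumptions for illustration.

from utils.js_utils import strip_comments, extract_suspicious_patterns

js = '''
// loader
var u = "https://bad.example.com/x.js";
document.write('<scr' + 'ipt src="' + u + '"></scr' + 'ipt>');
'''

cleaned = strip_comments(js)
for hit in extract_suspicious_patterns(cleaned):
    print(hit['pattern'], '=>', hit['code_segment'])
# expected hits include document\.write\s*\( and the quote-plus-quote concatenation pattern
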
/utils/network_utils.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | """
4 | 网络处理工具模块
5 | """
6 |
7 | import os
8 | import re
9 | import logging
10 | import requests
11 | import ssl
12 | from typing import Dict, List, Tuple, Optional, Any
13 | from urllib.parse import urlparse, urljoin
14 |
15 | logger = logging.getLogger('YuanZhao.utils.network')
16 |
17 | # 常见URL模式正则表达式
18 | URL_PATTERNS = [
19 | # 标准URL
20 | re.compile(r'https?://[\w\-\.]+(?:\.[\w\-]+)+[\w\-\._~:/?#[\]@!\$&\'\(\)\*\+,;=.]+', re.IGNORECASE),
21 | # 协议相对URL
22 | re.compile(r'//[\w\-\.]+(?:\.[\w\-]+)+[\w\-\._~:/?#[\]@!\$&\'\(\)\*\+,;=.]+', re.IGNORECASE),
23 | # 仅域名
24 | re.compile(r'[a-zA-Z0-9][-a-zA-Z0-9]{0,62}(\.[a-zA-Z0-9][-a-zA-Z0-9]{0,62})+\.?', re.IGNORECASE),
25 | # IP地址形式
26 | re.compile(r'\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b(?::\d{1,5})?', re.IGNORECASE),
27 | # JavaScript伪协议
28 | re.compile(r'javascript:[^\s"\'>]+', re.IGNORECASE),
29 | # data URI
30 | re.compile(r'data:[^;]+;base64,[^\s"\'>]+', re.IGNORECASE),
31 | # 相对路径
32 | re.compile(r'/[^\s"\'>]+', re.IGNORECASE),
33 | ]
34 |
35 |
36 | def normalize_url(url: str, base_url: Optional[str] = None) -> str:
37 | """
38 | 规范化URL
39 |
40 | Args:
41 | url: 原始URL
42 | base_url: 基础URL,用于解析相对路径
43 |
44 | Returns:
45 | 规范化后的URL
46 | """
47 | try:
48 | # 处理双斜杠开头的URL:优先https,或继承base_url协议
49 | if url.startswith('//'):
50 | if base_url:
51 | base_parsed = urlparse(base_url)
52 | scheme = base_parsed.scheme or 'https'
53 | return f'{scheme}:{url}'
54 | return f'https:{url}'
55 |
56 | # 处理相对路径
57 | if base_url and not (url.startswith('http://') or url.startswith('https://')):
58 | return urljoin(base_url, url)
59 |
60 | # 对于纯域名,默认添加https://
61 | parsed = urlparse(url)
62 | if not parsed.scheme:
63 | return f'https://{url}'
64 |
65 | return url
66 |
67 | except Exception as e:
68 | logger.error(f"规范化URL失败: {url}, 错误: {str(e)}")
69 | return url
70 |
71 | def get_url_type(url: str) -> str:
72 | """
73 | 获取URL类型
74 |
75 | Args:
76 | url: URL字符串
77 |
78 | Returns:
79 | URL类型
80 | """
81 | if url.startswith('http://') or url.startswith('https://'):
82 | return 'absolute'
83 | elif url.startswith('//'):
84 | return 'protocol-relative'
85 | elif url.startswith('/'):
86 | return 'root-relative'
87 | else:
88 | return 'relative'
89 |
90 | def check_url_reachability(url: str, timeout: int = 5, headers: Optional[Dict] = None) -> Tuple[bool, Optional[str]]:
91 | """
92 | 检查URL是否可达
93 |
94 | Args:
95 | url: 要检查的URL
96 | timeout: 超时时间(秒)
97 | headers: 请求头
98 |
99 | Returns:
100 | (是否可达, 状态码或错误信息)
101 | """
102 | try:
103 | if headers is None:
104 | headers = {
105 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
106 | }
107 |
108 | response = requests.head(url, timeout=timeout, headers=headers, allow_redirects=True)
109 | return response.status_code < 400, str(response.status_code)
110 |
111 | except requests.exceptions.RequestException as e:
112 | logger.warning(f"URL检查失败: {url}, 错误: {str(e)}")
113 | return False, str(e)
114 |
115 | def validate_url(url: str) -> bool:
116 | """
117 | 验证URL格式是否有效
118 |
119 | Args:
120 | url: 要验证的URL
121 |
122 | Returns:
123 | URL是否有效
124 | """
125 | try:
126 | result = urlparse(url)
127 |
128 | # 对于绝对URL,需要有scheme和netloc
129 | if url.startswith('http://') or url.startswith('https://'):
130 | return all([result.scheme, result.netloc])
131 |
132 | # 对于相对URL,返回True
133 | return True
134 |
135 | except Exception as e:
136 | logger.error(f"URL验证失败: {url}, 错误: {str(e)}")
137 | return False
138 |
139 | def get_domain(url: str) -> Optional[str]:
140 | """
141 | 从URL中提取域名
142 |
143 | Args:
144 | url: URL字符串
145 |
146 | Returns:
147 | 域名
148 | """
149 | try:
150 | parsed = urlparse(url)
151 | return parsed.netloc
152 | except Exception as e:
153 | logger.error(f"提取域名失败: {url}, 错误: {str(e)}")
154 | return None
155 |
156 | def is_external_link(url: str, base_domain: Optional[str] = None) -> bool:
157 | """
158 | 判断是否为外部链接
159 |
160 | Args:
161 | url: 要检查的URL
162 | base_domain: 基础域名
163 |
164 | Returns:
165 | 是否为外部链接
166 | """
167 | url_domain = get_domain(url)
168 | if not url_domain:
169 | return False
170 | if not base_domain:
171 | # 未提供基础域名时,尽量避免误报:只有显式协议的绝对链接视为外部
172 | return url.startswith(('http://', 'https://'))
173 | # 检查是否为同一域名或子域名
174 | # 同域或子域视为内部,其余为外部
175 | return not (url_domain == base_domain or url_domain.endswith(f'.{base_domain}'))
176 |
177 | # 兼容性函数,用于判断字符串是否为URL
178 | def is_url(text: str) -> bool:
179 | """
180 | 判断字符串是否为URL
181 |
182 | Args:
183 | text: 要检查的文本
184 |
185 | Returns:
186 | 是否为URL
187 | """
188 | try:
189 | # 首先检查是否为本地文件,如果是,直接返回False
190 | if os.path.isfile(text) or os.path.isdir(text):
191 | logger.debug(f"{text} 是本地文件或目录,不视为URL")
192 | return False
193 |
194 | # 检查是否以http://或https://开头
195 | if text.startswith(('http://', 'https://')):
196 | return True
197 |
198 | # 过滤典型代码符号,避免误判为URL
199 | if re.search(r"^(document|window|parent|this)\.[A-Za-z_]", text):
200 | return False
201 | if re.search(r"^[A-Za-z_][A-Za-z0-9_]*\s*\(", text):
202 | if not re.search(r"https?://", text):
203 |                 # 若有函数调用前缀,但内容中存在引号包裹的URL片段,则仍视为URL
204 |                 quoted = re.findall(r'"([^"]+)"|\'([^\']+)\'', text)
205 |                 candidates = [q[0] or q[1] for q in quoted]
206 |                 if not any(p.search(seg) for p in URL_PATTERNS for seg in candidates):
207 | return False
208 |
209 | # 检查是否通过URL格式验证
210 | if not validate_url(text):
211 | return False
212 |
213 | # 检查是否匹配至少一个URL模式
214 | for pattern in URL_PATTERNS:
215 | if pattern.search(text):
216 | return True
217 |
218 | return False
219 | except Exception as e:
220 | logger.error(f"URL检查失败: {text}, 错误: {str(e)}")
221 | return False
222 |
223 | # 兼容性函数,validate_url的别名
224 | def is_valid_url(url: str) -> bool:
225 | """
226 | 验证URL格式是否有效(validate_url的别名)
227 |
228 | Args:
229 | url: 要验证的URL
230 |
231 | Returns:
232 | URL是否有效
233 | """
234 | return validate_url(url)
235 |
236 | def get_url_context(text: str, position: Tuple[int, int], context_length: int = 50) -> str:
237 | """
238 | 获取URL在文本中的上下文
239 |
240 | Args:
241 | text: 原始文本
242 | position: URL在文本中的位置 (start, end)
243 | context_length: 上下文长度
244 |
245 | Returns:
246 | 包含上下文的文本
247 | """
248 | start_pos, end_pos = position
249 |
250 | # 计算上下文的起始和结束位置
251 | context_start = max(0, start_pos - context_length)
252 | context_end = min(len(text), end_pos + context_length)
253 |
254 | # 提取上下文
255 | context = text[context_start:context_end]
256 |
257 | # 添加省略号
258 | prefix = '...' if context_start > 0 else ''
259 | suffix = '...' if context_end < len(text) else ''
260 |
261 | return f"{prefix}{context}{suffix}"
262 |
263 | def build_request_session(proxy: Optional[str] = None, timeout: int = 10) -> requests.Session:
264 | """
265 | 构建请求会话
266 |
267 | Args:
268 | proxy: 代理设置
269 | timeout: 超时时间
270 |
271 | Returns:
272 | 请求会话对象
273 | """
274 | session = requests.Session()
275 |
276 | # 设置默认请求头
277 | session.headers.update({
278 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
279 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
280 | 'Accept-Language': 'zh-CN,zh;q=0.9',
281 | })
282 |
283 | # 设置代理
284 | if proxy:
285 | proxies = {
286 | 'http': proxy,
287 | 'https': proxy
288 | }
289 | session.proxies.update(proxies)
290 | logger.info(f"设置代理: {proxy}")
291 |
292 | # 配置HTTPS适配器,启用兼容旧式TLS重协商
293 | class TLSAdapter(requests.adapters.HTTPAdapter):
294 | def init_poolmanager(self, *args, **kwargs):
295 | ctx = ssl.create_default_context()
296 | try:
297 | ctx.options |= getattr(ssl, 'OP_LEGACY_SERVER_CONNECT', 0)
298 | except Exception:
299 | pass
300 | kwargs['ssl_context'] = ctx
301 | return super().init_poolmanager(*args, **kwargs)
302 | def proxy_manager_for(self, *args, **kwargs):
303 | ctx = ssl.create_default_context()
304 | try:
305 | ctx.options |= getattr(ssl, 'OP_LEGACY_SERVER_CONNECT', 0)
306 | except Exception:
307 | pass
308 | kwargs['ssl_context'] = ctx
309 | return super().proxy_manager_for(*args, **kwargs)
310 | try:
311 | session.mount('https://', TLSAdapter(max_retries=3))
312 | except Exception:
313 | pass
314 | # 超时需在请求时传递
315 |
316 | return session
317 |
318 | def fetch_url_content(url: str, session: Optional[requests.Session] = None, **kwargs) -> Optional[Tuple[str, dict]]:
319 | """
320 | 获取URL内容或本地文件内容
321 |
322 | Args:
323 | url: 要获取的URL或本地文件路径
324 | session: 请求会话对象
325 | **kwargs: 其他请求参数
326 |
327 | Returns:
328 | 元组 (内容字符串, 头部信息字典),失败时返回None
329 | """
330 | try:
331 | # 检查是否为本地文件路径
332 | if not url.startswith(('http://', 'https://')):
333 | # 尝试作为本地文件读取
334 | if os.path.isfile(url):
335 | logger.info(f"读取本地文件: {url}")
336 | with open(url, 'r', encoding='utf-8') as f:
337 | content = f.read()
338 | # 返回内容和模拟的头部信息
339 | return content, {'Content-Type': 'text/html'}
340 | else:
341 | logger.error(f"本地文件不存在: {url}")
342 | return None
343 |
344 | # 添加标准浏览器请求头以避免被反爬机制拦截
345 | default_headers = {
346 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
347 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
348 | 'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
349 | 'Accept-Encoding': 'gzip, deflate, br',
350 | 'Connection': 'keep-alive',
351 | 'Upgrade-Insecure-Requests': '1',
352 | 'Cache-Control': 'max-age=0'
353 | }
354 |
355 | # 合并默认请求头和传入的请求头
356 | headers = default_headers.copy()
357 | if 'headers' in kwargs:
358 | headers.update(kwargs['headers'])
359 | kwargs['headers'] = headers
360 |
361 | # 增加重试机制
362 |         timeout = kwargs.pop('timeout', 10)
363 | if session:
364 | response = session.get(url, timeout=timeout, **kwargs)
365 | else:
366 | # 创建临时会话以设置重试策略
367 | temp_session = requests.Session()
368 | adapter = requests.adapters.HTTPAdapter(max_retries=3)
369 | temp_session.mount('http://', adapter)
370 | try:
371 | class TLSAdapter(requests.adapters.HTTPAdapter):
372 | def init_poolmanager(self, *args, **kwargs):
373 | ctx = ssl.create_default_context()
374 | try:
375 | ctx.options |= getattr(ssl, 'OP_LEGACY_SERVER_CONNECT', 0)
376 | except Exception:
377 | pass
378 | kwargs['ssl_context'] = ctx
379 | return super().init_poolmanager(*args, **kwargs)
380 | temp_session.mount('https://', TLSAdapter(max_retries=3))
381 | except Exception:
382 | temp_session.mount('https://', adapter)
383 | response = temp_session.get(url, timeout=timeout, **kwargs)
384 |
385 | response.raise_for_status()
386 |
387 | # 尝试自动检测编码,并在失败时回退到原始字节解码
388 | enc = response.apparent_encoding or response.encoding or 'utf-8'
389 | try:
390 | response.encoding = enc
391 | text = response.text
392 | except Exception:
393 | try:
394 | text = response.content.decode(enc, errors='replace')
395 | except Exception:
396 | text = response.content.decode('utf-8', errors='replace')
397 |
398 | return text, dict(response.headers)
399 |
400 | except requests.exceptions.RequestException as e:
401 | logger.error(f"获取URL内容失败: {url}, 错误: {str(e)}")
402 | return None
403 | except Exception as e:
404 | logger.error(f"读取内容失败: {url}, 错误: {str(e)}")
405 | return None
406 |
407 | # 兼容性函数,为了支持html_detector.py中的导入
408 | def extract_domain(url: str) -> Optional[str]:
409 | """
410 | 从URL中提取域名(get_domain的别名)
411 |
412 | Args:
413 | url: URL字符串
414 |
415 | Returns:
416 | 域名
417 | """
418 | return get_domain(url)
419 |
420 | def analyze_url_risk(url: str) -> Dict[str, Any]:
421 | """
422 | 评估URL风险等级
423 | Returns: {risk_level: int, reason: str}
424 | """
425 | try:
426 | risk = 0
427 | reasons = []
428 | parsed = urlparse(url)
429 | scheme = parsed.scheme.lower()
430 | domain = parsed.netloc.lower()
431 | # 协议风险
432 | if scheme == 'javascript':
433 | risk += 5
434 | reasons.append('JavaScript协议')
435 | elif scheme == 'data':
436 | risk += 4
437 | reasons.append('Data URI')
438 | elif scheme in ('http', 'https'):
439 | risk += 1
440 | # 端口风险
441 | if parsed.port and parsed.port not in [80, 443, 8080, 8443]:
442 | risk += 2
443 | reasons.append('非标准端口')
444 | # 可疑后缀与短链服务
445 | suspicious_tlds = ['pro', 'xyz', 'pw', 'top', 'loan', 'win', 'bid', 'online']
446 | short_link_domains = ['bit.ly', 'goo.gl', 'tinyurl.com', 't.co', 'ow.ly', 'is.gd', 'adf.ly']
447 | if any(domain.endswith('.' + tld) for tld in suspicious_tlds):
448 | risk += 2
449 | reasons.append('高风险域名后缀')
450 | if any(domain.endswith(sl) or domain == sl for sl in short_link_domains):
451 | risk += 3
452 | reasons.append('短链接服务')
453 | # 路径随机性
454 | if re.search(r'/[a-zA-Z0-9]{8,}\.(?:js|php)$', parsed.path):
455 | risk += 1
456 | reasons.append('可疑随机路径')
457 | return {'risk_level': min(risk, 10), 'reason': ', '.join(reasons) or '普通URL'}
458 | except Exception as e:
459 | logger.error(f"URL风险评估失败: {url}, 错误: {str(e)}")
460 | return {'risk_level': 0, 'reason': '评估失败'}
461 |
462 | def extract_urls(text: str, context_type: Optional[str] = None) -> List[Dict[str, Any]]:
463 | """
464 | 从文本中提取所有URL
465 |
466 | Args:
467 | text: 要提取URL的文本
468 |         context_type: URL所在上下文类型(可选)
469 | Returns:
470 | 包含URL和上下文的字典列表
471 | """
472 | results = []
473 | urls_set = set() # 用于去重
474 |
475 | # 增加URL模式匹配
476 | url_patterns = [
477 | re.compile(r'(https?://[\w._~:/?#[\]@!$&\'()*+,-;=]+)', re.IGNORECASE),
478 | re.compile(r'(/[-\w./?%&=]+)', re.IGNORECASE),
479 | re.compile(r'([a-zA-Z0-9][a-zA-Z0-9-]{0,61}[a-zA-Z0-9]\.[a-zA-Z]{2,}(?:/[^\s<>"]*)?)', re.IGNORECASE),
480 | re.compile(r'(javascript:[\w./?%&=;(),\'"`-]+)', re.IGNORECASE),
481 | re.compile(r'(data:[^;]+;base64,[^\s<>"]+)', re.IGNORECASE),
482 | ]
483 |
484 | logger.info(f"开始提取URL,文本长度: {len(text)}")
485 |
486 | for i, pattern in enumerate(url_patterns):
487 | matches = pattern.finditer(text)
488 | match_count = 0
489 |
490 | for match in matches:
491 | match_count += 1
492 | url = match.group(1)
493 | start = max(0, match.start() - 50)
494 | end = min(len(text), match.end() + 50)
495 | context = text[start:end]
496 |
497 | # 清理URL
498 | url = url.strip('"\'')
499 |
500 | # 跳过空URL
501 | if not url or len(url) < 3:
502 | continue
503 |
504 | # 跳过纯数字或不包含有效字符的URL
505 | if re.match(r'^\d+$', url):
506 | continue
507 |
508 | # 基本过滤:非http且非根相对路径、非伪协议时需校验TLD
509 | if not url.lower().startswith(('http://','https://','javascript:','data:')) and not url.startswith('/'):
510 | domain_part = url.split('/', 1)[0]
511 | tld = domain_part.rsplit('.', 1)[-1].lower() if '.' in domain_part else ''
512 | allowed_tlds = {
513 | 'com','org','net','cn','cc','io','me','xyz','tk','ga','gq','ml','cf','edu','gov','mil','biz','info'
514 | }
515 | if tld not in allowed_tlds:
516 | continue
517 | # 去重
518 | if url not in urls_set:
519 | urls_set.add(url)
520 | results.append({
521 | 'url': url,
522 | 'context': context,
523 | 'position': (match.start(), match.end()),
524 | 'context_type': context_type or 'unknown'
525 | })
526 |
527 | logger.debug(f"模式 {i} 匹配到 {match_count} 个URL")
528 |
529 | logger.debug(f"共提取到 {len(results)} 个唯一URL")
530 | return results
531 | EXTRA_PATTERNS = [
532 | # 扩展的HTTP/HTTPS URL
533 | re.compile(r'https?://[\w\-\.]+(?:\.[\w\-]+)+[\w\-\._~:/?#[\]@!\$&\'\(\)\*\+,;=.]+', re.IGNORECASE),
534 | # 没有协议的域名
535 | re.compile(r'\b[\w\-\.]+(?:\.[\w\-]+)+\b(?::\d{1,5})?/[\w\-\._~:/?#[\]@!\$&\'\(\)\*\+,;=.]*', re.IGNORECASE),
536 | # JavaScript伪协议
537 | re.compile(r'javascript:[^\s"\'>]+', re.IGNORECASE),
538 | # data URI
539 | re.compile(r'data:[^;]+;base64,[^\s"\'>]+', re.IGNORECASE),
540 | # 相对路径
541 | re.compile(r'\/[^\s"\'>]+', re.IGNORECASE),
542 | ]
543 | # 模式去重:基于正则字符串与flags,避免重复匹配与性能开销
544 | _unique_patterns = []
545 | _seen = set()
546 | for _pat in URL_PATTERNS + EXTRA_PATTERNS:
547 | _key = (_pat.pattern, _pat.flags)
548 | if _key not in _seen:
549 | _seen.add(_key)
550 | _unique_patterns.append(_pat)
551 | URL_PATTERNS = _unique_patterns
552 |
--------------------------------------------------------------------------------
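
A minimal usage sketch for the helpers in utils/network_utils.py above. This is not part of the repository: the import path assumes the scanner is run from the repository root, and the sample URL and base domain are illustrative only; the fields read here mirror the dictionaries returned by extract_urls() and analyze_url_risk().

    # Hypothetical example, not part of the repository.
    from utils.network_utils import (
        extract_urls, analyze_url_risk, is_external_link,
        build_request_session, fetch_url_content,
    )

    snippet = '<a href="https://bet365-promo.xyz/r/a1b2c3d4e5f6.js">hidden</a>'
    for item in extract_urls(snippet, context_type='html'):
        risk = analyze_url_risk(item['url'])
        print(item['url'],
              risk['risk_level'],
              risk['reason'],
              is_external_link(item['url'], base_domain='example.com'))

    # Fetch a page with the retry/TLS-tolerant session and reuse the result.
    session = build_request_session(proxy=None, timeout=10)
    fetched = fetch_url_content('https://example.com', session=session)
    if fetched:
        html, headers = fetched
        print(len(html), headers.get('Content-Type'))
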
/core/detector/headless_browser_detector.py:
--------------------------------------------------------------------------------
1 | """无头浏览器检测器模块
2 |
3 | 用于通过Chrome无头浏览器检测动态生成的暗链和隐藏内容。
4 | 支持检测JavaScript动态生成的内容、DOM操作、iframe内容等。
5 | """
6 | import logging
7 | from typing import List, Dict, Any, Optional
8 | from core.config import Config
9 |
10 | class HeadlessBrowserDetector:
11 | """无头浏览器检测器类"""
12 |
13 | def __init__(self, config: Config):
14 | """初始化无头浏览器检测器
15 |
16 | Args:
17 | config: 配置对象
18 | """
19 | self.config = config
20 | self.logger = logging.getLogger(__name__)
21 | self.driver = None
22 | self._initialize_driver()
23 |
24 | def _initialize_driver(self):
25 | """初始化Chrome无头浏览器驱动"""
26 | try:
27 | # 动态导入,避免在不使用时产生依赖问题
28 | from selenium import webdriver
29 | from selenium.webdriver.chrome.options import Options
30 | from selenium.webdriver.chrome.service import Service
31 | import os
32 | driver_path = getattr(self.config, 'headless_driver_path', None)
33 | binary_path = getattr(self.config, 'headless_binary', None)
34 |
35 | # 创建Chrome选项
36 | chrome_options = Options()
37 | if binary_path:
38 | chrome_options.binary_location = binary_path
39 | chrome_options.add_argument('--headless') # 无头模式
40 | chrome_options.add_argument('--disable-gpu') # 禁用GPU加速
41 | chrome_options.add_argument('--no-sandbox') # 禁用沙箱
42 | chrome_options.add_argument('--disable-dev-shm-usage') # 解决内存问题
43 | chrome_options.add_argument('--window-size=1920,1080') # 设置窗口大小
44 | chrome_options.add_argument('--log-level=3') # 减少日志输出
45 |
46 | # 选择驱动来源:优先本地路径;否则在允许时自动下载
47 | if driver_path and os.path.exists(driver_path):
48 | service = Service(driver_path)
49 | else:
50 | if getattr(self.config, 'headless_auto_download', False):
51 | from webdriver_manager.chrome import ChromeDriverManager
52 | service = Service(ChromeDriverManager().install())
53 | else:
54 | self.logger.error("未提供本地驱动路径且未启用自动下载,跳过无头浏览器初始化")
55 | return
56 |
57 | # 创建浏览器驱动
58 | self.driver = webdriver.Chrome(service=service, options=chrome_options)
59 |
60 | # 设置超时时间
61 | self.driver.set_page_load_timeout(self.config.headless_timeout)
62 | self.driver.set_script_timeout(self.config.headless_timeout)
63 |
64 | self.logger.info("Chrome无头浏览器初始化成功")
65 |
66 | except ImportError as e:
67 | self.logger.error(f"缺少无头浏览器相关依赖: {str(e)}")
68 | self.logger.error("请安装依赖: pip install selenium webdriver-manager")
69 | except Exception as e:
70 | self.logger.error(f"无头浏览器初始化失败: {str(e)}")
71 |
72 | def close(self):
73 | """释放浏览器驱动资源"""
74 | try:
75 | if self.driver:
76 | self.driver.quit()
77 | self.driver = None
78 | self.logger.info("已释放无头浏览器驱动")
79 | except Exception as e:
80 | self.logger.error(f"释放无头浏览器驱动失败: {str(e)}")
81 |
82 | def __del__(self):
83 | try:
84 | self.close()
85 | except Exception:
86 | pass
87 |
88 |     def detect(self, url: str, content: Optional[str] = None) -> List[Dict[str, Any]]:
89 | """使用无头浏览器检测暗链
90 |
91 | Args:
92 | url: 要检测的URL
93 | content: 可选,页面内容(如果已获取)
94 |
95 | Returns:
96 | 检测结果列表
97 | """
98 | results = []
99 |
100 | if not self.driver:
101 | self.logger.error("无头浏览器未初始化,跳过检测")
102 | return results
103 |
104 | try:
105 | from selenium.webdriver.support.ui import WebDriverWait
106 | # 加载页面
107 | self.logger.info(f"无头浏览器正在加载页面: {url}")
108 | self.driver.get(url)
109 |
110 | # 等待JavaScript执行完成
111 | try:
112 | WebDriverWait(self.driver, self.config.js_wait_time).until(
113 | lambda d: d.execute_script("return document.readyState") in ("complete", "interactive")
114 | )
115 | except Exception:
116 | pass
117 | self.logger.info(f"等待页面加载/JS执行完成 (<= {self.config.js_wait_time}秒)")
118 |
119 | # 执行各项检测
120 | self.logger.info("开始执行动态链接检测")
121 | dynamic_links = self._detect_dynamic_links()
122 | results.extend(dynamic_links)
123 |
124 | self.logger.info("开始执行DOM操作检测")
125 | dom_operations = self._detect_dom_manipulations()
126 | results.extend(dom_operations)
127 |
128 | self.logger.info("开始执行iframe内容检测")
129 | iframe_content = self._detect_iframe_content()
130 | results.extend(iframe_content)
131 |
132 | self.logger.info("开始执行隐藏元素检测")
133 | hidden_elements = self._detect_hidden_elements()
134 | results.extend(hidden_elements)
135 |
136 | self.logger.info(f"无头浏览器检测完成,发现 {len(results)} 个可疑项")
137 |
138 | except Exception as e:
139 | self.logger.error(f"无头浏览器检测过程中出错: {str(e)}")
140 |
141 | return results
142 |
143 | def _detect_dynamic_links(self) -> List[Dict[str, Any]]:
144 | """检测动态生成的链接
145 |
146 | Returns:
147 | 检测到的可疑链接列表
148 | """
149 | results = []
150 |
151 | try:
152 | from selenium.webdriver.common.by import By
153 | # 获取所有链接元素
154 | links = self.driver.find_elements(By.TAG_NAME, 'a')
155 | self.logger.info(f"发现 {len(links)} 个链接元素")
156 |
157 | for link in links:
158 | try:
159 | href = link.get_attribute('href')
160 | if href:
161 | # 分析链接风险(使用现有工具类)
162 | from utils.network_utils import analyze_url_risk
163 | risk_info = analyze_url_risk(href)
164 |
165 | if risk_info['risk_level'] > 0:
166 | text = link.text.strip()[:100] # 限制文本长度
167 | results.append({
168 | 'type': 'suspicious_url',
169 | 'url': href,
170 | 'risk_level': risk_info['risk_level'],
171 | 'context': f"动态生成链接: {text}",
172 | 'detection_method': 'headless_browser',
173 | 'element': 'a',
174 | 'risk_reason': risk_info.get('reason', '未知风险')
175 | })
176 | except Exception as e:
177 | self.logger.error(f"分析动态链接时出错: {str(e)}")
178 | except Exception as e:
179 | self.logger.error(f"获取链接元素时出错: {str(e)}")
180 |
181 | return results
182 |
183 | def _detect_dom_manipulations(self) -> List[Dict[str, Any]]:
184 | """检测可疑的DOM操作
185 |
186 | Returns:
187 | 检测到的可疑DOM操作列表
188 | """
189 | results = []
190 |
191 | # 注入JavaScript以检测可疑的DOM操作
192 | monitor_script = r"""
193 |         return (function() {
194 | const suspiciousPatterns = [];
195 |
196 | // 初始化正则表达式
197 | const eval_pattern = /eval[\s]*\(/;
198 | const doc_write_pattern = /document\.write[\s]*\(/;
199 | const innerhtml_pattern = /innerHTML[\s]*=/;
200 | const base64_pattern = /base64/i;
201 | const fromCharCode_pattern = /fromCharCode/;
202 | const escape_pattern = /escape[\s]*\(/;
203 | const unescape_pattern = /unescape[\s]*\(/;
204 |
205 | // 检测可疑的JavaScript代码模式
206 | const scriptElements = document.querySelectorAll('script');
207 | scriptElements.forEach(script => {
208 | if (script.textContent) {
209 | const content = script.textContent;
210 | if (eval_pattern.test(content) ||
211 | doc_write_pattern.test(content) ||
212 | innerhtml_pattern.test(content) ||
213 | base64_pattern.test(content) ||
214 | fromCharCode_pattern.test(content) ||
215 | escape_pattern.test(content) ||
216 | unescape_pattern.test(content)) {
217 | suspiciousPatterns.push({
218 | type: 'suspicious_script',
219 | content: content.substring(0, 200) + '...',
220 | lineCount: content.split('\n').length
221 | });
222 | }
223 | }
224 | });
225 |
226 | // 检测动态创建的元素
227 | const dynamicElements = [];
228 | document.querySelectorAll('*').forEach(element => {
229 | if (element.tagName === 'SCRIPT' && element.getAttribute('src') === null &&
230 | element.textContent.length > 50) {
231 | dynamicElements.push({tag: element.tagName, type: 'inline_script'});
232 | }
233 | if (element.tagName === 'IFRAME') {
234 | dynamicElements.push({tag: element.tagName, src: element.getAttribute('src')});
235 | }
236 | });
237 |
238 | return {suspiciousPatterns, dynamicElements};
239 | })();
240 | """
241 |
242 | try:
243 | result = self.driver.execute_script(monitor_script)
244 |
245 | # 分析可疑脚本模式
246 | for pattern in result['suspiciousPatterns']:
247 | risk_level = 8 # 较高风险
248 | results.append({
249 | 'type': 'suspicious_dom_operation',
250 | 'technique': pattern['type'],
251 | 'risk_level': risk_level,
252 | 'context': f"检测到可疑脚本模式: {pattern['content']}",
253 | 'detection_method': 'headless_browser',
254 | 'risk_reason': '包含可疑JavaScript操作函数'
255 | })
256 |
257 | # 分析动态创建的元素
258 | for element in result['dynamicElements']:
259 | if element['tag'] == 'IFRAME' and element.get('src'):
260 | from utils.network_utils import analyze_url_risk
261 | risk_info = analyze_url_risk(element['src'])
262 | if risk_info['risk_level'] > 0:
263 | results.append({
264 | 'type': 'suspicious_iframe',
265 | 'url': element['src'],
266 | 'risk_level': risk_info['risk_level'],
267 | 'context': f"动态创建的iframe",
268 | 'detection_method': 'headless_browser',
269 | 'risk_reason': risk_info.get('reason', '可疑iframe')
270 | })
271 | except Exception as e:
272 | self.logger.error(f"检测DOM操作时出错: {str(e)}")
273 |
274 | return results
275 |
276 | def _detect_iframe_content(self) -> List[Dict[str, Any]]:
277 | """检测iframe中的内容
278 |
279 | Returns:
280 | 检测到的iframe中的可疑内容列表
281 | """
282 | results = []
283 |
284 | try:
285 | from selenium.webdriver.common.by import By
286 | # 获取所有iframe
287 | iframes = self.driver.find_elements(By.TAG_NAME, 'iframe')
288 | self.logger.info(f"发现 {len(iframes)} 个iframe元素")
289 |
290 | for index, iframe in enumerate(iframes):
291 | try:
292 | iframe_src = iframe.get_attribute('src')
293 | self.logger.info(f"处理iframe {index + 1}/{len(iframes)}: {iframe_src or '无src属性'}")
294 |
295 | # 分析iframe的src属性
296 | if iframe_src:
297 | from utils.network_utils import analyze_url_risk
298 | risk_info = analyze_url_risk(iframe_src)
299 |
300 | if risk_info['risk_level'] > 0:
301 | results.append({
302 | 'type': 'suspicious_iframe',
303 | 'url': iframe_src,
304 | 'risk_level': risk_info['risk_level'],
305 | 'context': f"iframe中的可疑链接",
306 | 'detection_method': 'headless_browser',
307 | 'risk_reason': risk_info.get('reason', '可疑iframe源')
308 | })
309 |
310 | # 尝试切换到iframe上下文分析内容
311 | try:
312 | self.driver.switch_to.frame(iframe)
313 |
314 | # 获取iframe中的链接
315 | iframe_links = self.driver.find_elements(By.TAG_NAME, 'a')
316 | for link in iframe_links:
317 | href = link.get_attribute('href')
318 | if href:
319 | from utils.network_utils import analyze_url_risk
320 | risk_info = analyze_url_risk(href)
321 |
322 | if risk_info['risk_level'] > 0:
323 | results.append({
324 | 'type': 'suspicious_url',
325 | 'url': href,
326 | 'risk_level': risk_info['risk_level'],
327 | 'context': f"iframe内部的可疑链接",
328 | 'detection_method': 'headless_browser',
329 | 'risk_reason': risk_info.get('reason', 'iframe内部链接风险')
330 | })
331 | except Exception as iframe_e:
332 | self.logger.error(f"分析iframe内容时出错: {str(iframe_e)}")
333 | finally:
334 | # 确保切回主文档
335 | self.driver.switch_to.default_content()
336 |
337 | except Exception as e:
338 | self.logger.error(f"处理iframe时出错: {str(e)}")
339 |
340 | except Exception as e:
341 | self.logger.error(f"获取iframe元素时出错: {str(e)}")
342 |
343 | return results
344 |
345 | def _detect_hidden_elements(self) -> List[Dict[str, Any]]:
346 | """检测视觉上隐藏的元素
347 |
348 | Returns:
349 | 检测到的隐藏元素列表
350 | """
351 | results = []
352 |
353 | # 注入JavaScript获取隐藏元素
354 | hidden_elements_script = """
355 |         return (function() {
356 | const hiddenElements = [];
357 |
358 | // 获取所有元素
359 | const allElements = document.querySelectorAll('*');
360 |
361 | allElements.forEach(element => {
362 | const style = window.getComputedStyle(element);
363 | const rect = element.getBoundingClientRect();
364 |
365 | // 检查各种隐藏技术
366 | const isHidden =
367 | style.display === 'none' ||
368 | style.visibility === 'hidden' ||
369 | style.opacity === '0' ||
370 | rect.width <= 1 ||
371 | rect.height <= 1 ||
372 | parseInt(style.fontSize) <= 0 ||
373 | element.offsetParent === null;
374 |
375 | // 检查绝对定位隐藏
376 | const isAbsPosHidden =
377 | style.position === 'absolute' &&
378 | (parseInt(style.left) < -1000 || parseInt(style.top) < -1000 ||
379 | parseInt(style.right) < -1000 || parseInt(style.bottom) < -1000);
380 |
381 | // 检查文本颜色与背景色相同
382 | const textColor = style.color;
383 | const bgColor = style.backgroundColor || style.background;
384 | const isSameColor = textColor === bgColor && textColor !== 'rgba(0, 0, 0, 0)';
385 |
386 | // 检查是否包含链接或文本
387 | const hasLinks = element.querySelector('a') !== null;
388 | const hasText = element.textContent.trim().length > 0;
389 | const hasContent = hasLinks || hasText;
390 |
391 | if ((isHidden || isAbsPosHidden || isSameColor) && hasContent) {
392 | // 获取元素中的链接(如果有)
393 | const links = [];
394 | if (hasLinks) {
395 | const linkElements = element.querySelectorAll('a');
396 | linkElements.forEach(link => {
397 | const href = link.getAttribute('href');
398 | if (href) links.push(href);
399 | });
400 | }
401 |
402 | hiddenElements.push({
403 | tagName: element.tagName,
404 | id: element.id || '无ID',
405 | classes: element.className || '无类名',
406 | hiddenBy: isSameColor ? 'color_matching' :
407 | isAbsPosHidden ? 'absolute_position' : 'visibility',
408 | content: element.textContent.trim().substring(0, 200) + '...',
409 | hasLinks: hasLinks,
410 | links: links,
411 | textColor: textColor,
412 | bgColor: bgColor
413 | });
414 | }
415 | });
416 |
417 | return hiddenElements;
418 | })();
419 | """
420 |
421 | try:
422 | hidden_elements = self.driver.execute_script(hidden_elements_script)
423 | self.logger.info(f"发现 {len(hidden_elements)} 个隐藏元素")
424 |
425 | for elem in hidden_elements:
426 | # 计算风险等级
427 | risk_level = 8 if elem['hasLinks'] else 6
428 |
429 | # 构建风险描述
430 | context = f"隐藏元素 ({elem['tagName']}): {elem['content']}"
431 | if elem['hasLinks']:
432 | context += f" 包含 {len(elem['links'])} 个链接"
433 |
434 | result_item = {
435 | 'type': 'hidden_element',
436 | 'technique': elem['hiddenBy'],
437 | 'risk_level': risk_level,
438 | 'context': context,
439 | 'detection_method': 'headless_browser',
440 | 'risk_reason': '视觉上隐藏的元素可能包含暗链'
441 | }
442 |
443 | # 如果有链接,添加链接信息
444 | if elem['hasLinks'] and elem['links']:
445 | result_item['hidden_links'] = elem['links']
446 |
447 | results.append(result_item)
448 |
449 | # 对于包含链接的隐藏元素,分别记录每个链接
450 | if elem['hasLinks'] and elem['links']:
451 | for link in elem['links']:
452 | from utils.network_utils import analyze_url_risk
453 | risk_info = analyze_url_risk(link)
454 | results.append({
455 | 'type': 'suspicious_url',
456 | 'url': link,
457 | 'risk_level': max(risk_level, risk_info['risk_level']),
458 | 'context': f"隐藏元素中的链接: {link}",
459 | 'detection_method': 'headless_browser',
460 | 'risk_reason': f"隐藏在{elem['hiddenBy']}类型的{elem['tagName']}元素中"
461 | })
462 |
463 | except Exception as e:
464 | self.logger.error(f"检测隐藏元素时出错: {str(e)}")
465 |
466 | return results
467 |
--------------------------------------------------------------------------------
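
A minimal sketch of driving the HeadlessBrowserDetector above. The Config attributes set here (headless_auto_download, headless_timeout, js_wait_time) mirror the names read in _initialize_driver() and detect(); that Config() can be constructed without arguments, and the values chosen, are assumptions for illustration only.

    # Hypothetical example, not part of the repository.
    from core.config import Config
    from core.detector.headless_browser_detector import HeadlessBrowserDetector

    config = Config()                      # assumed no-arg constructor
    config.headless_auto_download = True   # let webdriver-manager fetch ChromeDriver
    config.headless_timeout = 30           # page-load / script timeout (seconds)
    config.js_wait_time = 10               # wait budget for document.readyState

    detector = HeadlessBrowserDetector(config)
    try:
        for finding in detector.detect('https://example.com'):
            print(finding['type'], finding['risk_level'], finding.get('url', ''))
    finally:
        detector.close()
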
/core/detector/html_detector.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | """
4 | HTML检测器模块
5 | """
6 |
7 | import re
8 | from typing import List, Dict, Any
9 | from urllib.parse import urlparse
10 |
11 | from utils.html_utils import (
12 | extract_script_tags,
13 | extract_link_tags,
14 | extract_meta_tags,
15 | extract_iframe_tags,
16 | find_hidden_elements,
17 | get_dom_structure,
18 | extract_comments
19 | )
20 | from utils.network_utils import (
21 | extract_urls,
22 | is_external_link,
23 | extract_domain
24 | )
25 | from utils.common_utils import (
26 | extract_text_between_markers,
27 | get_context
28 | )
29 |
30 | class HTMLDetector:
31 | """
32 | HTML内容检测器,用于检测HTML文件中的可疑链接和隐藏元素
33 | """
34 |
35 | def __init__(self, config):
36 | """
37 | 初始化HTML检测器
38 |
39 | Args:
40 | config: 扫描配置对象
41 | """
42 | self.config = config
43 | self.logger = config.logger
44 |
45 | # 可疑HTML模式
46 | self.suspicious_patterns = {
47 | 'suspicious_attributes': re.compile(r'\bon\w+\s*=\s*["\']?javascript:', re.IGNORECASE),
48 | 'eval_inline': re.compile(r'\beval\s*\(', re.IGNORECASE),
49 | 'document_write': re.compile(r'\bdocument\.write\s*\(', re.IGNORECASE),
50 | 'base64_decode': re.compile(r'\batob\s*\(|\bfromCharCode\s*\(', re.IGNORECASE),
51 | 'data_uri': re.compile(r'data:[^;]+;base64,', re.IGNORECASE),
52 | 'remote_iframe': re.compile(r'