├── targets_test.txt
├── requirements.txt
├── test.css
├── .gitignore
├── keywords_example.txt
├── core
│   ├── config.py
│   └── detector
│       ├── keyword_detector.py
│       ├── special_hiding_detector.py
│       ├── headless_browser_detector.py
│       ├── html_detector.py
│       └── js_detector.py
├── utils
│   ├── logging_utils.py
│   ├── file_utils.py
│   ├── html_utils.py
│   ├── common_utils.py
│   ├── css_utils.py
│   ├── js_utils.py
│   └── network_utils.py
├── README.md
├── test_dark_link.html
└── YuanZhao.py
/targets_test.txt: -------------------------------------------------------------------------------- 1 | ./test_dark_link.html -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | beautifulsoup4>=4.9.3 2 | lxml>=4.6.3 3 | requests>=2.25.1 4 | selenium>=4.0.0 5 | urllib3>=1.26.7 6 | chardet>=4.0.0 7 | 8 | # 无头浏览器依赖 9 | webdriver-manager>=3.5.0 # 自动安装和管理ChromeDriver 10 | -------------------------------------------------------------------------------- /test.css: -------------------------------------------------------------------------------- 1 | /* normal css */ 2 | @import url("https://fonts.googleapis.com/css?family=Roboto"); 3 | .banner { background-image: url("https://cdn.example.com/images/hero.jpg"); } 4 | .icon { background-image: url(/assets/icon.png); } 5 | .cursor { cursor: url("https://static.example.com/cur.cur"), auto; } 6 | .hidden { display: none; } 7 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Python 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | *.so 6 | .Python 7 | 8 | # Temporary files 9 | *.swp 10 | *.swo 11 | *~ 12 | .project 13 | .settings 14 | .classpath 15 | *.launch 16 | 17 | # IDE 18 | .idea/ 19 | .vscode/ 20 | *.sublime-project 21 | *.sublime-workspace 22 | 23 | # OS 24 | .DS_Store 25 | Thumbs.db 26 | 27 | # Testing 28 | .tox/ 29 | .coverage 30 | .coverage.* 31 | .cache 32 | nosetests.xml 33 | pytest.xml 34 | 35 | # Environment 36 | .env 37 | .env.local 38 | .env.development.local 39 | .env.test.local 40 | .env.production.local 41 | 42 | # Build 43 | /build/ 44 | dist/ 45 | *.egg 46 | *.egg-info/ 47 | bug*.md 48 | 49 | # Reports and logs 50 | reports/ 51 | logs/ 52 | *.log 53 | # 调试临时文件会保存到reports目录并以时间戳命名,无需具体文件名忽略 54 | reports/* 55 | 56 | # Virtual environments 57 | venv/ 58 | env/ 59 | pyvenv/ 60 | 61 | # OS generated files 62 | .DS_Store 63 | .DS_Store? 
64 | ._* 65 | .Spotlight-V100 66 | .Trashes 67 | -------------------------------------------------------------------------------- /keywords_example.txt: -------------------------------------------------------------------------------- 1 | bet365, gambling, 9 2 | 皇冠体育, gambling, 9 3 | 火博, gambling, 9 4 | 金年会, gambling, 9 5 | 威尼斯人, gambling, 8 6 | 澳门赌场, gambling, 8 7 | 时时彩, gambling, 10 8 | 六合彩, gambling, 10 9 | 赔率, gambling, 7 10 | 投注, gambling, 8 11 | 赌博, gambling, 10 12 | 博彩, gambling, 10 13 | 赌球, gambling, 10 14 | 彩金, gambling, 9 15 | 线上赌场, gambling, 10 16 | 真人娱乐, gambling, 8 17 | 百家乐, gambling, 9 18 | 轮盘, gambling, 8 19 | 老虎机, gambling, 9 20 | 快三, gambling, 9 21 | 彩票软件, gambling, 9 22 | 体彩预测, gambling, 8 23 | 色情, porn, 9 24 | 成人, porn, 8 25 | AV, porn, 9 26 | 黄色网站, porn, 10 27 | 性爱, porn, 9 28 | 黄色, porn, 10 29 | 肉棒, porn, 10 30 | 爆乳, porn, 10 31 | 射精, porn, 10 32 | H视频, porn, 10 33 | 裸聊, porn, 9 34 | 露骨, porn, 8 35 | 情色, porn, 9 36 | 木马, malware, 10 37 | 病毒, malware, 9 38 | 勒索软件, malware, 10 39 | 挖矿, malware, 8 40 | 黑客攻击, malware, 10 41 | DDoS, malware, 10 42 | 后门, malware, 10 43 | 远程控制, malware, 9 44 | 账号密码, phishing, 8 45 | 银行登录, phishing, 10 46 | 支付验证, phishing, 9 47 | 登录, phishing, 6 48 | 账号, phishing, 6 49 | 密码, phishing, 6 50 | 支付, phishing, 8 51 | 转账, phishing, 9 52 | 银行卡, phishing, 8 53 | 验证码, phishing, 7 54 | 高利贷, other, 10 55 | 网贷, other, 7 56 | 小额贷, other, 8 57 | 民间借贷, other, 7 58 | 校园贷, other, 10 59 | 私服, other, 7 60 | 外挂, other, 8 61 | 传奇私服, other, 9 62 | 新开私服, other, 8 63 | 破解版, other, 7 64 | 黑客, other, 8 65 | 渗透测试, other, 5 66 | 漏洞扫描, other, 6 67 | 破解软件, other, 8 68 | 注册机, other, 7 69 | 激活码, other, 6 70 | 黑客工具, other, 9 71 | .cm, other, 7 72 | .tk, other, 6 73 | .ga, other, 6 74 | .ml, other, 6 75 | .tf, other, 6 76 | .gq, other, 6 77 | display:none, other, 9 78 | visibility:hidden, other, 9 79 | opacity:0, other, 8 80 | position:absolute, other, 6 81 | z-index:-1, other, 7 82 | text-indent:-9999px, other, 8 83 | document.write, other, 7 84 | eval(, other, 9 85 | setTimeout("", other, 8 86 | location.href=, other, 7 87 | window.open(, other, 6 88 | XMLHttpRequest, other, 5 89 | fetch(, other, 5 90 | 翻墙, other, 7 91 | VPN, other, 6 92 | 暴力, other, 9 93 | 血腥, other, 8 94 | 恐怖, other, 7 95 | 毒品, other, 10 96 | 大麻, other, 10 97 | 冰毒, other, 10 98 | 摇头丸, other, 10 99 | -------------------------------------------------------------------------------- /core/config.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | 配置管理模块 5 | """ 6 | 7 | class Config: 8 | """扫描配置类""" 9 | 10 | def __init__(self): 11 | # 扫描目标配置 12 | self.target_type = None # 'local_file', 'local_directory', 'internal_url', 'external_url' 13 | self.target = None 14 | self.crawl_depth = 1 15 | self.depth = self.crawl_depth # 兼容属性 16 | 17 | # 扫描模式配置 18 | self.scan_mode = 'standard' # 'fast', 'standard', 'deep' 19 | self.mode = self.scan_mode # 兼容属性 20 | self.threads = 4 21 | self.timeout = 30 22 | self.internal_timeout = 60 # 内网URL超时时间(秒) 23 | self.external_timeout = 30 # 公网URL超时时间(秒) 24 | self.proxy = None 25 | self.exclude = [] 26 | 27 | # 关键字配置 28 | self.keywords_file = None 29 | 30 | # 报告配置 31 | self.report_type = 'txt' 32 | self.report_file = None 33 | 34 | # 调试模式 35 | self.debug = False 36 | # 调试日志读取参数 37 | self.debug_log_wait_ms = 1500 38 | self.debug_log_checks = 3 39 | self.debug_log_interval_ms = 500 40 | 41 | # 日志器 42 | import logging 43 | self.logger = logging.getLogger('YuanZhao') 44 | 45 | # 无头浏览器配置 46 | 
self.use_headless_browser = False # 是否启用无头浏览器 47 | self.headless_browser = 'chrome' # 无头浏览器类型 48 | self.js_wait_time = 3 # JavaScript执行等待时间(秒) 49 | self.headless_timeout = 60 # 无头浏览器超时时间(秒) 50 | self.headless_auto_download = False # 是否自动下载驱动 51 | self.headless_driver_path = None # 本地驱动路径 52 | 53 | # 文件类型配置 54 | self.html_extensions = ['.html', '.htm', '.shtml', '.xhtml', '.php', '.asp', '.aspx', '.jsp'] 55 | self.css_extensions = ['.css', '.less', '.scss', '.sass'] 56 | self.js_extensions = ['.js', '.jsx', '.ts', '.tsx'] 57 | self.image_extensions = ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.svg', '.webp'] 58 | 59 | # 扫描配置项 60 | self.scan_html = True 61 | self.scan_js = True 62 | self.scan_css = True 63 | self.scan_comments = True 64 | self.scan_meta = True 65 | self.scan_iframe = True 66 | self.scan_dom = True 67 | self.scan_encoding = True 68 | self.scan_steganography = True 69 | self.scan_special_hiding = True 70 | self.scan_keywords = True 71 | 72 | # 根据扫描模式调整配置 73 | self._set_mode_config() 74 | # 计算当前模式下需要扫描的扩展名 75 | self.file_extensions = self.get_file_extensions_to_scan() 76 | 77 | def _set_mode_config(self): 78 | """根据扫描模式设置相应的配置""" 79 | if self.scan_mode == 'fast': 80 | # 快速模式:只进行基础扫描 81 | self.scan_html = True 82 | self.scan_js = True 83 | self.scan_css = True 84 | self.scan_comments = True 85 | self.scan_meta = True 86 | self.scan_iframe = False 87 | self.scan_dom = False 88 | self.scan_encoding = False 89 | self.scan_steganography = False 90 | self.scan_special_hiding = False 91 | self.scan_keywords = True 92 | 93 | elif self.scan_mode == 'standard': 94 | # 标准模式:进行大部分扫描 95 | self.scan_html = True 96 | self.scan_js = True 97 | self.scan_css = True 98 | self.scan_comments = True 99 | self.scan_meta = True 100 | self.scan_iframe = True 101 | self.scan_dom = True 102 | self.scan_encoding = True 103 | self.scan_steganography = False 104 | self.scan_special_hiding = True 105 | self.scan_keywords = True 106 | 107 | elif self.scan_mode == 'deep': 108 | # 深度模式:进行所有扫描 109 | self.scan_html = True 110 | self.scan_js = True 111 | self.scan_css = True 112 | self.scan_comments = True 113 | self.scan_meta = True 114 | self.scan_iframe = True 115 | self.scan_dom = True 116 | self.scan_encoding = True 117 | self.scan_steganography = True 118 | self.scan_special_hiding = True 119 | self.scan_keywords = True 120 | # 同步更新扩展名列表 121 | self.file_extensions = self.get_file_extensions_to_scan() 122 | 123 | def update_mode(self, mode): 124 | """更新扫描模式""" 125 | self.scan_mode = mode 126 | self._set_mode_config() 127 | 128 | def get_file_extensions_to_scan(self): 129 | """获取需要扫描的文件扩展名列表""" 130 | extensions = [] 131 | 132 | if self.scan_html: 133 | extensions.extend(self.html_extensions) 134 | 135 | if self.scan_js: 136 | extensions.extend(self.js_extensions) 137 | 138 | if self.scan_css: 139 | extensions.extend(self.css_extensions) 140 | 141 | return list(set(extensions)) # 去重 142 | 143 | def get_proxy_dict(self): 144 | """将代理字符串转换为requests使用的代理字典格式""" 145 | if not self.proxy: 146 | return None 147 | 148 | proxies = { 149 | 'http': self.proxy, 150 | 'https': self.proxy 151 | } 152 | return proxies 153 | 154 | def __str__(self): 155 | """返回配置的字符串表示""" 156 | return ( 157 | f"Config(" 158 | f"target_type={self.target_type}, " 159 | f"target={self.target}, " 160 | f"scan_mode={self.scan_mode}, " 161 | f"threads={self.threads}, " 162 | f"timeout={self.timeout}, " 163 | f"internal_timeout={self.internal_timeout}, " 164 | f"external_timeout={self.external_timeout}, " 165 | f"report_type={self.report_type}, " 166 
| f"report_file={self.report_file})" 167 | ) 168 | 169 | def get_config_dict(self): 170 | """返回配置的字典表示,用于日志记录""" 171 | return { 172 | 'target_type': self.target_type, 173 | 'target': self.target, 174 | 'crawl_depth': self.crawl_depth, 175 | 'scan_mode': self.scan_mode, 176 | 'threads': self.threads, 177 | 'timeout': self.timeout, 178 | 'internal_timeout': self.internal_timeout, 179 | 'external_timeout': self.external_timeout, 180 | 'proxy': '***' if self.proxy else None, 181 | 'keywords_file': self.keywords_file, 182 | 'report_type': self.report_type, 183 | 'report_file': self.report_file, 184 | 'debug': self.debug 185 | } 186 | 187 | -------------------------------------------------------------------------------- /utils/logging_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | 日志处理工具模块 5 | """ 6 | 7 | import os 8 | import logging 9 | import sys 10 | from datetime import datetime 11 | 12 | class Logger: 13 | """ 14 | 自定义日志类 15 | """ 16 | def __init__(self, name='YuanZhao', log_dir=None, level=logging.INFO, use_console=True): 17 | """ 18 | 初始化日志记录器 19 | 20 | Args: 21 | name (str): 日志名称 22 | log_dir (str): 日志文件目录 23 | level (int): 日志级别 24 | use_console (bool): 是否输出到控制台 25 | """ 26 | self.logger = logging.getLogger(name) 27 | self.logger.setLevel(level) 28 | self.logger.handlers.clear() 29 | 30 | # 创建格式化器 31 | formatter = logging.Formatter( 32 | '%(asctime)s - %(name)s - %(levelname)s - %(message)s', 33 | datefmt='%Y-%m-%d %H:%M:%S' 34 | ) 35 | 36 | # 控制台输出 37 | if use_console: 38 | console_handler = logging.StreamHandler(sys.stdout) 39 | console_handler.setLevel(level) 40 | console_handler.setFormatter(formatter) 41 | self.logger.addHandler(console_handler) 42 | 43 | # 文件输出 44 | if log_dir: 45 | os.makedirs(log_dir, exist_ok=True) 46 | timestamp = datetime.now().strftime('%Y%m%d_%H%M%S') 47 | log_file = os.path.join(log_dir, f'YuanZhao_{timestamp}.log') 48 | file_handler = logging.FileHandler(log_file, encoding='utf-8') 49 | file_handler.setLevel(level) 50 | file_handler.setFormatter(formatter) 51 | self.logger.addHandler(file_handler) 52 | 53 | def debug(self, message): 54 | """记录调试信息""" 55 | self.logger.debug(message) 56 | 57 | def info(self, message): 58 | """记录普通信息""" 59 | self.logger.info(message) 60 | 61 | def warning(self, message): 62 | """记录警告信息""" 63 | self.logger.warning(message) 64 | 65 | def error(self, message, exc_info=False): 66 | """记录错误信息""" 67 | self.logger.error(message, exc_info=exc_info) 68 | 69 | def critical(self, message, exc_info=False): 70 | """记录严重错误信息""" 71 | self.logger.critical(message, exc_info=exc_info) 72 | 73 | def setup_logging(log_dir=None, level=logging.INFO): 74 | """ 75 | 全局日志配置 76 | 77 | Args: 78 | log_dir (str): 日志文件目录 79 | level (int): 日志级别 80 | 81 | Returns: 82 | Logger: 日志记录器实例 83 | """ 84 | return Logger('YuanZhao', log_dir, level).logger 85 | 86 | def log_exception(logger, exception, message="发生异常"): 87 | """ 88 | 记录异常信息 89 | 90 | Args: 91 | logger: 日志记录器 92 | exception: 异常对象 93 | message (str): 错误消息 94 | """ 95 | logger.error(f"{message}: {str(exception)}", exc_info=True) 96 | 97 | def log_progress(logger, current, total, message="处理进度"): 98 | """ 99 | 记录进度信息 100 | 101 | Args: 102 | logger: 日志记录器 103 | current (int): 当前进度 104 | total (int): 总进度 105 | message (str): 进度消息 106 | """ 107 | if total > 0: 108 | percentage = (current / total) * 100 109 | logger.info(f"{message}: {current}/{total} ({percentage:.1f}%)") 110 | 111 | def log_scan_result(logger, file_path, 
issues): 112 | """ 113 | 记录扫描结果 114 | 115 | Args: 116 | logger: 日志记录器 117 | file_path (str): 文件路径 118 | issues (list): 发现的问题列表 119 | """ 120 | if issues: 121 | logger.warning(f"文件 {file_path} 发现 {len(issues)} 个问题") 122 | import logging as _logging 123 | if logger.level <= _logging.DEBUG: 124 | for issue in issues: 125 | logger.warning(f" - {issue}") 126 | else: 127 | # 聚合重复项,仅输出前若干项 128 | counts = {} 129 | for issue in issues: 130 | counts[issue] = counts.get(issue, 0) + 1 131 | shown = 0 132 | for text, cnt in counts.items(): 133 | logger.warning(f" - {text} x{cnt}") 134 | shown += 1 135 | if shown >= 8: 136 | break 137 | if len(counts) > shown: 138 | logger.warning(f" ... 还有 {len(counts) - shown} 项未展示(非verbose模式)") 139 | else: 140 | logger.debug(f"文件 {file_path} 未发现问题") 141 | 142 | def log_keyword_match(logger, file_path, keyword, category, weight, context): 143 | """ 144 | 记录关键字匹配信息 145 | 146 | Args: 147 | logger: 日志记录器 148 | file_path (str): 文件路径 149 | keyword (str): 匹配的关键字 150 | category (str): 关键字类别 151 | weight (int): 风险权重 152 | context (str): 上下文信息 153 | """ 154 | logger.warning( 155 | f"关键字匹配 - 文件: {file_path}, " 156 | f"关键字: {keyword}, 类别: {category}, 风险权重: {weight}\n" 157 | f"上下文: {context}" 158 | ) 159 | 160 | def log_suspicious_url(logger, file_path, url, risk_level, context): 161 | """ 162 | 记录可疑URL信息 163 | 164 | Args: 165 | logger: 日志记录器 166 | file_path (str): 文件路径 167 | url (str): 可疑URL 168 | risk_level (str): 风险等级 169 | context (str): 上下文信息 170 | """ 171 | logger.warning( 172 | f"可疑URL - 文件: {file_path}, " 173 | f"URL: {url}, 风险等级: {risk_level}\n" 174 | f"上下文: {context}" 175 | ) 176 | 177 | def log_hidden_technique(logger, file_path, technique, risk_level, context): 178 | """ 179 | 记录隐藏技术信息 180 | 181 | Args: 182 | logger: 日志记录器 183 | file_path (str): 文件路径 184 | technique (str): 隐藏技术 185 | risk_level (str): 风险等级 186 | context (str): 上下文信息 187 | """ 188 | logger.warning( 189 | f"隐藏技术 - 文件: {file_path}, " 190 | f"技术: {technique}, 风险等级: {risk_level}\n" 191 | f"上下文: {context}" 192 | ) 193 | 194 | def log_file_skipped(logger, file_path, reason): 195 | """ 196 | 记录跳过的文件信息 197 | 198 | Args: 199 | logger: 日志记录器 200 | file_path (str): 文件路径 201 | reason (str): 跳过原因 202 | """ 203 | logger.debug(f"跳过文件 {file_path}: {reason}") 204 | 205 | def log_config(logger, config_dict): 206 | """ 207 | 记录配置信息 208 | 209 | Args: 210 | logger: 日志记录器 211 | config_dict (dict): 配置字典 212 | """ 213 | logger.info("扫描配置:") 214 | for key, value in config_dict.items(): 215 | logger.info(f" {key}: {value}") 216 | 217 | def log_summary(logger, total_files, scanned_files, issues_found, scan_time): 218 | """ 219 | 记录扫描总结信息 220 | 221 | Args: 222 | logger: 日志记录器 223 | total_files (int): 文件总数 224 | scanned_files (int): 已扫描文件数 225 | issues_found (int): 发现的问题数 226 | scan_time (float): 扫描耗时(秒) 227 | """ 228 | logger.info("扫描总结:") 229 | logger.info(f" 总文件数: {total_files}") 230 | logger.info(f" 已扫描文件: {scanned_files}") 231 | logger.info(f" 发现问题: {issues_found}") 232 | logger.info(f" 扫描耗时: {scan_time:.2f} 秒") 233 | try: 234 | if scan_time > 0: 235 | logger.info(f" 平均速度: {scanned_files/scan_time:.2f} 文件/秒") 236 | else: 237 | logger.info(" 平均速度: N/A (耗时为0)") 238 | except Exception: 239 | logger.info(" 平均速度: N/A") 240 | 241 | # 根据问题数量给出警告级别 242 | if issues_found > 50: 243 | logger.critical(f"发现大量问题 ({issues_found}),建议立即检查") 244 | elif issues_found > 10: 245 | logger.error(f"发现较多问题 ({issues_found}),需要关注") 246 | elif issues_found > 0: 247 | logger.warning(f"发现少量问题 ({issues_found}),建议查看") 248 | else: 249 | logger.info("未发现明显问题") 
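# 用法示例(补充的演示代码,属于假设性用法示意,并非项目的正式入口):
# 展示本模块中 setup_logging / log_config / log_progress / log_scan_result / log_summary 的组合调用方式。
if __name__ == '__main__':
    demo_logger = setup_logging(log_dir=None, level=logging.DEBUG)
    log_config(demo_logger, {'target': './test_dark_link.html', 'scan_mode': 'standard'})
    log_progress(demo_logger, 1, 2)
    log_scan_result(demo_logger, './test_dark_link.html', ['发现隐藏链接: display:none', '发现隐藏链接: display:none'])
    log_summary(demo_logger, total_files=2, scanned_files=2, issues_found=2, scan_time=0.5)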
250 | 251 | -------------------------------------------------------------------------------- /utils/file_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | 文件处理工具模块 5 | """ 6 | 7 | import os 8 | import logging 9 | import chardet 10 | from typing import List 11 | 12 | logger = logging.getLogger('YuanZhao.utils.file') 13 | 14 | def read_file(file_path: str, max_size: int = 10 * 1024 * 1024) -> str: 15 | """ 16 | 读取文件内容,自动检测编码 17 | 18 | Args: 19 | file_path: 文件路径 20 | max_size: 最大文件大小(默认10MB) 21 | 22 | Returns: 23 | 文件内容 24 | """ 25 | try: 26 | # 检查文件大小 27 | file_size = os.path.getsize(file_path) 28 | if file_size > max_size: 29 | logger.warning(f"文件过大,将读取前{max_size/1024/1024:.1f}MB: {file_path}") 30 | 31 | # 检测文件编码 32 | with open(file_path, 'rb') as f: 33 | raw_data = f.read(min(file_size, 10000)) 34 | result = chardet.detect(raw_data) 35 | encoding = result['encoding'] or 'utf-8' 36 | 37 | # 读取文件内容 38 | with open(file_path, 'r', encoding=encoding, errors='replace') as f: 39 | content = f.read(max_size) 40 | 41 | return content 42 | 43 | except Exception as e: 44 | logger.error(f"读取文件失败: {file_path}, 错误: {str(e)}") 45 | return '' 46 | 47 | def get_files_to_scan(directory: str, extensions: List[str]) -> List[str]: 48 | """ 49 | 递归获取目录中所有指定扩展名的文件 50 | 51 | Args: 52 | directory: 目录路径 53 | extensions: 需要扫描的文件扩展名列表 54 | 55 | Returns: 56 | 文件路径列表 57 | """ 58 | files_to_scan = [] 59 | 60 | try: 61 | for root, dirs, files in os.walk(directory): 62 | # 过滤掉隐藏目录 63 | dirs[:] = [d for d in dirs if not d.startswith('.')] 64 | 65 | for file in files: 66 | # 过滤掉隐藏文件 67 | if file.startswith('.'): 68 | continue 69 | 70 | # 检查文件扩展名 71 | _, ext = os.path.splitext(file.lower()) 72 | if ext in extensions: 73 | file_path = os.path.join(root, file) 74 | files_to_scan.append(file_path) 75 | 76 | logger.info(f"找到 {len(files_to_scan)} 个需要扫描的文件") 77 | 78 | except Exception as e: 79 | logger.error(f"获取文件列表失败: {str(e)}") 80 | 81 | return files_to_scan 82 | 83 | def is_binary_file(file_path: str) -> bool: 84 | """ 85 | 检查文件是否为二进制文件 86 | 87 | Args: 88 | file_path: 文件路径 89 | 90 | Returns: 91 | 是否为二进制文件 92 | """ 93 | try: 94 | with open(file_path, 'rb') as f: 95 | chunk = f.read(1024) 96 | 97 | # 检查是否包含null字节 98 | if b'\x00' in chunk: 99 | return True 100 | 101 | # 检查非文本字符的比例 102 | text_chars = bytearray({7, 8, 9, 10, 12, 13, 27} | set(range(0x20, 0x100))) 103 | non_text = sum(1 for byte in chunk if byte not in text_chars) 104 | 105 | # 如果超过30%的字符是非文本字符,则认为是二进制文件 106 | return non_text / len(chunk) > 0.3 107 | 108 | except Exception as e: 109 | logger.error(f"检查文件类型失败: {file_path}, 错误: {str(e)}") 110 | return False 111 | 112 | def get_file_info(file_path: str) -> dict: 113 | """ 114 | 获取文件信息 115 | 116 | Args: 117 | file_path: 文件路径 118 | 119 | Returns: 120 | 文件信息字典 121 | """ 122 | try: 123 | stat_info = os.stat(file_path) 124 | 125 | info = { 126 | 'path': file_path, 127 | 'size': stat_info.st_size, 128 | 'created_time': stat_info.st_ctime, 129 | 'modified_time': stat_info.st_mtime, 130 | 'is_binary': is_binary_file(file_path) 131 | } 132 | 133 | return info 134 | 135 | except Exception as e: 136 | logger.error(f"获取文件信息失败: {file_path}, 错误: {str(e)}") 137 | return {} 138 | 139 | def ensure_directory(directory: str): 140 | """ 141 | 确保目录存在,如果不存在则创建 142 | 143 | Args: 144 | directory: 目录路径 145 | """ 146 | try: 147 | if not os.path.exists(directory): 148 | os.makedirs(directory) 149 | logger.info(f"创建目录: {directory}") 150 | except Exception as e: 
151 | logger.error(f"创建目录失败: {directory}, 错误: {str(e)}") 152 | raise 153 | 154 | def get_relative_path(file_path: str, base_directory: str) -> str: 155 | """ 156 | 获取文件相对于基础目录的路径 157 | 158 | Args: 159 | file_path: 文件路径 160 | base_directory: 基础目录 161 | 162 | Returns: 163 | 相对路径 164 | """ 165 | try: 166 | return os.path.relpath(file_path, base_directory) 167 | except Exception as e: 168 | logger.error(f"获取相对路径失败: {str(e)}") 169 | return file_path 170 | 171 | def filter_files_by_size(files: List[str], min_size: int = 0, max_size: int = None) -> List[str]: 172 | """ 173 | 根据文件大小过滤文件列表 174 | 175 | Args: 176 | files: 文件路径列表 177 | min_size: 最小文件大小(字节) 178 | max_size: 最大文件大小(字节) 179 | 180 | Returns: 181 | 过滤后的文件列表 182 | """ 183 | filtered_files = [] 184 | 185 | for file_path in files: 186 | try: 187 | file_size = os.path.getsize(file_path) 188 | 189 | if file_size < min_size: 190 | continue 191 | 192 | if max_size is not None and file_size > max_size: 193 | continue 194 | 195 | filtered_files.append(file_path) 196 | 197 | except Exception as e: 198 | logger.warning(f"获取文件大小失败: {file_path}, 错误: {str(e)}") 199 | 200 | return filtered_files 201 | 202 | def _match_exclude(path: str, exclude_patterns: List[str]) -> bool: 203 | try: 204 | import fnmatch 205 | for pattern in exclude_patterns or []: 206 | if fnmatch.fnmatch(path, pattern) or (pattern.endswith('/') and path.replace('\\','/').startswith(pattern.rstrip('/'))): 207 | return True 208 | except Exception: 209 | pass 210 | return False 211 | 212 | # 兼容性函数,为了支持scanner.py中的导入(扩展签名) 213 | def get_file_list(directory: str, recursive: bool = True, depth: int = 1, extensions: List[str] = None, exclude: List[str] = None) -> List[str]: 214 | """ 215 | 获取目录中的文件列表,支持递归、深度限制与排除模式 216 | 217 | Args: 218 | directory: 目录路径 219 | recursive: 是否递归 220 | depth: 递归深度(包含根层级) 221 | extensions: 需要扫描的文件扩展名列表 222 | exclude: 排除的文件或目录通配符列表 223 | Returns: 224 | 文件路径列表 225 | """ 226 | results: List[str] = [] 227 | try: 228 | extensions = [ext.lower() for ext in (extensions or [])] 229 | base_depth = directory.rstrip('\\/').count(os.sep) 230 | for root, dirs, files in os.walk(directory): 231 | # 处理深度 232 | current_depth = root.rstrip('\\/').count(os.sep) - base_depth 233 | if not recursive or current_depth >= depth: 234 | dirs[:] = [] 235 | # 排除目录 236 | if exclude: 237 | dirs[:] = [d for d in dirs if not _match_exclude(os.path.join(root, d), exclude)] 238 | for file in files: 239 | path = os.path.join(root, file) 240 | if exclude and _match_exclude(path, exclude): 241 | continue 242 | if file.startswith('.'): 243 | continue 244 | _, ext = os.path.splitext(file.lower()) 245 | if not extensions or ext in extensions: 246 | results.append(path) 247 | logger.info(f"找到 {len(results)} 个需要扫描的文件") 248 | except Exception as e: 249 | logger.error(f"获取文件列表失败: {str(e)}") 250 | return results 251 | 252 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 渊照 - 专业暗链扫描工具 2 | 3 | 「渊照」是一款功能强大的专业暗链扫描工具,专注于检测网站、HTML文件或目录中的隐蔽链接、隐藏元素和恶意代码。该工具能够智能识别扫描目标类型(本地文件/目录、内网URL、公网URL),并自动调整扫描策略以获得最佳效果,是安全人员进行网站安全审计和应急响应的理想工具。 4 | 5 | ## 功能特性 6 | 7 | ### 智能目标识别与处理 8 | - **多类型目标支持**:自动识别和扫描本地文件、本地目录、内网URL和公网URL 9 | - **差异化扫描策略**:根据目标类型应用最优扫描策略 10 | - **递归目录扫描**:支持可配置的扫描深度 11 | - **文件过滤机制**:支持通过通配符排除特定文件或目录 12 | 13 | ### 核心扫描能力 14 | - **多层次检测机制**:HTML代码检测、JavaScript代码分析、CSS代码检测、元标签扫描、注释内容分析 15 | - **高级威胁识别**:加密/编码链接检测、可疑域名检测、随机生成域名检测、短链接服务检测、非标准端口检测、可疑查询参数检测 16 | - 
**特殊隐藏手法检测**:CSS隐藏技术、颜色隐藏、零宽字符隐藏、字体大小隐藏等 17 | - **关键字匹配系统**:支持CSV格式自定义关键字文件,包含关键字、类别和风险权重 18 | - **智能风险评分**:基于多维度风险评估 19 | 20 | ### 无头浏览器增强检测 21 | - **动态内容捕获**:使用Chrome无头浏览器执行JavaScript并捕获动态内容 22 | - **DOM操作监控**:跟踪动态DOM修改 23 | - **iframe深度分析**:渲染和分析iframe内容 24 | - **网络请求捕获**:监控HTTP请求和重定向链 25 | 26 | ### 全面的报告系统 27 | - **多种报告格式**:文本报告(txt)、HTML报告(html)、JSON报告(json)、CSV报告(csv) 28 | - **丰富的报告内容**:扫描概览、问题详情、风险评估、上下文展示 29 | - **来源类型标注**:在可疑链接中增加 `context_type` 字段(如 `html/js/css/comments`),用于区分链接的来源场景,便于后续数据分析与过滤 30 | - **来源标签与位置**:统一输出 `source_tag`(如 `debug/normal`)与定位范围 `position (start,end)`,HTML/CSV/JSON 报告保持一致 31 | - **风险排序与阈值展示**:HTML报告对“可疑链接”按风险降序展示,并默认仅展示风险≥4的项,减少噪音;关键字匹配表支持从上下文提取可点击链接 32 | 33 | ### 灵活的配置选项 34 | - **多种扫描模式**:fast/standard/deep 35 | - **性能优化选项**:可配置并发线程数、请求超时设置、代理服务器支持 36 | - **关键词来源**:支持从 `keywords_example.txt` 或自定义 `--keyword-file` 读取,文件允许 `#` 注释行,CSV格式:`关键字,类别,风险权重` 37 | 38 | ## 安装指南 39 | 40 | ### 环境要求 41 | - Python 3.8+ 42 | 43 | ### 安装依赖 44 | ```bash 45 | pip install -r requirements.txt 46 | ``` 47 | 48 | ## 使用方法 49 | 50 | ### 查看帮助信息 51 | ```bash 52 | python YuanZhao.py --help 53 | ``` 54 | 55 | ### 完整使用案例命令 56 | 57 | #### 1. 本地文件扫描场景 58 | ```bash 59 | # 基本扫描 - 单个HTML文件 60 | python YuanZhao.py /path/to/file.html 61 | 62 | # 高级扫描 + HTML报告 63 | python YuanZhao.py /path/to/file.html -m standard -f html 64 | 65 | # 详细日志模式 66 | python YuanZhao.py /path/to/suspicious.html --verbose 67 | 68 | # 自定义输出目录 69 | python YuanZhao.py /path/to/file.html -o /custom/report/dir 70 | 71 | # 特定报告格式(JSON) 72 | python YuanZhao.py /path/to/file.html -f json 73 | ``` 74 | 75 | #### 2. 本地目录扫描场景 76 | ```bash 77 | # 默认深度扫描目录 78 | python YuanZhao.py /path/to/website 79 | 80 | # 自定义深度扫描(仅当前目录和一级子目录) 81 | python YuanZhao.py /path/to/website -d 1 82 | 83 | # 深度递归扫描 84 | python YuanZhao.py /path/to/website -d 5 85 | 86 | # 排除特定文件/目录 87 | python YuanZhao.py /path/to/website --exclude "*.jpg" "*.png" "logs/*" "vendor/" 88 | 89 | # 调整线程数(提高性能) 90 | python YuanZhao.py /path/to/website -t 16 91 | 92 | # 完整模式 + 多格式报告 93 | python YuanZhao.py /path/to/website -m deep -f html -o security_reports --threads 12 94 | ``` 95 | 96 | #### 3. 网络URL扫描场景 97 | ```bash 98 | # 基本网站扫描 99 | python YuanZhao.py https://example.com 100 | 101 | # 内网地址扫描 102 | python YuanZhao.py http://192.168.1.100 103 | 104 | # 本地开发服务器扫描 105 | python YuanZhao.py http://localhost:8080 106 | 107 | # 带路径的URL扫描 108 | python YuanZhao.py https://example.com/news/article 109 | 110 | # 设置超时时间(公网默认使用全局超时,内网未显式设置时会按较长超时) 111 | python YuanZhao.py https://example.com --timeout 60 112 | 113 | # 使用代理服务器 114 | python YuanZhao.py https://example.com --proxy http://127.0.0.1:8080 115 | 116 | # 带认证的代理 117 | python YuanZhao.py https://example.com --proxy http://username:password@proxy.example.com:8080 118 | ``` 119 | 120 | #### 4. 高级功能场景 121 | ```bash 122 | # 无头浏览器扫描(动态内容) 123 | python YuanZhao.py https://dynamic-website.com --headless 124 | 125 | # 无头浏览器 + 延长等待时间 126 | python YuanZhao.py https://heavy-js-website.com --headless --js-wait 10 127 | 128 | # 无头浏览器超时时间 129 | python YuanZhao.py https://example.com --headless --headless-timeout 120 130 | 131 | # 自定义关键字检测 132 | python YuanZhao.py /path/to/target --keyword-file custom_keywords.txt 133 | 134 | # 基础模式快速扫描 135 | python YuanZhao.py https://example.com -m fast -d 1 -t 5 136 | 137 | # 全部模式深度扫描 138 | python YuanZhao.py /path/to/important-site -m deep -d 3 -f html --verbose 139 | ``` 140 | 141 | #### 5. 
批量目标扫描(多链接/多路径) 142 | ```bash 143 | # 方式A:指定列表文件(每行一个目标:URL/文件/目录) 144 | python YuanZhao.py --target-file e:\targets.txt -m deep -f html -o reports --verbose 145 | 146 | # 方式B:直接把 .txt 作为 target 传入(同样按列表处理) 147 | python YuanZhao.py e:\targets.txt -m deep -f html -o reports --verbose 148 | 149 | # 示例列表文件内容 150 | # https://example.com 151 | # e:\webroot 152 | # e:\webroot\index.html 153 | ``` 154 | 155 | #### 6. 特定场景优化命令 156 | ```bash 157 | # 应急响应场景 158 | python YuanZhao.py /compromised/webroot -m deep -f html -o incident_response --keyword-file malware_keywords.txt --verbose 159 | 160 | # 定期安全审计 161 | python YuanZhao.py /path/to/webroot -d 3 -m standard -f json -o weekly_scan_$(date +%Y%m%d) 162 | 163 | # 新闻页面专项扫描 164 | python YuanZhao.py https://example.com/news -m deep -d 1 -t 8 --verbose 165 | 166 | # 大规模并行扫描 167 | python YuanZhao.py /large/website -d 2 -t 20 --exclude "*.zip" "*.rar" "backup/*" 168 | 169 | # 自动化集成扫描(生成JSON报告) 170 | python YuanZhao.py https://example.com -f json -o automated_scan_results --no-color 171 | ``` 172 | ### 自定义关键字文件格式 173 | ``` 174 | 关键字文件为CSV格式,每行包含三个字段: 175 | 176 | 关键字,类别,风险权重 177 | poker,gambling,8 178 | casino,gambling,9 179 | malware,malware,10 180 | phishing,phishing,9 181 | ``` 182 | 183 | 类别可选值:gambling (博彩)、porn (色情)、malware (恶意软件)、phishing (钓鱼)、other (其他) 184 | 风险权重范围:1-10(10为最高风险) 185 | 默认关键字文件:项目根目录 `keywords_example.txt`(若未指定 `--keyword-file` 将自动加载)。文件允许以 `#` 开头的注释行。 186 | 187 | ## 主要参数说明 188 | 189 | ### 基本参数 190 | - `target`: 扫描目标(文件路径、目录路径或URL)- 必需参数 191 | - `-d, --depth`: 递归扫描深度(默认:3,0表示仅扫描当前文件/目录) 192 | - `-m, --mode`: 扫描模式(fast/standard/deep,默认:deep) 193 | - `-t, --threads`: 并发线程数(默认:8) 194 | 195 | ### 报告相关参数 196 | - `-o, --output`: 报告输出目录(默认:./reports) 197 | - `-f, --format`: 报告格式(txt/html/json/csv,默认:txt) 198 | 199 | ### 网络相关参数 200 | - `--timeout`: 请求超时时间(秒,默认:30)。公网目标默认使用此值,内网目标未显式设置 `internal_timeout` 时按较长超时(约为全局超时的两倍)。 201 | - `--proxy`: 代理设置(支持带认证与不带认证的HTTP代理),示例:`http://127.0.0.1:8080` 或 `http://user:pass@host:8080` 202 | 203 | ### 高级参数 204 | - `--keyword-file`: 自定义关键字文件路径 205 | - `--target-file`: 批量目标列表文件路径(每行一个目标:URL/文件/目录) 206 | - `--exclude`: 排除的文件或目录 207 | - `--verbose`: 显示详细日志信息 208 | - `--no-color`: 禁用彩色输出(适用于自动化脚本) 209 | 210 | ### 无头浏览器参数 211 | - `--headless`: 启用无头浏览器扫描 212 | - `--browser-type`: 无头浏览器类型(支持: chrome,默认: chrome) 213 | - `--js-wait`: JavaScript执行等待时间(秒,默认: 3) 214 | - `--headless-timeout`: 无头浏览器超时时间(秒,默认: 60) 215 | - `--headless-binary`: Chrome二进制路径(例如:`C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe`) 216 | - `--headless-driver`: ChromeDriver路径(例如:`C:\\drivers\\chromedriver.exe`) 217 | 218 | ## 常见问题解答 219 | 220 | **Q: 扫描结果中的误报如何处理?** 221 | A: 可通过以下方式降低噪音: 222 | - 使用自定义关键字文件调整权重 223 | - 利用报告的风险阈值(HTML默认展示风险≥4)聚焦高风险项 224 | - 依赖优化后的CSS检测逻辑与可信CDN白名单,避免将正常资源识别为可疑 225 | 226 | **Q: 如何提高大型网站的扫描效率?** 227 | A: 增加线程数、设置合理的爬取深度,或先使用基础模式(`fast`)进行初步筛选。对于公网网站,建议控制扫描范围。 228 | 229 | **Q: 为什么有些动态生成的链接没被检测到?** 230 | A: 启用无头浏览器模式`--headless`并适当增加JavaScript执行等待时间`--js-wait`。 231 | 232 | **Q: 使用无头浏览器时需要注意什么?** 233 | A: 使用无头浏览器会增加资源消耗和时间,建议适当降低线程数,为复杂页面增加等待时间,仅在必要时启用。 234 | 235 | ## 项目结构 236 | 237 | ``` 238 | YuanZhao/ 239 | ├── YuanZhao.py # 主程序入口 240 | ├── requirements.txt # 依赖列表 241 | ├── README.md # 项目说明 242 | ├── core/ # 核心模块 243 | │ ├── scanner.py # 扫描引擎 244 | │ ├── detector/ # 各类检测器 245 | │ ├── reporter.py # 报告生成器 246 | │ └── config.py # 配置管理 247 | ├── utils/ # 工具类 248 | └── keywords_example.txt # 关键字示例文件 249 | ``` 250 | 251 | ## 许可证与免责声明 252 | 253 | 本工具仅供安全测试和应急响应使用,请确保您有足够的授权对目标进行扫描,避免对未经授权的系统进行测试。 254 | 255 | ## 
开发者提示(工具接口) 256 | - CSS工具统一正式接口:`extract_css_properties/remove_css_comments/extract_css_comments` 257 | 258 | 259 | ## 开发者选项(日志与报告) 260 | - `debug_log_wait_ms`:调试读取日志的初始等待时间(毫秒),默认 1500 261 | - `debug_log_checks`:日志稳定性检查次数,默认 3 262 | - `debug_log_interval_ms`:每次稳定性检查的间隔(毫秒),默认 500 263 | - 提取统计日志级别:常规运行为 `debug`(匹配数与总提取数),在 `--verbose` 场景下查看更详细日志 264 | - 报告来源字段:`context_type`(html/js/css/comments)与 `source_tag`(debug/normal)用于区分来源与路径 265 | - 关键字匹配报告:支持从上下文自动提取 `http(s)` 链接并渲染为可点击链接 266 | - HTML报告“可疑链接详情”默认按风险降序并过滤低风险项(≥4显示) 267 | ## Star History 268 | 269 | 270 | 271 | 272 | 273 | Star History Chart 274 | 275 | 276 | -------------------------------------------------------------------------------- /test_dark_link.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 暗链测试页面 - 无头浏览器检测测试 8 | 62 | 161 | 162 | 163 |

正常网站内容

164 | 165 |
166 |

欢迎访问我们的网站

167 |

这是一个正常的网站内容段落,介绍网站的主要功能和服务。

168 | 正常链接 169 |
170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 |
183 | 隐藏的博彩链接 184 |
185 | 186 |
187 | 隐藏的色情链接 188 |
189 | 190 |
191 | 赌博网站推荐:http://bet365-clone.gq 192 |
193 | 194 |
195 | 私服游戏下载:http://game-private-server.tk 196 |
197 | 198 |
199 | 黑客工具:http://hacker-tools.ml 200 |
201 | 202 |
203 | 时时彩预测:http://lottery-predict.cm 204 |
205 | 206 |
207 | 色情视频:http://adult-videos.ga 208 |
209 | 210 |
211 | 高利贷服务:http://loan-shark.tf 212 |
213 | 214 | 215 | 216 | 217 | 218 |

正常文本http://hidden-evil-site.tk继续正常文本

219 | 220 | 221 |

大量空格后的隐藏内容 http://hidden-link-using-spaces.ga

222 | 223 | 224 |
225 |
226 |
227 | 多层嵌套隐藏的链接 228 |
229 |
230 |
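<!-- 示意补充(假设性示例,仅用于说明 README 所述的几类特殊隐藏手法,链接均为占位的 example.invalid 域名,非真实地址):
<div style="display:none"><a href="http://bad.example.invalid">display:none 隐藏链接</a></div>
<div style="visibility:hidden"><a href="http://bad.example.invalid">visibility:hidden 隐藏链接</a></div>
<div style="position:absolute;left:-9999px"><a href="http://bad.example.invalid">绝对定位移出可视区域</a></div>
<span style="font-size:0"><a href="http://bad.example.invalid">字体大小为 0 的隐藏链接</a></span>
正常文字&#8203;<a href="http://bad.example.invalid">零宽字符包裹的隐藏链接</a>&#8203;正常文字
-->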
231 | 232 | 249 | 250 | 256 | 257 | 258 | -------------------------------------------------------------------------------- /core/detector/keyword_detector.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | 关键字检测器模块 5 | """ 6 | 7 | import re 8 | import logging 9 | from typing import List, Dict, Tuple 10 | import chardet 11 | 12 | logger = logging.getLogger('YuanZhao.detector.keyword') 13 | 14 | class KeywordDetector: 15 | """关键字检测器""" 16 | 17 | def __init__(self, config): 18 | self.config = config 19 | self.keywords = [] # 存储关键字列表 [(keyword, category, weight), ...] 20 | self.keyword_patterns = [] # 编译后的正则表达式模式列表 21 | 22 | def load_keywords(self, keywords_file: str) -> bool: 23 | """从文件加载关键字""" 24 | try: 25 | # 检测文件编码 26 | with open(keywords_file, 'rb') as f: 27 | raw_data = f.read(10000) 28 | result = chardet.detect(raw_data) 29 | encoding = result['encoding'] or 'utf-8' 30 | 31 | # 读取关键字文件 32 | with open(keywords_file, 'r', encoding=encoding) as f: 33 | import csv 34 | reader = csv.reader(f) 35 | for line_num, parts in enumerate(reader, 1): 36 | # 去除空行 37 | if not parts or all((p.strip() == '' for p in parts)): 38 | continue 39 | # 忽略注释行 40 | if parts and parts[0].strip().startswith('#'): 41 | continue 42 | if len(parts) < 3: 43 | logger.warning(f"关键字文件第{line_num}行格式错误,跳过: {parts}") 44 | continue 45 | keyword = parts[0].strip() 46 | category = parts[1].strip() 47 | # 验证风险权重 48 | try: 49 | weight = int(parts[2].strip()) 50 | if not 1 <= weight <= 10: 51 | logger.warning(f"关键字文件第{line_num}行风险权重超出范围(1-10),使用默认值5: {parts}") 52 | weight = 5 53 | except Exception: 54 | logger.warning(f"关键字文件第{line_num}行风险权重不是数字,使用默认值5: {parts}") 55 | weight = 5 56 | valid_categories = ['gambling', 'porn', 'malware', 'phishing', 'other'] 57 | if category not in valid_categories: 58 | logger.warning(f"关键字文件第{line_num}行类别无效,使用默认类别other: {parts}") 59 | category = 'other' 60 | self.keywords.append((keyword, category, weight)) 61 | 62 | # 编译正则表达式模式 63 | self._compile_keyword_patterns() 64 | 65 | logger.info(f"成功加载 {len(self.keywords)} 个关键字") 66 | return True 67 | 68 | except Exception as e: 69 | logger.error(f"加载关键字文件失败: {str(e)}", exc_info=True) 70 | # 如果加载失败,使用内置的默认关键字 71 | self._load_default_keywords() 72 | return False 73 | 74 | def _load_default_keywords(self): 75 | """默认从项目根目录读取 keywords_example.txt""" 76 | import os 77 | try: 78 | root = os.getcwd() 79 | path = os.path.join(root, 'keywords_example.txt') 80 | if os.path.exists(path): 81 | self.load_keywords(path) 82 | return 83 | logger.warning("未找到默认关键字文件 keywords_example.txt,关键字功能将受限") 84 | self.keywords = [] 85 | self.keyword_patterns = [] 86 | except Exception as e: 87 | logger.error(f"加载默认关键字失败: {str(e)}") 88 | self.keywords = [] 89 | self.keyword_patterns = [] 90 | 91 | def _compile_keyword_patterns(self): 92 | """编译关键字正则表达式模式""" 93 | self.keyword_patterns = [] 94 | 95 | for keyword, category, weight in self.keywords: 96 | if keyword.isascii() and re.fullmatch(r'[A-Za-z]+', keyword) and len(keyword) <= 2: 97 | pattern = re.compile(r'\b' + re.escape(keyword) + r'\b', re.IGNORECASE) 98 | else: 99 | pattern = re.compile(re.escape(keyword), re.IGNORECASE) 100 | self.keyword_patterns.append((pattern, keyword, category, weight)) 101 | 102 | def detect(self, content: str, source: str) -> List[Dict]: 103 | """检测内容中的关键字匹配""" 104 | results = [] 105 | 106 | # 如果没有加载关键字,使用默认关键字 107 | if not self.keywords: 108 | self._load_default_keywords() 109 | 110 | try: 111 | # 对每个关键字模式进行匹配 112 | for 
pattern, original_keyword, category, weight in self.keyword_patterns: 113 | for match in pattern.finditer(content): 114 | # 获取匹配上下文 115 | context = self._get_context(content, match.start(), match.end()) 116 | 117 | # 构建结果 118 | result = { 119 | 'keyword': original_keyword, 120 | 'category': self._get_category_name(category), 121 | 'weight': weight, 122 | 'source': source, 123 | 'context': context, 124 | 'match_position': match.start() 125 | } 126 | 127 | # 避免重复添加相同位置的匹配 128 | if not self._is_duplicate_match(results, result): 129 | results.append(result) 130 | 131 | # 按风险权重排序 132 | results.sort(key=lambda x: x['weight'], reverse=True) 133 | 134 | except Exception as e: 135 | logger.error(f"关键字检测失败: {str(e)}", exc_info=True) 136 | 137 | return results 138 | 139 | def _get_category_name(self, category: str) -> str: 140 | """获取类别的中文名称""" 141 | category_names = { 142 | 'gambling': '博彩', 143 | 'porn': '色情', 144 | 'malware': '恶意软件', 145 | 'phishing': '钓鱼', 146 | 'other': '其他' 147 | } 148 | 149 | return category_names.get(category, '其他') 150 | 151 | def _get_context(self, content: str, start: int, end: int, context_size: int = 50) -> str: 152 | """获取匹配内容的上下文""" 153 | start_context = max(0, start - context_size) 154 | end_context = min(len(content), end + context_size) 155 | 156 | context = content[start_context:end_context] 157 | context = context.replace('\n', ' ').replace('\r', ' ') 158 | 159 | # 截断过长的上下文 160 | if len(context) > 200: 161 | context = context[:100] + '...' + context[-100:] 162 | 163 | return context 164 | 165 | def _is_duplicate_match(self, existing_results: List[Dict], new_result: Dict) -> bool: 166 | """检查是否为重复的匹配""" 167 | # 检查是否在相同位置附近有相同关键字的匹配 168 | position = new_result['match_position'] 169 | keyword = new_result['keyword'] 170 | source = new_result['source'] 171 | 172 | for result in existing_results: 173 | if (result['keyword'] == keyword and 174 | result['source'] == source and 175 | abs(result['match_position'] - position) < 10): 176 | return True 177 | 178 | return False 179 | 180 | def get_keyword_statistics(self) -> Dict: 181 | """获取关键字统计信息""" 182 | stats = { 183 | 'total_keywords': len(self.keywords), 184 | 'by_category': {} 185 | } 186 | 187 | # 按类别统计 188 | for _, category, _ in self.keywords: 189 | category_name = self._get_category_name(category) 190 | if category_name not in stats['by_category']: 191 | stats['by_category'][category_name] = 0 192 | stats['by_category'][category_name] += 1 193 | 194 | return stats 195 | 196 | def add_keyword(self, keyword: str, category: str = 'other', weight: int = 5): 197 | """动态添加关键字""" 198 | # 验证参数 199 | if not keyword or not keyword.strip(): 200 | logger.warning("尝试添加空关键字,跳过") 201 | return False 202 | 203 | weight = max(1, min(10, weight)) # 限制在1-10范围内 204 | 205 | valid_categories = ['gambling', 'porn', 'malware', 'phishing', 'other'] 206 | if category not in valid_categories: 207 | category = 'other' 208 | 209 | # 检查是否已存在 210 | for existing_keyword, _, _ in self.keywords: 211 | if existing_keyword == keyword: 212 | logger.warning(f"关键字 '{keyword}' 已存在") 213 | return False 214 | 215 | # 添加关键字 216 | self.keywords.append((keyword, category, weight)) 217 | 218 | # 编译新的模式 219 | if keyword.isascii() and re.fullmatch(r'[A-Za-z]+', keyword) and len(keyword) <= 2: 220 | pattern = re.compile(r'\b' + re.escape(keyword) + r'\b', re.IGNORECASE) 221 | else: 222 | pattern = re.compile(re.escape(keyword), re.IGNORECASE) 223 | self.keyword_patterns.append((pattern, keyword, category, weight)) 224 | 225 | logger.info(f"成功添加关键字: {keyword} (类别: 
{category}, 权重: {weight})") 226 | return True 227 | 228 | def clear_keywords(self): 229 | """清空所有关键字""" 230 | self.keywords = [] 231 | self.keyword_patterns = [] 232 | logger.info("已清空所有关键字") 233 | 234 | -------------------------------------------------------------------------------- /utils/html_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | HTML处理工具模块 5 | """ 6 | 7 | import re 8 | import logging 9 | from typing import List, Dict, Optional 10 | from bs4 import BeautifulSoup, Comment 11 | 12 | logger = logging.getLogger('YuanZhao.utils.html') 13 | 14 | def clean_html(html_content: str) -> str: 15 | """ 16 | 清理HTML内容,去除空白字符等 17 | 18 | Args: 19 | html_content: HTML内容 20 | 21 | Returns: 22 | 清理后的HTML内容 23 | """ 24 | try: 25 | # 移除多余的空白字符 26 | html_content = re.sub(r'\s+', ' ', html_content) 27 | # 移除首尾空白 28 | html_content = html_content.strip() 29 | return html_content 30 | except Exception as e: 31 | logger.error(f"清理HTML失败: {str(e)}") 32 | return html_content 33 | 34 | def extract_html_comments(html_content: str) -> List[Dict[str, str]]: 35 | """ 36 | 提取HTML注释 37 | 38 | Args: 39 | html_content: HTML内容 40 | 41 | Returns: 42 | 注释列表,每项包含注释内容和位置 43 | """ 44 | comments = [] 45 | 46 | try: 47 | # 使用正则表达式提取注释 48 | comment_pattern = re.compile(r'', re.DOTALL) 49 | matches = comment_pattern.finditer(html_content) 50 | 51 | for match in matches: 52 | comment_content = match.group(1) 53 | start_pos = match.start(0) 54 | end_pos = match.end(0) 55 | 56 | comments.append({ 57 | 'content': comment_content.strip(), 58 | 'position': (start_pos, end_pos) 59 | }) 60 | 61 | except Exception as e: 62 | logger.error(f"提取HTML注释失败: {str(e)}") 63 | 64 | return comments 65 | 66 | def extract_script_tags(html_content: str) -> List[Dict[str, str]]: 67 | """ 68 | 提取HTML中的script标签 69 | 70 | Args: 71 | html_content: HTML内容 72 | 73 | Returns: 74 | script标签列表 75 | """ 76 | scripts = [] 77 | 78 | try: 79 | soup = BeautifulSoup(html_content, 'lxml') 80 | script_tags = soup.find_all('script') 81 | 82 | for script in script_tags: 83 | script_info = { 84 | 'src': script.get('src', ''), 85 | 'content': script.string or '', 86 | 'type': script.get('type', ''), 87 | 'language': script.get('language', '') 88 | } 89 | 90 | # 获取script标签的原始字符串 91 | if script: # 确保script不为None 92 | script_info['original_tag'] = str(script) 93 | else: 94 | script_info['original_tag'] = '' 95 | 96 | scripts.append(script_info) 97 | 98 | except Exception as e: 99 | logger.error(f"提取script标签失败: {str(e)}") 100 | 101 | # 如果BeautifulSoup失败,尝试使用正则表达式 102 | try: 103 | script_pattern = re.compile(r']*>(.*?)', re.DOTALL | re.IGNORECASE) 104 | matches = script_pattern.finditer(html_content) 105 | 106 | for match in matches: 107 | scripts.append({ 108 | 'src': '', 109 | 'content': match.group(1) or '', 110 | 'type': '', 111 | 'language': '', 112 | 'original_tag': match.group(0) 113 | }) 114 | except Exception as fallback_error: 115 | logger.error(f"正则提取script标签也失败: {str(fallback_error)}") 116 | 117 | return scripts 118 | 119 | def extract_link_tags(html_content: str) -> List[Dict[str, str]]: 120 | """ 121 | 提取HTML中的link标签 122 | 123 | Args: 124 | html_content: HTML内容 125 | 126 | Returns: 127 | link标签列表 128 | """ 129 | links = [] 130 | 131 | try: 132 | soup = BeautifulSoup(html_content, 'lxml') 133 | link_tags = soup.find_all('link') 134 | 135 | for link in link_tags: 136 | links.append({ 137 | 'href': link.get('href', ''), 138 | 'rel': link.get('rel', ''), 139 | 'type': 
link.get('type', ''), 140 | 'original_tag': str(link) if link else '' 141 | }) 142 | 143 | except Exception as e: 144 | logger.error(f"提取link标签失败: {str(e)}") 145 | 146 | return links 147 | 148 | def extract_meta_tags(html_content: str) -> List[Dict[str, str]]: 149 | """ 150 | 提取HTML中的meta标签 151 | 152 | Args: 153 | html_content: HTML内容 154 | 155 | Returns: 156 | meta标签列表 157 | """ 158 | metas = [] 159 | 160 | try: 161 | soup = BeautifulSoup(html_content, 'lxml') 162 | meta_tags = soup.find_all('meta') 163 | 164 | for meta in meta_tags: 165 | meta_info = { 166 | 'name': meta.get('name', ''), 167 | 'content': meta.get('content', ''), 168 | 'http-equiv': meta.get('http-equiv', ''), 169 | 'charset': meta.get('charset', ''), 170 | 'original_tag': str(meta) if meta else '' 171 | } 172 | metas.append(meta_info) 173 | 174 | except Exception as e: 175 | logger.error(f"提取meta标签失败: {str(e)}") 176 | 177 | return metas 178 | 179 | def extract_iframe_tags(html_content: str) -> List[Dict[str, str]]: 180 | """ 181 | 提取HTML中的iframe标签 182 | 183 | Args: 184 | html_content: HTML内容 185 | 186 | Returns: 187 | iframe标签列表 188 | """ 189 | iframes = [] 190 | 191 | try: 192 | soup = BeautifulSoup(html_content, 'lxml') 193 | iframe_tags = soup.find_all('iframe') 194 | 195 | for iframe in iframe_tags: 196 | iframes.append({ 197 | 'src': iframe.get('src', ''), 198 | 'width': iframe.get('width', ''), 199 | 'height': iframe.get('height', ''), 200 | 'style': iframe.get('style', ''), 201 | 'original_tag': str(iframe) if iframe else '' 202 | }) 203 | 204 | except Exception as e: 205 | logger.error(f"提取iframe标签失败: {str(e)}") 206 | 207 | return iframes 208 | 209 | def extract_all_tags(html_content: str, tag_name: str) -> List[BeautifulSoup]: 210 | """ 211 | 提取指定标签的所有实例 212 | 213 | Args: 214 | html_content: HTML内容 215 | tag_name: 标签名称 216 | 217 | Returns: 218 | 标签列表 219 | """ 220 | tags = [] 221 | 222 | try: 223 | soup = BeautifulSoup(html_content, 'lxml') 224 | tags = soup.find_all(tag_name) 225 | except Exception as e: 226 | logger.error(f"提取{tag_name}标签失败: {str(e)}") 227 | 228 | return tags 229 | 230 | def get_dom_structure(html_content: str, max_depth: int = 3) -> Dict: 231 | """ 232 | 获取DOM结构概览 233 | 234 | Args: 235 | html_content: HTML内容 236 | max_depth: 最大深度 237 | 238 | Returns: 239 | DOM结构字典 240 | """ 241 | try: 242 | soup = BeautifulSoup(html_content, 'lxml') 243 | 244 | def _process_element(element, depth): 245 | if depth > max_depth: 246 | return {} 247 | 248 | tag_info = { 249 | 'tag': element.name, 250 | 'attributes': {k: v for k, v in element.attrs.items()}, 251 | 'children': [] 252 | } 253 | 254 | for child in element.children: 255 | if hasattr(child, 'name') and child.name: 256 | tag_info['children'].append(_process_element(child, depth + 1)) 257 | 258 | return tag_info 259 | 260 | return _process_element(soup.find('html') or soup, 0) 261 | 262 | except Exception as e: 263 | logger.error(f"获取DOM结构失败: {str(e)}") 264 | return {} 265 | 266 | def find_hidden_elements(html_content: str) -> List[Dict[str, str]]: 267 | """ 268 | 查找可能被隐藏的元素 269 | 270 | Args: 271 | html_content: HTML内容 272 | 273 | Returns: 274 | 隐藏元素列表 275 | """ 276 | hidden_elements = [] 277 | 278 | try: 279 | soup = BeautifulSoup(html_content, 'lxml') 280 | 281 | # 查找可能隐藏的元素 282 | for element in soup.find_all(): 283 | # 检查style属性 284 | style = element.get('style', '').lower() 285 | 286 | if any(hidden in style for hidden in ['display:none', 'visibility:hidden', 'opacity:0']): 287 | hidden_elements.append({ 288 | 'tag': element.name, 289 | 'style': 
style, 290 | 'content': element.get_text(), 291 | 'original_tag': str(element) if element else '' 292 | }) 293 | 294 | # 检查hidden属性 295 | if element.get('hidden') is not None: 296 | hidden_elements.append({ 297 | 'tag': element.name, 298 | 'reason': 'hidden attribute', 299 | 'content': element.get_text(), 300 | 'original_tag': str(element) if element else '' 301 | }) 302 | 303 | except Exception as e: 304 | logger.error(f"查找隐藏元素失败: {str(e)}") 305 | 306 | return hidden_elements 307 | 308 | def extract_text_from_html(html_content: str) -> str: 309 | """ 310 | 从HTML中提取纯文本 311 | 312 | Args: 313 | html_content: HTML内容 314 | 315 | Returns: 316 | 提取的纯文本 317 | """ 318 | try: 319 | soup = BeautifulSoup(html_content, 'lxml') 320 | 321 | # 移除script和style标签 322 | for script in soup(['script', 'style']): 323 | if script: 324 | script.decompose() 325 | 326 | # 提取文本 327 | text = soup.get_text(separator=' ', strip=True) 328 | 329 | # 清理空白字符 330 | text = re.sub(r'\s+', ' ', text) 331 | 332 | return text 333 | 334 | except Exception as e: 335 | logger.error(f"提取HTML文本失败: {str(e)}") 336 | return html_content 337 | 338 | def remove_html_tags(html_content: str, keep_whitespace: bool = False) -> str: 339 | """ 340 | 移除HTML标签 341 | 342 | Args: 343 | html_content: HTML内容 344 | keep_whitespace: 是否保留空白 345 | 346 | Returns: 347 | 移除标签后的文本 348 | """ 349 | try: 350 | # 使用正则表达式移除标签 351 | text = re.sub(r'<[^>]+>', '', html_content) 352 | 353 | if not keep_whitespace: 354 | # 移除多余的空白字符 355 | text = re.sub(r'\s+', ' ', text).strip() 356 | 357 | return text 358 | 359 | except Exception as e: 360 | logger.error(f"移除HTML标签失败: {str(e)}") 361 | return html_content 362 | 363 | def get_character_encoding(html_content: str) -> Optional[str]: 364 | """ 365 | 获取HTML文档的字符编码 366 | 367 | Args: 368 | html_content: HTML内容 369 | 370 | Returns: 371 | 字符编码 372 | """ 373 | try: 374 | # 检查meta标签中的charset 375 | charset_match = re.search(r']+charset=["\']?([^"\'>\s]+)', html_content, re.IGNORECASE) 376 | if charset_match: 377 | return charset_match.group(1).lower() 378 | 379 | # 检查http-equiv中的content-type 380 | content_type_match = re.search(r']+http-equiv=["\']?content-type["\']?[^>]*content=["\']?[^"\']*charset=([^"\'>\s;]+)', html_content, re.IGNORECASE) 381 | if content_type_match: 382 | return content_type_match.group(1).lower() 383 | 384 | return None 385 | 386 | except Exception as e: 387 | logger.error(f"获取字符编码失败: {str(e)}") 388 | return None 389 | 390 | # 兼容性函数,为了支持html_detector.py中的导入 391 | def extract_comments(html_content: str) -> List[Dict[str, str]]: 392 | """ 393 | 提取HTML注释(extract_html_comments的别名) 394 | 395 | Args: 396 | html_content: HTML内容 397 | 398 | Returns: 399 | 注释列表 400 | """ 401 | return extract_html_comments(html_content) 402 | -------------------------------------------------------------------------------- /utils/common_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | 通用工具模块 5 | """ 6 | 7 | import re 8 | import time 9 | import hashlib 10 | import logging 11 | import os 12 | from typing import List, Dict, Any, Optional, Tuple 13 | 14 | logger = logging.getLogger('YuanZhao.utils.common') 15 | 16 | def calculate_file_hash(file_path: str, hash_type: str = 'md5') -> Optional[str]: 17 | """ 18 | 计算文件哈希值 19 | 20 | Args: 21 | file_path: 文件路径 22 | hash_type: 哈希算法类型 (md5, sha1, sha256) 23 | 24 | Returns: 25 | 哈希值字符串 26 | """ 27 | try: 28 | hash_func = getattr(hashlib, hash_type) 29 | hash_obj = hash_func() 30 | 31 | with open(file_path, 'rb') 
as f: 32 | while True: 33 | data = f.read(65536) # 64KB chunks 34 | if not data: 35 | break 36 | hash_obj.update(data) 37 | 38 | return hash_obj.hexdigest() 39 | 40 | except Exception as e: 41 | logger.error(f"计算文件哈希失败: {file_path}, 错误: {str(e)}") 42 | return None 43 | 44 | def calculate_string_hash(string: str, hash_type: str = 'md5') -> Optional[str]: 45 | """ 46 | 计算字符串哈希值 47 | 48 | Args: 49 | string: 输入字符串 50 | hash_type: 哈希算法类型 51 | 52 | Returns: 53 | 哈希值字符串 54 | """ 55 | try: 56 | hash_func = getattr(hashlib, hash_type) 57 | return hash_func(string.encode('utf-8')).hexdigest() 58 | except Exception as e: 59 | logger.error(f"计算字符串哈希失败: {str(e)}") 60 | return None 61 | 62 | def clean_text(text: str) -> str: 63 | """ 64 | 清理文本,去除控制字符和多余空白 65 | 66 | Args: 67 | text: 输入文本 68 | 69 | Returns: 70 | 清理后的文本 71 | """ 72 | try: 73 | # 移除控制字符,保留换行和制表符 74 | text = ''.join(char for char in text if char.isprintable() or char in '\n\t') 75 | # 清理多余空白 76 | text = re.sub(r'\s+', ' ', text) 77 | return text.strip() 78 | except Exception as e: 79 | logger.error(f"清理文本失败: {str(e)}") 80 | return text 81 | 82 | def extract_text_between(text: str, start_marker: str, end_marker: str) -> List[str]: 83 | """ 84 | 提取两个标记之间的文本 85 | 86 | Args: 87 | text: 原始文本 88 | start_marker: 开始标记 89 | end_marker: 结束标记 90 | 91 | Returns: 92 | 提取的文本列表 93 | """ 94 | try: 95 | pattern = re.compile(re.escape(start_marker) + '(.*?)' + re.escape(end_marker), re.DOTALL) 96 | return pattern.findall(text) 97 | except Exception as e: 98 | logger.error(f"提取文本失败: {str(e)}") 99 | return [] 100 | 101 | def detect_encoding(text: str) -> Optional[str]: 102 | """ 103 | 检测文本编码(传入为 str 时返回默认编码) 104 | """ 105 | try: 106 | # 对已解码的 str 返回 utf-8,避免误导性“探测” 107 | return 'utf-8' 108 | except Exception as e: 109 | logger.error(f"检测编码失败: {str(e)}") 110 | return None 111 | 112 | def safe_decode(bytes_data: bytes, default_encoding: str = 'utf-8') -> str: 113 | """ 114 | 安全解码字节数据 115 | 116 | Args: 117 | bytes_data: 字节数据 118 | default_encoding: 默认编码 119 | 120 | Returns: 121 | 解码后的字符串 122 | """ 123 | try: 124 | # 尝试多种编码 125 | encodings = [default_encoding, 'gbk', 'gb2312', 'iso-8859-1'] 126 | 127 | for encoding in encodings: 128 | try: 129 | return bytes_data.decode(encoding) 130 | except UnicodeDecodeError: 131 | continue 132 | 133 | # 如果都失败,使用replace模式 134 | return bytes_data.decode(default_encoding, errors='replace') 135 | 136 | except Exception as e: 137 | logger.error(f"安全解码失败: {str(e)}") 138 | return str(bytes_data) 139 | 140 | def format_size(size_bytes: int) -> str: 141 | """ 142 | 格式化文件大小 143 | 144 | Args: 145 | size_bytes: 字节大小 146 | 147 | Returns: 148 | 格式化的大小字符串 149 | """ 150 | try: 151 | for unit in ['B', 'KB', 'MB', 'GB', 'TB']: 152 | if size_bytes < 1024.0: 153 | return f"{size_bytes:.2f} {unit}" 154 | size_bytes /= 1024.0 155 | return f"{size_bytes:.2f} PB" 156 | except Exception as e: 157 | logger.error(f"格式化大小失败: {str(e)}") 158 | return f"{size_bytes} B" 159 | 160 | def format_time(seconds: float) -> str: 161 | """ 162 | 格式化时间 163 | 164 | Args: 165 | seconds: 秒数 166 | 167 | Returns: 168 | 格式化的时间字符串 169 | """ 170 | try: 171 | if seconds < 1: 172 | return f"{seconds * 1000:.2f} ms" 173 | elif seconds < 60: 174 | return f"{seconds:.2f} s" 175 | elif seconds < 3600: 176 | minutes, seconds = divmod(seconds, 60) 177 | return f"{int(minutes)} m {seconds:.2f} s" 178 | else: 179 | hours, remainder = divmod(seconds, 3600) 180 | minutes, seconds = divmod(remainder, 60) 181 | return f"{int(hours)} h {int(minutes)} m {seconds:.2f} s" 182 | except Exception 
as e: 183 | logger.error(f"格式化时间失败: {str(e)}") 184 | return f"{seconds} s" 185 | 186 | def get_file_extension(file_path: str) -> str: 187 | """ 188 | 获取文件扩展名 189 | 190 | Args: 191 | file_path: 文件路径 192 | 193 | Returns: 194 | 扩展名(小写) 195 | """ 196 | try: 197 | _, ext = os.path.splitext(file_path.lower()) 198 | return ext 199 | except Exception as e: 200 | logger.error(f"获取文件扩展名失败: {str(e)}") 201 | return '' 202 | 203 | def validate_ip_address(ip: str) -> bool: 204 | """ 205 | 验证IP地址格式 206 | 207 | Args: 208 | ip: IP地址字符串 209 | 210 | Returns: 211 | 是否为有效IP地址 212 | """ 213 | try: 214 | # IPv4地址验证 215 | pattern = re.compile(r'^((25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$') 216 | return bool(pattern.match(ip)) 217 | except Exception as e: 218 | logger.error(f"验证IP地址失败: {str(e)}") 219 | return False 220 | 221 | def count_occurrences(text: str, keyword: str, case_sensitive: bool = False) -> int: 222 | """ 223 | 统计关键字出现次数 224 | 225 | Args: 226 | text: 文本内容 227 | keyword: 关键字 228 | case_sensitive: 是否区分大小写 229 | 230 | Returns: 231 | 出现次数 232 | """ 233 | try: 234 | if not case_sensitive: 235 | text = text.lower() 236 | keyword = keyword.lower() 237 | 238 | return text.count(keyword) 239 | except Exception as e: 240 | logger.error(f"统计关键字失败: {str(e)}") 241 | return 0 242 | 243 | def is_valid_email(email: str) -> bool: 244 | """ 245 | 验证邮箱格式 246 | 247 | Args: 248 | email: 邮箱地址 249 | 250 | Returns: 251 | 是否为有效邮箱 252 | """ 253 | try: 254 | pattern = re.compile(r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$') 255 | return bool(pattern.match(email)) 256 | except Exception as e: 257 | logger.error(f"验证邮箱失败: {str(e)}") 258 | return False 259 | 260 | def sanitize_filename(filename: str) -> str: 261 | """ 262 | 清理文件名,移除特殊字符 263 | 264 | Args: 265 | filename: 原始文件名 266 | 267 | Returns: 268 | 清理后的文件名 269 | """ 270 | try: 271 | # 移除或替换特殊字符 272 | sanitized = re.sub(r'[\\/:*?"<>|]', '_', filename) 273 | # 移除控制字符 274 | sanitized = ''.join(char for char in sanitized if char.isprintable() or char.isspace()) 275 | # 限制长度 276 | max_length = 200 277 | if len(sanitized) > max_length: 278 | name, ext = os.path.splitext(sanitized) 279 | sanitized = name[:max_length - len(ext)] + ext 280 | return sanitized.strip() or 'unnamed' 281 | except Exception as e: 282 | logger.error(f"清理文件名失败: {str(e)}") 283 | return 'unnamed' 284 | 285 | def merge_dicts(dict1: Dict, dict2: Dict, deep: bool = True) -> Dict: 286 | """ 287 | 合并两个字典 288 | 289 | Args: 290 | dict1: 第一个字典 291 | dict2: 第二个字典 292 | deep: 是否深度合并 293 | 294 | Returns: 295 | 合并后的字典 296 | """ 297 | try: 298 | result = dict1.copy() 299 | 300 | if deep: 301 | for key, value in dict2.items(): 302 | if key in result and isinstance(result[key], dict) and isinstance(value, dict): 303 | result[key] = merge_dicts(result[key], value, deep=True) 304 | else: 305 | result[key] = value 306 | else: 307 | result.update(dict2) 308 | 309 | return result 310 | except Exception as e: 311 | logger.error(f"合并字典失败: {str(e)}") 312 | return dict1 313 | 314 | def remove_duplicates_preserve_order(items: List) -> List: 315 | """ 316 | 移除列表中的重复项,保留原始顺序 317 | 318 | Args: 319 | items: 输入列表 320 | 321 | Returns: 322 | 去重后的列表 323 | """ 324 | try: 325 | seen = set() 326 | return [item for item in items if not (item in seen or seen.add(item))] 327 | except Exception as e: 328 | logger.error(f"去重失败: {str(e)}") 329 | return items 330 | 331 | def truncate_text(text: str, max_length: int, suffix: str = '...') -> str: 332 | """ 333 | 截断文本 334 | 335 | Args: 336 | text: 输入文本 337 
| max_length: 最大长度 338 | suffix: 后缀 339 | 340 | Returns: 341 | 截断后的文本 342 | """ 343 | try: 344 | if len(text) <= max_length: 345 | return text 346 | return text[:max_length - len(suffix)] + suffix 347 | except Exception as e: 348 | logger.error(f"截断文本失败: {str(e)}") 349 | return text 350 | 351 | def retry(func, max_retries: int = 3, delay: float = 1.0, exceptions: tuple = (Exception,)) -> Any: 352 | """ 353 | 重试装饰器 354 | 355 | Args: 356 | func: 要重试的函数 357 | max_retries: 最大重试次数 358 | delay: 重试间隔(秒) 359 | exceptions: 捕获的异常类型 360 | 361 | Returns: 362 | 函数执行结果 363 | """ 364 | def wrapper(*args, **kwargs): 365 | last_exception = None 366 | 367 | for attempt in range(max_retries): 368 | try: 369 | return func(*args, **kwargs) 370 | except exceptions as e: 371 | last_exception = e 372 | if attempt < max_retries - 1: 373 | logger.warning(f"尝试 {attempt + 1}/{max_retries} 失败: {str(e)}, {delay}秒后重试...") 374 | time.sleep(delay) 375 | 376 | logger.error(f"所有尝试都失败了: {str(last_exception)}") 377 | raise last_exception 378 | 379 | return wrapper 380 | 381 | # 移除末尾的导入语句 382 | 383 | # 兼容性函数,为了支持html_detector.py中的导入 384 | def extract_text_between_markers(text: str, start_marker: str, end_marker: str) -> List[str]: 385 | """ 386 | 提取两个标记之间的文本(extract_text_between的别名) 387 | 388 | Args: 389 | text: 原始文本 390 | start_marker: 开始标记 391 | end_marker: 结束标记 392 | 393 | Returns: 394 | 提取的文本列表 395 | """ 396 | return extract_text_between(text, start_marker, end_marker) 397 | 398 | def get_context(text: str, position: int, context_length: int = 50) -> str: 399 | """ 400 | 获取文本中指定位置的上下文 401 | 402 | Args: 403 | text: 原始文本 404 | position: 目标位置 405 | context_length: 上下文长度 406 | 407 | Returns: 408 | 包含上下文的文本 409 | """ 410 | try: 411 | # 计算上下文的起始和结束位置 412 | context_start = max(0, position - context_length) 413 | context_end = min(len(text), position + context_length) 414 | 415 | # 提取上下文 416 | context = text[context_start:context_end] 417 | 418 | # 添加省略号 419 | prefix = '...' if context_start > 0 else '' 420 | suffix = '...' 
if context_end < len(text) else '' 421 | 422 | return f"{prefix}{context}{suffix}" 423 | except Exception as e: 424 | logger.error(f"获取上下文失败: {str(e)}") 425 | return text 426 | 427 | def calculate_entropy(text: str) -> float: 428 | """ 429 | 计算文本的熵值 430 | 431 | Args: 432 | text: 输入文本 433 | 434 | Returns: 435 | 熵值 436 | """ 437 | try: 438 | import math 439 | 440 | # 计算字符频率 441 | frequency = {} 442 | for char in text: 443 | if char in frequency: 444 | frequency[char] += 1 445 | else: 446 | frequency[char] = 1 447 | 448 | # 计算熵 449 | entropy = 0.0 450 | total_chars = len(text) 451 | 452 | for count in frequency.values(): 453 | probability = count / total_chars 454 | entropy -= probability * math.log2(probability) 455 | 456 | return entropy 457 | except Exception as e: 458 | logger.error(f"计算熵值失败: {str(e)}") 459 | return 0.0 460 | 461 | -------------------------------------------------------------------------------- /YuanZhao.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | 渊照 - 暗链扫描工具 6 | """ 7 | 8 | import os 9 | import sys 10 | import argparse 11 | import logging 12 | import re 13 | from datetime import datetime 14 | from urllib.parse import urlparse 15 | 16 | # 添加项目根目录到Python路径 17 | sys.path.append(os.path.dirname(os.path.abspath(__file__))) 18 | 19 | from utils.logging_utils import setup_logging, log_config, log_summary 20 | from core.config import Config 21 | from core.scanner import Scanner 22 | from core.reporter import Reporter 23 | 24 | def parse_arguments(): 25 | """ 26 | 解析命令行参数 27 | """ 28 | description = '''渊照 - 专业暗链扫描工具 29 | 30 | 用于智能检测网站、HTML文件或目录中的可疑暗链、隐藏元素和恶意代码。 31 | 支持自动识别扫描目标类型(本地文件/目录、内网URL、公网URL),并应用最优扫描策略。 32 | 提供多种扫描模式和报告格式,具备强大的检测能力和灵活的配置选项。 33 | 34 | 主要功能: 35 | - 基础扫描:HTML代码、JavaScript代码、CSS代码、元标签、注释扫描 36 | - 高级扫描:加密/编码链接检测、隐写术检测、DOM操作检测、iframe检测 37 | - 特殊隐藏手法检测:颜色隐藏、绝对定位隐藏、零宽字符隐藏、字体大小隐藏等 38 | - 关键字匹配:支持自定义关键字文件,按类别组织关键字,进行多语言匹配 39 | - 优化的HTML报告:清晰展示可疑链接信息,上下文列直接显示从日志中检测到的完整问题 40 | ''' 41 | 42 | parser = argparse.ArgumentParser(description=description, formatter_class=argparse.RawDescriptionHelpFormatter) 43 | 44 | # 扫描目标 45 | parser.add_argument('target', help='扫描目标:文件路径、目录路径或URL(支持http/https协议)') 46 | 47 | # 扫描配置 48 | parser.add_argument('-d', '--depth', type=int, default=3, 49 | help='递归扫描深度(默认:3,0表示仅扫描当前文件/目录)') 50 | parser.add_argument('-m', '--mode', choices=['fast', 'standard', 'deep'], default='deep', 51 | help='''扫描模式: 52 | fast(基础):仅检测基本的暗链与明显可疑元素,快速 53 | standard(高级):增加JS/HTML/CSS分析与隐藏元素检测 54 | deep(完整):执行全部检测模块,适合深度审计''') 55 | parser.add_argument('-t', '--threads', type=int, default=8, 56 | help='并发线程数(默认:8,范围1-100)') 57 | parser.add_argument('-o', '--output', help='报告输出目录(默认:./reports)') 58 | parser.add_argument('-f', '--format', choices=['txt', 'html', 'json', 'csv'], default='txt', 59 | help='''报告格式(默认:txt): 60 | txt:简洁的文本报告,适合快速查看和日志记录 61 | html:详细的网页报告,包含样式和表格,上下文列直接显示问题链接 62 | json:结构化数据,适合程序处理和自动化集成 63 | csv:表格数据,适合导入电子表格软件进行进一步分析''') 64 | 65 | # 高级配置 66 | parser.add_argument('--timeout', type=int, default=30, 67 | help='请求超时时间(秒,默认:30)。注意:工具会根据目标类型(内网/公网)自动优化超时设置') 68 | parser.add_argument('--proxy', help='''代理设置,格式: 69 | http://username:password@host:port(有认证)或 70 | http://host:port(无认证)''') 71 | parser.add_argument('--keyword-file', help='''自定义关键字文件路径(CSV格式) 72 | 格式示例:关键字,类别,风险权重 73 | 类别可选:gambling(博彩), porn(色情), malware(恶意软件), phishing(钓鱼), other(其他) 74 | 风险权重范围:1-10,10为最高风险''') 75 | parser.add_argument('--exclude', nargs='+', 
help='排除的文件或目录(支持通配符,如 "*.log" "node_modules/")') 76 | parser.add_argument('--no-color', action='store_true', help='禁用彩色输出') 77 | parser.add_argument('--verbose', action='store_true', default=False, help='显示详细日志信息,包括检测过程和调试内容') 78 | 79 | # 无头浏览器选项 80 | parser.add_argument('--headless', action='store_true', help='启用无头浏览器扫描 (增强动态内容检测)') 81 | parser.add_argument('--browser-type', choices=['chrome'], default='chrome', help='无头浏览器类型 (默认: chrome)') 82 | parser.add_argument('--js-wait', type=int, default=3, help='JavaScript执行等待时间 (秒, 默认: 3)') 83 | parser.add_argument('--headless-timeout', type=int, default=60, help='无头浏览器超时时间 (秒, 默认: 60)') 84 | parser.add_argument('--headless-binary', help='Chrome二进制路径 (例如: C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe)') 85 | parser.add_argument('--headless-driver', help='ChromeDriver路径 (例如: C:\\drivers\\chromedriver.exe)') 86 | parser.add_argument('--target-file', help='目标列表文件,每行一个目标') 87 | 88 | # 添加使用示例 89 | parser.epilog = ''' 90 | 使用示例: 91 | # 扫描单个HTML文件 92 | python YuanZhao.py test.html 93 | 94 | # 扫描目录及其子目录(深度为2) 95 | python YuanZhao.py ./website -d 2 96 | 97 | # 扫描URL,使用高级模式,保存为HTML格式报告 98 | python YuanZhao.py https://example.com -m standard -f html 99 | 100 | # 使用自定义关键字文件,禁用彩色输出 101 | python YuanZhao.py ./website --keyword-file custom_keywords.txt --no-color 102 | 103 | # 完整扫描公网网站并生成HTML报告(优化后格式,在上下文列显示完整问题链接) 104 | python YuanZhao.py https://example.com -m deep -d 1 -t 8 --timeout 30 -f html --verbose 105 | 106 | # 扫描特定新闻页面并在可疑链接详情中显示问题信息 107 | python YuanZhao.py https://example.com/news.php -m deep -d 1 -t 8 --timeout 30 -f html --verbose 108 | 109 | # 对内网网站进行深度扫描,使用较长超时时间 110 | python YuanZhao.py http://192.168.1.100 -d 4 -m deep --timeout 60 -f html -o intranet_reports 111 | 112 | # 扫描并排除特定文件类型 113 | python YuanZhao.py ./website --exclude "*.log" "temp/*" "node_modules/" 114 | 115 | # 使用无头浏览器增强扫描动态内容 116 | python YuanZhao.py https://example.com --headless --js-wait 5 117 | ''' 118 | 119 | return parser.parse_args() 120 | 121 | def validate_arguments(args): 122 | """ 123 | 验证命令行参数 124 | """ 125 | # 验证目标是否存在(如果是文件或目录) 126 | if not args.target.startswith(('http://', 'https://')): 127 | if not os.path.exists(args.target): 128 | print(f"错误:目标 '{args.target}' 不存在") 129 | return False 130 | if args.target.lower().endswith('.txt'): 131 | try: 132 | with open(args.target, 'r', encoding='utf-8') as f: 133 | lines = [line.strip() for line in f.readlines() if line.strip()] 134 | if not lines: 135 | print("错误:目标列表文件为空") 136 | return False 137 | except Exception: 138 | print("错误:无法读取目标列表文件") 139 | return False 140 | 141 | # 验证关键字文件 142 | if args.keyword_file and not os.path.exists(args.keyword_file): 143 | print(f"错误:关键字文件 '{args.keyword_file}' 不存在") 144 | return False 145 | if args.target_file and not os.path.exists(args.target_file): 146 | print(f"错误:目标列表文件 '{args.target_file}' 不存在") 147 | return False 148 | 149 | # 验证线程数 150 | if args.threads < 1 or args.threads > 100: 151 | print("错误:线程数必须在1-100之间") 152 | return False 153 | 154 | # 验证扫描深度 155 | if args.depth < 0: 156 | print("错误:扫描深度不能为负数") 157 | return False 158 | 159 | return True 160 | 161 | def main(): 162 | """ 163 | 主函数 164 | """ 165 | # 解析参数 166 | args = parse_arguments() 167 | 168 | # 验证参数 169 | if not validate_arguments(args): 170 | sys.exit(1) 171 | 172 | # 创建报告目录 173 | report_dir = args.output or os.path.join(os.getcwd(), 'reports') 174 | os.makedirs(report_dir, exist_ok=True) 175 | 176 | # 设置日志 177 | log_level = logging.DEBUG if args.verbose else logging.INFO 178 | logger =
setup_logging(log_dir=report_dir, level=log_level) 179 | 180 | # 记录开始时间 181 | start_time = datetime.now() 182 | logger.info(f"开始扫描:{args.target}") 183 | logger.info(f"扫描模式:{args.mode}") 184 | 185 | # 创建配置 186 | config = Config() 187 | 188 | # 设置配置属性 189 | # 判断目标类型 190 | if args.target.startswith(('http://', 'https://')): 191 | # 检查是否为内网链接 192 | parsed_url = urlparse(args.target) 193 | domain = parsed_url.netloc 194 | # 内网域名/IP特征 195 | if (re.match(r'^127\.0\.0\.1(:\d+)?$', domain) or 196 | re.match(r'^localhost(:\d+)?$', domain) or 197 | re.match(r'^10\.\d+\.\d+\.\d+(:\d+)?$', domain) or 198 | re.match(r'^172\.(?:1[6-9]|2\d|3[01])\.\d+\.\d+(:\d+)?$', domain) or 199 | re.match(r'^192\.168\.\d+\.\d+(:\d+)?$', domain)): 200 | config.target_type = 'internal_url' 201 | else: 202 | config.target_type = 'external_url' 203 | elif os.path.isfile(args.target): 204 | config.target_type = 'local_file' 205 | elif os.path.isdir(args.target): 206 | config.target_type = 'local_directory' 207 | else: 208 | config.target_type = 'unknown' 209 | 210 | config.target = args.target 211 | config.crawl_depth = args.depth 212 | config.depth = args.depth # 同步更新depth属性 213 | 214 | # 映射扫描模式(仅使用新名称) 215 | mode_mapping = { 216 | 'fast': 'fast', 217 | 'standard': 'standard', 218 | 'deep': 'deep' 219 | } 220 | config.scan_mode = mode_mapping.get(args.mode, 'standard') 221 | config.mode = config.scan_mode # 同步更新mode属性 222 | config._set_mode_config() # 更新模式相关配置 223 | 224 | config.threads = args.threads 225 | config.timeout = args.timeout 226 | config.proxy = args.proxy 227 | config.keywords_file = args.keyword_file 228 | config.report_type = args.format 229 | config.report_file = os.path.join(report_dir, f"scan_report_{datetime.now().strftime('%Y%m%d_%H%M%S')}.{args.format}") 230 | config.debug = args.verbose 231 | # 排除规则 232 | config.exclude = args.exclude or [] 233 | 234 | # 设置无头浏览器配置 235 | config.use_headless_browser = args.headless 236 | config.headless_browser = args.browser_type 237 | config.js_wait_time = args.js_wait 238 | config.headless_timeout = args.headless_timeout 239 | config.headless_binary = args.headless_binary 240 | config.headless_driver_path = args.headless_driver 241 | if args.headless: 242 | config.headless_auto_download = True 243 | 244 | # 记录配置 245 | log_config(logger, config.get_config_dict()) 246 | 247 | try: 248 | targets = [] 249 | if args.target_file: 250 | with open(args.target_file, 'r', encoding='utf-8') as f: 251 | targets = [line.strip() for line in f.readlines() if line.strip()] 252 | summary_target = f"目标列表: {args.target_file} ({len(targets)} 项)" 253 | elif not args.target.startswith(('http://', 'https://')) and args.target.lower().endswith('.txt'): 254 | with open(args.target, 'r', encoding='utf-8') as f: 255 | targets = [line.strip() for line in f.readlines() if line.strip()] 256 | summary_target = f"目标列表: {args.target} ({len(targets)} 项)" 257 | else: 258 | targets = [args.target] 259 | summary_target = args.target 260 | agg = { 261 | 'total_files': 0, 262 | 'scanned_files': 0, 263 | 'scanned_urls': 0, 264 | 'total_issues': 0, 265 | 'suspicious_links': [], 266 | 'hidden_elements': [], 267 | 'keyword_matches': [], 268 | 'js_issues': [], 269 | 'css_issues': [], 270 | 'scan_time': 0 271 | } 272 | for tgt in targets: 273 | if tgt.startswith(('http://', 'https://')): 274 | parsed_url = urlparse(tgt) 275 | domain = parsed_url.netloc 276 | if (re.match(r'^127\.0\.0\.1(:\d+)?$', domain) or 277 | re.match(r'^localhost(:\d+)?$', domain) or 278 | re.match(r'^10\.\d+\.\d+\.\d+(:\d+)?$', domain) or 
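# 10.0.0.0/8 私有网段(RFC 1918),命中则按内网URL处理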
279 | re.match(r'^172\.(?:1[6-9]|2\d|3[01])\.\d+\.\d+(:\d+)?$', domain) or 280 | re.match(r'^192\.168\.\d+\.\d+(:\d+)?$', domain)): 281 | config.target_type = 'internal_url' 282 | else: 283 | config.target_type = 'external_url' 284 | elif os.path.isfile(tgt): 285 | config.target_type = 'local_file' 286 | elif os.path.isdir(tgt): 287 | config.target_type = 'local_directory' 288 | else: 289 | continue 290 | config.target = tgt 291 | scanner = Scanner(config) 292 | res = scanner.scan() 293 | agg['total_files'] += res.get('total_files', 0) 294 | agg['scanned_files'] += res.get('scanned_files', 0) 295 | agg['scanned_urls'] += res.get('scanned_urls', 0) 296 | agg['total_issues'] += res.get('total_issues', 0) 297 | agg['suspicious_links'].extend(res.get('suspicious_links', [])) 298 | agg['hidden_elements'].extend(res.get('hidden_elements', [])) 299 | agg['keyword_matches'].extend(res.get('keyword_matches', [])) 300 | agg['js_issues'].extend(res.get('js_issues', [])) 301 | agg['css_issues'].extend(res.get('css_issues', [])) 302 | end_time = datetime.now() 303 | duration = str(end_time - start_time) 304 | config.target = summary_target 305 | reporter = Reporter(config) 306 | report_file = reporter.generate_report(agg, duration) 307 | scan_time = (end_time - start_time).total_seconds() 308 | log_summary( 309 | logger, 310 | total_files=agg.get('total_files', 0), 311 | scanned_files=agg.get('scanned_files', 0), 312 | issues_found=agg.get('total_issues', 0), 313 | scan_time=scan_time 314 | ) 315 | logger.info(f"扫描完成!报告已保存至:{report_file}") 316 | print(f"\n扫描完成!报告已保存至:{report_file}") 317 | 318 | except Exception as e: 319 | logger.error(f"扫描过程中发生错误:{str(e)}", exc_info=True) 320 | print(f"错误:扫描过程中发生错误 - {str(e)}") 321 | sys.exit(1) 322 | 323 | if __name__ == '__main__': 324 | main() 325 | 326 | -------------------------------------------------------------------------------- /utils/css_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | CSS处理工具模块 5 | """ 6 | 7 | import re 8 | import logging 9 | from typing import List, Dict, Optional 10 | 11 | logger = logging.getLogger('YuanZhao.utils.css') 12 | 13 | def extract_css_urls(css_content: str) -> List[Dict[str, str]]: 14 | """ 15 | 提取CSS中的URL 16 | 17 | Args: 18 | css_content: CSS内容 19 | 20 | Returns: 21 | URL列表 22 | """ 23 | urls = [] 24 | 25 | try: 26 | # 匹配CSS中的url()函数 27 | url_pattern = re.compile(r'url\s*\(\s*(["\']?)([^"\'\)]+)\1\s*\)', re.IGNORECASE) 28 | matches = url_pattern.finditer(css_content) 29 | 30 | for match in matches: 31 | url = match.group(2) 32 | start_pos = match.start(0) 33 | end_pos = match.end(0) 34 | 35 | # 获取上下文 36 | context_start = max(0, start_pos - 50) 37 | context_end = min(len(css_content), end_pos + 50) 38 | context = css_content[context_start:context_end] 39 | 40 | urls.append({ 41 | 'url': url, 42 | 'original': match.group(0), 43 | 'context': context, 44 | 'position': (start_pos, end_pos) 45 | }) 46 | 47 | except Exception as e: 48 | logger.error(f"提取CSS URL失败: {str(e)}") 49 | 50 | return urls 51 | 52 | def extract_import_rules(css_content: str) -> List[Dict[str, str]]: 53 | """ 54 | 提取CSS中的@import规则 55 | 56 | Args: 57 | css_content: CSS内容 58 | 59 | Returns: 60 | @import规则列表 61 | """ 62 | import_rules = [] 63 | 64 | try: 65 | # 匹配@import规则 66 | import_pattern = re.compile(r'@import\s+(["\']?)([^"\';\n]+)\1\s*([^;\n]*)\s*;', re.IGNORECASE) 67 | matches = import_pattern.finditer(css_content) 68 | 69 | for match in matches: 70 | url = 
match.group(2) 71 | media = match.group(3) 72 | start_pos = match.start(0) 73 | end_pos = match.end(0) 74 | 75 | import_rules.append({ 76 | 'url': url, 77 | 'media': media, 78 | 'original': match.group(0), 79 | 'position': (start_pos, end_pos) 80 | }) 81 | 82 | except Exception as e: 83 | logger.error(f"提取CSS @import规则失败: {str(e)}") 84 | 85 | return import_rules 86 | 87 | def extract_selectors(css_content: str) -> List[Dict[str, str]]: 88 | """ 89 | 提取CSS选择器 90 | 91 | Args: 92 | css_content: CSS内容 93 | 94 | Returns: 95 | 选择器列表 96 | """ 97 | selectors = [] 98 | 99 | try: 100 | # 移除注释 101 | css_content = remove_css_comments(css_content) 102 | 103 | # 匹配CSS规则 104 | rule_pattern = re.compile(r'([^{]+)\s*{[^}]*}', re.DOTALL) 105 | rules = rule_pattern.finditer(css_content) 106 | 107 | for rule in rules: 108 | selector_text = rule.group(1).strip() 109 | 110 | # 分割多个选择器 111 | for selector in selector_text.split(','): 112 | selector = selector.strip() 113 | if selector: 114 | selectors.append({ 115 | 'selector': selector, 116 | 'position': (rule.start(1), rule.end(1)) 117 | }) 118 | 119 | except Exception as e: 120 | logger.error(f"提取CSS选择器失败: {str(e)}") 121 | 122 | return selectors 123 | 124 | def extract_css_properties(css_content: str) -> List[Dict[str, str]]: 125 | """ 126 | 提取CSS属性 127 | 128 | Args: 129 | css_content: CSS内容 130 | 131 | Returns: 132 | CSS属性列表 133 | """ 134 | properties = [] 135 | 136 | try: 137 | # 移除注释 138 | css_content = remove_css_comments(css_content) 139 | 140 | # 匹配CSS规则体 141 | body_pattern = re.compile(r'\{([^}]*)\}', re.DOTALL) 142 | bodies = body_pattern.finditer(css_content) 143 | 144 | for body in bodies: 145 | body_content = body.group(1) 146 | body_start = body.start(1) 147 | 148 | # 匹配属性 149 | prop_pattern = re.compile(r'([^:;\s]+)\s*:\s*([^;]+);') 150 | props = prop_pattern.finditer(body_content) 151 | 152 | for prop in props: 153 | prop_name = prop.group(1).strip() 154 | prop_value = prop.group(2).strip() 155 | 156 | properties.append({ 157 | 'property': prop_name, 158 | 'value': prop_value, 159 | 'position': (body_start + prop.start(1), body_start + prop.end(1)) 160 | }) 161 | 162 | except Exception as e: 163 | logger.error(f"提取CSS属性失败: {str(e)}") 164 | 165 | return properties 166 | 167 | def detect_hidden_elements(css_content: str) -> List[Dict[str, str]]: 168 | """ 169 | 检测可能用于隐藏元素的CSS规则 170 | 171 | Args: 172 | css_content: CSS内容 173 | 174 | Returns: 175 | 隐藏规则列表 176 | """ 177 | hidden_rules = [] 178 | 179 | # 隐藏元素的属性模式 180 | hiding_patterns = [ 181 | (r'display\s*:\s*none', 'display: none'), 182 | (r'visibility\s*:\s*hidden', 'visibility: hidden'), 183 | (r'opacity\s*:\s*0', 'opacity: 0'), 184 | (r'position\s*:\s*absolute.*left\s*:\s*[-+]?\d+(?:\.\d+)?(?:px|em|%)\s*;.*top\s*:\s*[-+]?\d+(?:\.\d+)?(?:px|em|%)\s*;.*width\s*:\s*\d+px\s*;.*height\s*:\s*\d+px', 'absolute positioned tiny element'), 185 | (r'position\s*:\s*absolute.*left\s*:\s*[-+]?\d+(?:\.\d+)?(?:px|em|%)\s*;.*top\s*:\s*[-+]?\d+(?:\.\d+)?(?:px|em|%)', 'absolute positioned'), 186 | (r'overflow\s*:\s*hidden', 'overflow: hidden'), 187 | (r'clip\s*:\s*rect\(0\s*px\s*0\s*px\s*0\s*px\s*0\s*px\)', 'clip: rect'), 188 | (r'font-size\s*:\s*0(?:px)?', 'font-size: 0'), 189 | (r'line-height\s*:\s*0(?:px)?', 'line-height: 0'), 190 | (r'text-indent\s*:\s*[-+]?\d+(?:\.\d+)?(?:px|em|%)', 'text-indent'), 191 | (r'color\s*:\s*transparent', 'color: transparent'), 192 | (r'background-color\s*:\s*transparent', 'background-color: transparent'), 193 | (r'height\s*:\s*0(?:px)?', 'height: 0'), 194 | 
(r'width\s*:\s*0(?:px)?', 'width: 0'), 195 | ] 196 | 197 | try: 198 | # 移除注释 199 | css_content = remove_css_comments(css_content) 200 | 201 | # 匹配CSS规则 202 | rule_pattern = re.compile(r'([^{]+)\s*{([^}]*)}', re.DOTALL) 203 | rules = rule_pattern.finditer(css_content) 204 | 205 | for rule in rules: 206 | selector = rule.group(1).strip() 207 | body = rule.group(2) 208 | start_pos = rule.start(0) 209 | end_pos = rule.end(0) 210 | 211 | # 检查每个隐藏模式 212 | for pattern_str, hiding_type in hiding_patterns: 213 | pattern = re.compile(pattern_str, re.DOTALL | re.IGNORECASE) 214 | 215 | if pattern.search(body): 216 | hidden_rules.append({ 217 | 'type': hiding_type, 218 | 'selector': selector, 219 | 'css': body.strip(), 220 | 'original_rule': rule.group(0), 221 | 'position': (start_pos, end_pos) 222 | }) 223 | break # 每个规则只记录一次 224 | 225 | except Exception as e: 226 | logger.error(f"检测隐藏元素失败: {str(e)}") 227 | 228 | return hidden_rules 229 | 230 | def detect_suspicious_selectors(css_content: str) -> List[Dict[str, str]]: 231 | """ 232 | 检测可疑的CSS选择器 233 | 234 | Args: 235 | css_content: CSS内容 236 | Returns: 237 | 可疑选择器列表 238 | """ 239 | suspicious_selectors = [] 240 | 241 | # 可疑选择器模式 242 | suspicious_patterns = [ 243 | # 随机字符串类名或ID 244 | (r'\.(\w{8,})[^\w\-]', 'long_random_class'), 245 | (r'#(\w{8,})[^\w\-]', 'long_random_id'), 246 | # 连续数字类名或ID 247 | (r'\.(\d{4,})[^\w\-]', 'numeric_class'), 248 | (r'#(\d{4,})[^\w\-]', 'numeric_id'), 249 | # 特殊字符选择器 250 | (r'[\[\*\+\~\^\$\|]', 'complex_selector'), 251 | ] 252 | 253 | try: 254 | # 移除注释 255 | css_content = remove_css_comments(css_content) 256 | 257 | # 匹配CSS规则 258 | rule_pattern = re.compile(r'([^{]+)\s*{[^}]*}', re.DOTALL) 259 | rules = rule_pattern.finditer(css_content) 260 | 261 | for rule in rules: 262 | selector_text = rule.group(1).strip() 263 | 264 | # 检查每个可疑模式 265 | for pattern_str, selector_type in suspicious_patterns: 266 | pattern = re.compile(pattern_str, re.DOTALL) 267 | 268 | if pattern.search(selector_text): 269 | suspicious_selectors.append({ 270 | 'type': selector_type, 271 | 'selector': selector_text, 272 | 'position': (rule.start(1), rule.end(1)) 273 | }) 274 | break # 每个选择器只记录一次 275 | 276 | except Exception as e: 277 | logger.error(f"检测可疑选择器失败: {str(e)}") 278 | 279 | return suspicious_selectors 280 | 281 | def remove_css_comments(css_content: str) -> str: 282 | """ 283 | 移除CSS注释 284 | 285 | Args: 286 | css_content: CSS内容 287 | 288 | Returns: 289 | 移除注释后的CSS内容 290 | """ 291 | try: 292 | # 移除CSS注释 293 | css_content = re.sub(r'/\*.*?\*/', '', css_content, flags=re.DOTALL) 294 | return css_content 295 | except Exception as e: 296 | logger.error(f"移除CSS注释失败: {str(e)}") 297 | return css_content 298 | 299 | 300 | def analyze_complexity(css_content: str) -> Dict[str, int]: 301 | """ 302 | 分析CSS复杂度 303 | 304 | Args: 305 | css_content: CSS内容 306 | 307 | Returns: 308 | 包含复杂度指标的字典 309 | """ 310 | complexity = { 311 | 'rules_count': 0, 312 | 'selectors_count': 0, 313 | 'properties_count': 0, 314 | 'imports_count': 0, 315 | 'media_queries_count': 0 316 | } 317 | 318 | try: 319 | # 移除注释 320 | css_content = remove_css_comments(css_content) 321 | 322 | # 计算规则数量 323 | rule_pattern = re.compile(r'\{[^}]*\}', re.DOTALL) 324 | complexity['rules_count'] = len(rule_pattern.findall(css_content)) 325 | 326 | # 计算选择器数量 327 | selectors = extract_selectors(css_content) 328 | complexity['selectors_count'] = len(selectors) 329 | 330 | # 计算属性数量 331 | properties = extract_css_properties(css_content) 332 | complexity['properties_count'] = len(properties) 333 | 334 | # 
计算导入规则数量 335 | imports = extract_import_rules(css_content) 336 | complexity['imports_count'] = len(imports) 337 | 338 | # 计算媒体查询数量 339 | media_query_pattern = re.compile(r'@media\s+[^\{]*\{[^}]*\}', re.DOTALL) 340 | complexity['media_queries_count'] = len(media_query_pattern.findall(css_content)) 341 | 342 | except Exception as e: 343 | logger.error(f"分析CSS复杂度失败: {str(e)}") 344 | 345 | return complexity 346 | 347 | def extract_css_comments(css_content: str) -> List[Dict[str, str]]: 348 | """ 349 | 提取CSS注释 350 | 351 | Args: 352 | css_content: CSS内容 353 | 354 | Returns: 355 | 注释列表 356 | """ 357 | comments = [] 358 | 359 | try: 360 | comment_pattern = re.compile(r'/\*(.*?)\*/', re.DOTALL) 361 | matches = comment_pattern.finditer(css_content) 362 | 363 | for match in matches: 364 | comment_content = match.group(1).strip() 365 | start_pos = match.start(0) 366 | end_pos = match.end(0) 367 | 368 | comments.append({ 369 | 'content': comment_content, 370 | 'position': (start_pos, end_pos) 371 | }) 372 | 373 | except Exception as e: 374 | logger.error(f"提取CSS注释失败: {str(e)}") 375 | 376 | return comments 377 | 378 | def analyze_css_complexity(css_content: str) -> Dict[str, int]: 379 | """ 380 | 分析CSS复杂度 381 | 382 | Args: 383 | css_content: CSS内容 384 | 385 | Returns: 386 | 复杂度指标 387 | """ 388 | try: 389 | # 移除注释 390 | css_content = remove_css_comments(css_content) 391 | 392 | # 计算规则数量 393 | rule_pattern = re.compile(r'[^\s\n\r]+\s*{[^}]*}', re.DOTALL) 394 | rules = rule_pattern.findall(css_content) 395 | rule_count = len(rules) 396 | 397 | # 计算选择器数量 398 | selectors = extract_selectors(css_content) 399 | selector_count = len(selectors) 400 | 401 | # 计算属性数量 402 | properties = extract_css_properties(css_content) 403 | property_count = len(properties) 404 | 405 | # 计算URL数量 406 | urls = extract_css_urls(css_content) 407 | url_count = len(urls) 408 | 409 | return { 410 | 'rule_count': rule_count, 411 | 'selector_count': selector_count, 412 | 'property_count': property_count, 413 | 'url_count': url_count, 414 | 'file_size': len(css_content), 415 | } 416 | 417 | except Exception as e: 418 | logger.error(f"分析CSS复杂度失败: {str(e)}") 419 | return {} 420 | 421 | def find_duplicate_rules(css_content: str) -> List[Dict[str, str]]: 422 | """ 423 | 查找重复的CSS规则 424 | 425 | Args: 426 | css_content: CSS内容 427 | 428 | Returns: 429 | 重复规则列表 430 | """ 431 | duplicate_rules = [] 432 | seen_rules = {} 433 | 434 | try: 435 | # 移除注释 436 | css_content = remove_css_comments(css_content) 437 | 438 | # 匹配CSS规则 439 | rule_pattern = re.compile(r'([^{]+)\s*{([^}]*)}', re.DOTALL) 440 | rules = rule_pattern.finditer(css_content) 441 | 442 | for rule in rules: 443 | selector = rule.group(1).strip() 444 | body = rule.group(2).strip() 445 | 446 | # 使用body作为键,查找重复 447 | if body in seen_rules: 448 | duplicate_rules.append({ 449 | 'selector': selector, 450 | 'duplicate_selector': seen_rules[body], 451 | 'css_body': body 452 | }) 453 | else: 454 | seen_rules[body] = selector 455 | 456 | except Exception as e: 457 | logger.error(f"查找重复规则失败: {str(e)}") 458 | 459 | return duplicate_rules 460 | -------------------------------------------------------------------------------- /core/detector/special_hiding_detector.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | 特殊隐藏技术检测器模块 5 | """ 6 | 7 | import re 8 | import logging 9 | from typing import List, Dict 10 | 11 | logger = logging.getLogger('YuanZhao.detector.special_hiding') 12 | 13 | class SpecialHidingDetector: 14 | 
"""特殊隐藏技术检测器""" 15 | 16 | def __init__(self, config): 17 | self.config = config 18 | self._init_patterns() 19 | 20 | def _init_patterns(self): 21 | """初始化正则表达式模式""" 22 | # 零宽字符模式 23 | self.zero_width_chars = [ 24 | '\u200B', # 零宽空格 25 | '\u200C', # 零宽不连字 26 | '\u200D', # 零宽连字 27 | '\u2060', # 字连接符 28 | '\uFEFF', # 字节顺序标记 29 | ] 30 | self.zero_width_pattern = re.compile('|'.join(re.escape(c) for c in self.zero_width_chars)) 31 | 32 | # 空白字符堆积 33 | self.whitespace_pattern = re.compile(r'(\s|\t|\r|\n){10,}') 34 | 35 | # 颜色隐藏(颜色接近背景色) 36 | self.color_pattern = re.compile( 37 | r'color\s*:\s*(#\w{3,6}|rgba?\([^)]+\))', 38 | re.IGNORECASE 39 | ) 40 | self.background_color_pattern = re.compile( 41 | r'background-color\s*:\s*(#\w{3,6}|rgba?\([^)]+\))', 42 | re.IGNORECASE 43 | ) 44 | 45 | # 绝对定位隐藏(离屏元素) 46 | self.absolute_position_pattern = re.compile( 47 | r'position\s*:\s*absolute.*?(left|top|bottom|right)\s*:\s*(-?\d+(?:\.\d+)?(?:px|em|%)?)', 48 | re.IGNORECASE | re.DOTALL 49 | ) 50 | 51 | # 字体大小隐藏 52 | self.font_size_pattern = re.compile( 53 | r'font-size\s*:\s*(0|0\.\d+)', 54 | re.IGNORECASE 55 | ) 56 | 57 | # 文本缩进隐藏 58 | self.text_indent_pattern = re.compile( 59 | r'text-indent\s*:\s*(-\d+(?:\.\d+)?(?:px|em|%))', 60 | re.IGNORECASE 61 | ) 62 | 63 | # 透明度隐藏 64 | self.opacity_pattern = re.compile( 65 | r'opacity\s*:\s*(0|0\.\d+)', 66 | re.IGNORECASE 67 | ) 68 | self.visibility_pattern = re.compile( 69 | r'visibility\s*:\s*hidden', 70 | re.IGNORECASE 71 | ) 72 | self.display_none_pattern = re.compile( 73 | r'display\s*:\s*none', 74 | re.IGNORECASE 75 | ) 76 | 77 | # 多层嵌套隐藏 78 | self.nested_elements_pattern = re.compile( 79 | r'<(div|span|p|a)[^>]*>\s*<(div|span|p|a)[^>]*>\s*<(div|span|p|a)[^>]*>', 80 | re.IGNORECASE 81 | ) 82 | 83 | # HTML实体编码隐藏 84 | self.html_entity_pattern = re.compile(r'&#(\d+);|&#x([0-9a-f]+);') 85 | 86 | # 可疑的编码混合 87 | self.mixed_encoding_pattern = re.compile( 88 | r'https?://(?:[\w\-._~:/?#[\]@!$&\'()*+,;=]|%[0-9a-fA-F]{2})+', 89 | re.IGNORECASE 90 | ) 91 | 92 | def detect(self, content: str, source: str) -> List[Dict]: 93 | """检测特殊隐藏技术""" 94 | results = [] 95 | 96 | try: 97 | # 检测零宽字符 98 | zero_width_results = self._detect_zero_width_chars(content, source) 99 | results.extend(zero_width_results) 100 | 101 | # 检测空白字符堆积 102 | whitespace_results = self._detect_whitespace(content, source) 103 | results.extend(whitespace_results) 104 | 105 | # 检测颜色隐藏 106 | color_results = self._detect_color_hiding(content, source) 107 | results.extend(color_results) 108 | 109 | # 检测绝对定位隐藏 110 | position_results = self._detect_position_hiding(content, source) 111 | results.extend(position_results) 112 | 113 | # 检测字体大小隐藏 114 | font_size_results = self._detect_font_size_hiding(content, source) 115 | results.extend(font_size_results) 116 | 117 | # 检测文本缩进隐藏 118 | indent_results = self._detect_text_indent_hiding(content, source) 119 | results.extend(indent_results) 120 | 121 | # 检测透明度隐藏 122 | opacity_results = self._detect_opacity_hiding(content, source) 123 | results.extend(opacity_results) 124 | 125 | # 检测多层嵌套隐藏 126 | nested_results = self._detect_nested_elements(content, source) 127 | results.extend(nested_results) 128 | 129 | # 检测HTML实体编码隐藏 130 | entity_results = self._detect_html_entities(content, source) 131 | results.extend(entity_results) 132 | 133 | except Exception as e: 134 | logger.error(f"特殊隐藏技术检测失败: {str(e)}", exc_info=True) 135 | 136 | return results 137 | 138 | def _detect_zero_width_chars(self, content: str, source: str) -> List[Dict]: 139 | """检测零宽字符""" 140 | results = [] 141 | 142 
| matches = list(self.zero_width_pattern.finditer(content)) 143 | if matches: 144 | # 收集所有零宽字符的上下文 145 | context = self._get_context(content, matches[0].start(), matches[-1].end(), 100) 146 | 147 | # 解码隐藏内容(如果可能) 148 | hidden_content = self._extract_hidden_content(content, self.zero_width_chars) 149 | 150 | results.append({ 151 | 'link': f'零宽字符隐藏 ({len(matches)}个字符)', 152 | 'source': source, 153 | 'type': 'zero_width_hiding', 154 | 'detection_method': 'regex', 155 | 'risk_level': '高', 156 | 'context': context, 157 | 'hidden_content': hidden_content if hidden_content else None 158 | }) 159 | 160 | return results 161 | 162 | def _detect_whitespace(self, content: str, source: str) -> List[Dict]: 163 | """检测空白字符堆积""" 164 | results = [] 165 | 166 | for match in self.whitespace_pattern.finditer(content): 167 | # 检查是否在HTML标签之间或注释中 168 | context = self._get_context(content, match.start(), match.end(), 50) 169 | 170 | # 只有在标签之间大量空白才认为可疑 171 | if '<' not in context and '>' not in context: 172 | results.append({ 173 | 'link': f'空白字符堆积 ({len(match.group(0))}个字符)',  # group(0) 为整段连续空白,group(1) 只含最后一个字符 174 | 'source': source, 175 | 'type': 'whitespace_hiding', 176 | 'detection_method': 'regex', 177 | 'risk_level': '中', 178 | 'context': context 179 | }) 180 | 181 | return results 182 | 183 | def _detect_color_hiding(self, content: str, source: str) -> List[Dict]: 184 | """检测颜色隐藏""" 185 | results = [] 186 | 187 | # 找到所有颜色定义 188 | for color_match in self.color_pattern.finditer(content): 189 | color = color_match.group(1) 190 | 191 | # 在同一段落中查找背景颜色 192 | start_pos = max(0, color_match.start() - 200) 193 | end_pos = min(len(content), color_match.end() + 200) 194 | segment = content[start_pos:end_pos] 195 | 196 | bg_match = self.background_color_pattern.search(segment) 197 | if bg_match: 198 | bg_color = bg_match.group(1) 199 | 200 | # 如果颜色非常接近背景色,标记为可疑 201 | if self._colors_are_similar(color, bg_color): 202 | results.append({ 203 | 'link': f'颜色隐藏 (文字:{color}, 背景:{bg_color})', 204 | 'source': source, 205 | 'type': 'color_hiding', 206 | 'detection_method': 'regex', 207 | 'risk_level': '高', 208 | 'context': self._get_context(content, color_match.start(), color_match.end()) 209 | }) 210 | 211 | return results 212 | 213 | def _detect_position_hiding(self, content: str, source: str) -> List[Dict]: 214 | """检测绝对定位隐藏""" 215 | results = [] 216 | 217 | for match in self.absolute_position_pattern.finditer(content): 218 | direction = match.group(1).lower() 219 | value = match.group(2) 220 | 221 | # 提取数值部分 222 | num_value = float(re.search(r'([-\d.]+)', value).group(1)) 223 | 224 | # 如果位置在屏幕外(非常大的负值或正值) 225 | if abs(num_value) > 1000: 226 | results.append({ 227 | 'link': f'绝对定位隐藏 ({direction}:{value})', 228 | 'source': source, 229 | 'type': 'position_hiding', 230 | 'detection_method': 'regex', 231 | 'risk_level': '高', 232 | 'context': self._get_context(content, match.start(), match.end()) 233 | }) 234 | 235 | return results 236 | 237 | def _detect_font_size_hiding(self, content: str, source: str) -> List[Dict]: 238 | """检测字体大小隐藏""" 239 | results = [] 240 | 241 | for match in self.font_size_pattern.finditer(content): 242 | size = match.group(1) 243 | 244 | results.append({ 245 | 'link': f'字体大小隐藏 (size:{size})', 246 | 'source': source, 247 | 'type': 'font_size_hiding', 248 | 'detection_method': 'regex', 249 | 'risk_level': '高', 250 | 'context': self._get_context(content, match.start(), match.end()) 251 | }) 252 | 253 | return results 254 | 255 | def _detect_text_indent_hiding(self, content: str, source: str) -> List[Dict]: 256 | """检测文本缩进隐藏""" 257 | results =
[] 258 | 259 | for match in self.text_indent_pattern.finditer(content): 260 | indent = match.group(1) 261 | 262 | # 提取数值部分 263 | num_value = float(re.search(r'([-\d.]+)', indent).group(1)) 264 | 265 | # 如果缩进很大(负值),可能是隐藏文本 266 | if num_value < -50: 267 | results.append({ 268 | 'link': f'文本缩进隐藏 (indent:{indent})', 269 | 'source': source, 270 | 'type': 'text_indent_hiding', 271 | 'detection_method': 'regex', 272 | 'risk_level': '高', 273 | 'context': self._get_context(content, match.start(), match.end()) 274 | }) 275 | 276 | return results 277 | 278 | def _detect_opacity_hiding(self, content: str, source: str) -> List[Dict]: 279 | """检测透明度隐藏""" 280 | results = [] 281 | 282 | # 检测opacity 283 | for match in self.opacity_pattern.finditer(content): 284 | opacity = match.group(1) 285 | results.append({ 286 | 'link': f'透明度隐藏 (opacity:{opacity})', 287 | 'source': source, 288 | 'type': 'opacity_hiding', 289 | 'detection_method': 'regex', 290 | 'risk_level': '高', 291 | 'context': self._get_context(content, match.start(), match.end()) 292 | }) 293 | 294 | # 检测visibility:hidden 295 | for match in self.visibility_pattern.finditer(content): 296 | results.append({ 297 | 'link': '可见性隐藏 (visibility:hidden)', 298 | 'source': source, 299 | 'type': 'visibility_hiding', 300 | 'detection_method': 'regex', 301 | 'risk_level': '高', 302 | 'context': self._get_context(content, match.start(), match.end()) 303 | }) 304 | 305 | # 检测display:none 306 | for match in self.display_none_pattern.finditer(content): 307 | results.append({ 308 | 'link': '显示隐藏 (display:none)', 309 | 'source': source, 310 | 'type': 'display_hiding', 311 | 'detection_method': 'regex', 312 | 'risk_level': '高', 313 | 'context': self._get_context(content, match.start(), match.end()) 314 | }) 315 | 316 | return results 317 | 318 | def _detect_nested_elements(self, content: str, source: str) -> List[Dict]: 319 | """检测多层嵌套隐藏""" 320 | results = [] 321 | 322 | for match in self.nested_elements_pattern.finditer(content): 323 | results.append({ 324 | 'link': '多层嵌套隐藏', 325 | 'source': source, 326 | 'type': 'nested_hiding', 327 | 'detection_method': 'regex', 328 | 'risk_level': '中', 329 | 'context': self._get_context(content, match.start(), match.end()) 330 | }) 331 | 332 | return results 333 | 334 | def _detect_html_entities(self, content: str, source: str) -> List[Dict]: 335 | """检测HTML实体编码隐藏""" 336 | results = [] 337 | 338 | # 计算HTML实体的密度 339 | entity_matches = list(self.html_entity_pattern.finditer(content)) 340 | 341 | # 如果在较短的文本中有大量实体编码,可能是隐藏内容 342 | if len(entity_matches) > 10: 343 | # 尝试解码一些实体看看是否包含可疑内容 344 | sample = content[max(0, entity_matches[0].start() - 20):entity_matches[min(5, len(entity_matches)-1)].end() + 20] 345 | 346 | results.append({ 347 | 'link': f'HTML实体编码隐藏 ({len(entity_matches)}个实体)', 348 | 'source': source, 349 | 'type': 'entity_hiding', 350 | 'detection_method': 'regex', 351 | 'risk_level': '中', 352 | 'context': sample 353 | }) 354 | 355 | return results 356 | 357 | def _colors_are_similar(self, color1: str, color2: str) -> bool: 358 | """检查两个颜色是否相似""" 359 | # 这是一个简化的实现,实际应用中可能需要更复杂的颜色比较 360 | # 在这里我们只是检查是否完全相同或都是深色/浅色 361 | 362 | # 转换为小写以便比较 363 | color1 = color1.lower() 364 | color2 = color2.lower() 365 | 366 | # 如果完全相同,肯定是相似的 367 | if color1 == color2: 368 | return True 369 | 370 | # 检查是否都是深色(简化判断) 371 | dark_colors = ['#000', '#000000', 'black', 'rgb(0,0,0)'] 372 | if color1 in dark_colors and color2 in dark_colors: 373 | return True 374 | 375 | # 检查是否都是白色 376 | white_colors = ['#fff', '#ffffff', 'white', 'rgb(255,255,255)'] 377 
| if color1 in white_colors and color2 in white_colors: 378 | return True 379 | 380 | return False 381 | 382 | def _extract_hidden_content(self, content: str, markers: List[str]) -> str: 383 | """从内容中提取使用特定标记隐藏的内容""" 384 | # 这个方法可以进一步扩展来提取使用零宽字符编码的隐藏内容 385 | # 目前只是一个简单的实现 386 | 387 | # 移除所有标记字符,看看是否有剩余的有意义内容 388 | clean_content = content 389 | for marker in markers: 390 | clean_content = clean_content.replace(marker, '') 391 | 392 | # 如果清理后的内容与原内容不同,返回清理后的内容(限制长度) 393 | if clean_content != content: 394 | return clean_content.strip()[:200] 395 | 396 | return None 397 | 398 | def _get_context(self, content: str, start: int, end: int, context_size: int = 50) -> str: 399 | """获取匹配内容的上下文""" 400 | start_context = max(0, start - context_size) 401 | end_context = min(len(content), end + context_size) 402 | 403 | context = content[start_context:end_context] 404 | context = context.replace('\n', ' ').replace('\r', ' ') 405 | 406 | # 移除零宽字符以便显示 407 | for char in self.zero_width_chars: 408 | context = context.replace(char, '') 409 | 410 | return context 411 | -------------------------------------------------------------------------------- /utils/js_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | JavaScript处理工具模块 5 | """ 6 | 7 | import re 8 | import logging 9 | from typing import List, Dict, Any 10 | 11 | logger = logging.getLogger('YuanZhao.utils.js') 12 | 13 | # 常见的可疑JavaScript模式 14 | SUSPICIOUS_PATTERNS = [ 15 | # 文档写入相关 16 | r'document\.write\s*\(', 17 | r'document\.writeln\s*\(', 18 | r'document\.createElement\s*\(\s*["\']script["\']\s*\)', 19 | 20 | # DOM操作相关 21 | r'appendChild\s*\(', 22 | r'insertBefore\s*\(', 23 | r'innerHTML\s*=', 24 | r'outerHTML\s*=', 25 | 26 | # 编码解码相关 27 | r'decodeURIComponent\s*\(', 28 | r'decodeURI\s*\(', 29 | r'eval\s*\(', 30 | r'Function\s*\(', 31 | r'fromCharCode\s*\(', 32 | 33 | # URL相关 34 | r'location\.href\s*=', 35 | r'window\.location\s*=', 36 | r'location\.replace\s*\(', 37 | r'location\.assign\s*\(', 38 | 39 | # 定时器相关 40 | r'setTimeout\s*\(', 41 | r'setInterval\s*\(', 42 | 43 | # AJAX相关 44 | r'XMLHttpRequest', 45 | r'fetch\s*\(', 46 | r'axios', 47 | 48 | # 混淆相关 49 | r'\+\s*"', # 字符串拼接 50 | r'["\']\s*\+\s*["\']', # 空字符串拼接 51 | r'\[\d+\]', # 数字索引访问 52 | ] 53 | 54 | def extract_suspicious_patterns(js_content: str) -> List[Dict[str, str]]: 55 | """ 56 | 提取可疑的JavaScript模式 57 | 58 | Args: 59 | js_content: JavaScript代码 60 | 61 | Returns: 62 | 可疑模式列表 63 | """ 64 | suspicious_matches = [] 65 | 66 | try: 67 | for pattern_str in SUSPICIOUS_PATTERNS: 68 | pattern = re.compile(pattern_str, re.IGNORECASE) 69 | matches = pattern.finditer(js_content) 70 | 71 | for match in matches: 72 | code_segment = match.group(0) 73 | start_pos = match.start(0) 74 | end_pos = match.end(0) 75 | 76 | # 获取上下文 77 | context = get_code_context(js_content, start_pos, end_pos) 78 | 79 | suspicious_matches.append({ 80 | 'pattern': pattern_str, 81 | 'code_segment': code_segment, 82 | 'context': context, 83 | 'position': (start_pos, end_pos) 84 | }) 85 | 86 | except Exception as e: 87 | logger.error(f"提取可疑模式失败: {str(e)}") 88 | 89 | return suspicious_matches 90 | 91 | def get_code_context(js_content: str, start_pos: int, end_pos: int, context_lines: int = 3) -> str: 92 | """ 93 | 获取代码上下文 94 | 95 | Args: 96 | js_content: 完整代码 97 | start_pos: 开始位置 98 | end_pos: 结束位置 99 | context_lines: 上下文行数 100 | 101 | Returns: 102 | 包含上下文的代码段 103 | """ 104 | try: 105 | # 获取行号 106 | lines = js_content.split('\n') 107 | current_line = 0 
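# current_line 记录 start_pos 所在的行号(从0开始)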
108 | char_count = 0 109 | 110 | for i, line in enumerate(lines): 111 | char_count += len(line) + 1 # +1 for newline 112 | if char_count > start_pos: 113 | current_line = i 114 | break 115 | 116 | # 获取上下文行 117 | start_line = max(0, current_line - context_lines) 118 | end_line = min(len(lines), current_line + context_lines + 1) 119 | 120 | context_lines = lines[start_line:end_line] 121 | 122 | return '\n'.join(context_lines) 123 | 124 | except Exception as e: 125 | logger.error(f"获取代码上下文失败: {str(e)}") 126 | # 回退到简单的字符上下文 127 | context_start = max(0, start_pos - 100) 128 | context_end = min(len(js_content), end_pos + 100) 129 | return js_content[context_start:context_end] 130 | 131 | def detect_dynamic_urls(js_content: str) -> List[Dict[str, str]]: 132 | """ 133 | 检测动态生成的URL 134 | 135 | Args: 136 | js_content: JavaScript代码 137 | 138 | Returns: 139 | 动态URL列表 140 | """ 141 | dynamic_urls = [] 142 | 143 | # 检测常见的URL赋值模式 144 | url_patterns = [ 145 | re.compile(r'(?:href|src|url)\s*=\s*([^;\n]+);', re.DOTALL), 146 | re.compile(r'(?:location\.href|window\.location)\s*=\s*([^;\n]+);', re.DOTALL), 147 | re.compile(r'fetch\s*\(\s*([^)]+)\s*\)', re.DOTALL), 148 | re.compile(r'\.open\s*\(\s*["\'](get|post|put|delete)["\']\s*,\s*([^)]+)\s*\)', re.DOTALL), 149 | ] 150 | 151 | try: 152 | for pattern in url_patterns: 153 | matches = pattern.finditer(js_content) 154 | 155 | for match in matches: 156 | code_segment = match.group(0) 157 | start_pos = match.start(0) 158 | end_pos = match.end(0) 159 | 160 | # 判断是否包含变量或表达式 161 | if any(ch in code_segment for ch in ['+', '\'', '"', '`', '[', ']', '(', ')']): 162 | # 优先尝试从表达式中提取规范化URL常量 163 | url_const = None 164 | m_http = re.search(r'["\'`]\s*(https?://[^"\'`\s]+)\s*["\'`]', code_segment) 165 | if m_http: 166 | url_const = m_http.group(1) 167 | m_proto = re.search(r'["\'`]\s*(//[^"\'`\s]+)\s*["\'`]', code_segment) 168 | if (not url_const) and m_proto: 169 | url_const = 'https:' + m_proto.group(1) 170 | dynamic_urls.append({ 171 | 'url': url_const if url_const else None, 172 | 'expression': code_segment, 173 | 'reason': '动态构建的URL', 174 | 'context': get_code_context(js_content, start_pos, end_pos), 175 | 'position': (start_pos, end_pos) 176 | }) 177 | 178 | except Exception as e: 179 | logger.error(f"检测动态URL失败: {str(e)}") 180 | 181 | return dynamic_urls 182 | 183 | def detect_obfuscated_code(js_content: str) -> List[Dict[str, str]]: 184 | """ 185 | 检测混淆的JavaScript代码 186 | 187 | Args: 188 | js_content: JavaScript代码 189 | 190 | Returns: 191 | 混淆代码列表 192 | """ 193 | obfuscated_segments = [] 194 | 195 | # 检测常见的混淆模式 196 | obfuscation_patterns = [ 197 | # 大量的字符串拼接 198 | (r'("[^"\\]*(?:\\.[^"\\]*)*"\s*\+\s*){3,}', 'multiple_string_concatenation'), 199 | # 长的十六进制字符串 200 | (r'(\\x[0-9a-fA-F]{2}){10,}', 'hex_encoding'), 201 | # Unicode编码 202 | (r'(\\u[0-9a-fA-F]{4}){5,}', 'unicode_encoding'), 203 | # 数组混淆 204 | (r'(\[\s*\d+\s*\]\s*\+){3,}', 'array_obfuscation'), 205 | # eval + 字符串 206 | (r'eval\s*\(\s*["\'](?:[^"\'\\]|\\.)*["\']\s*\)', 'eval_with_string'), 207 | # 大量的变量替换 208 | (r'(var|let|const)\s+[a-z]\s*=\s*[^;]+;\s*[a-z]\s*\+\s*=[^;]+;', 'variable_replacement'), 209 | ] 210 | 211 | try: 212 | for pattern_str, obfuscation_type in obfuscation_patterns: 213 | pattern = re.compile(pattern_str, re.DOTALL) 214 | matches = pattern.finditer(js_content) 215 | 216 | for match in matches: 217 | code_segment = match.group(0) 218 | start_pos = match.start(0) 219 | end_pos = match.end(0) 220 | 221 | obfuscated_segments.append({ 222 | 'type': obfuscation_type, 223 | 'code_segment': 
code_segment, 224 | 'context': get_code_context(js_content, start_pos, end_pos), 225 | 'position': (start_pos, end_pos) 226 | }) 227 | 228 | except Exception as e: 229 | logger.error(f"检测混淆代码失败: {str(e)}") 230 | 231 | return obfuscated_segments 232 | 233 | def extract_function_calls(js_content: str, function_name: str) -> List[Dict[str, str]]: 234 | """ 235 | 提取特定函数调用 236 | 237 | Args: 238 | js_content: JavaScript代码 239 | function_name: 函数名 240 | 241 | Returns: 242 | 函数调用列表 243 | """ 244 | function_calls = [] 245 | 246 | try: 247 | # 构建函数调用的正则表达式 248 | pattern_str = rf'{re.escape(function_name)}\s*\(\s*([^)]*)\s*\)' # 避免 re.escape 对 \ 进行转义 249 | pattern = re.compile(pattern_str, re.DOTALL) 250 | matches = pattern.finditer(js_content) 251 | 252 | for match in matches: 253 | full_call = match.group(0) 254 | arguments = match.group(1) 255 | start_pos = match.start(0) 256 | end_pos = match.end(0) 257 | 258 | function_calls.append({ 259 | 'function': function_name, 260 | 'arguments': arguments, 261 | 'full_call': full_call, 262 | 'context': get_code_context(js_content, start_pos, end_pos), 263 | 'position': (start_pos, end_pos) 264 | }) 265 | 266 | except Exception as e: 267 | logger.error(f"提取函数调用失败: {str(e)}") 268 | 269 | return function_calls 270 | 271 | def detect_document_modification(js_content: str) -> List[Dict[str, str]]: 272 | """ 273 | 检测文档修改操作 274 | 275 | Args: 276 | js_content: JavaScript代码 277 | 278 | Returns: 279 | 文档修改操作列表 280 | """ 281 | modifications = [] 282 | 283 | # 文档修改相关的模式 284 | modification_patterns = [ 285 | (r'document\.write\s*\(', 'document.write'), 286 | (r'document\.writeln\s*\(', 'document.writeln'), 287 | (r'innerHTML\s*=', 'innerHTML assignment'), 288 | (r'outerHTML\s*=', 'outerHTML assignment'), 289 | (r'appendChild\s*\(', 'appendChild'), 290 | (r'insertBefore\s*\(', 'insertBefore'), 291 | (r'insertAdjacentHTML\s*\(', 'insertAdjacentHTML'), 292 | (r'createElement\s*\(', 'createElement'), 293 | ] 294 | 295 | try: 296 | for pattern_str, modification_type in modification_patterns: 297 | pattern = re.compile(pattern_str, re.IGNORECASE) 298 | matches = pattern.finditer(js_content) 299 | 300 | for match in matches: 301 | code_segment = match.group(0) 302 | start_pos = match.start(0) 303 | end_pos = match.end(0) 304 | 305 | target = modification_type 306 | value = code_segment 307 | modifications.append({ 308 | 'action': 'modify_document', 309 | 'target': target, 310 | 'value': value, 311 | 'description': modification_type, 312 | 'context': get_code_context(js_content, start_pos, end_pos), 313 | 'position': (start_pos, end_pos) 314 | }) 315 | 316 | except Exception as e: 317 | logger.error(f"检测文档修改失败: {str(e)}") 318 | 319 | return modifications 320 | 321 | def extract_variable_assignments(js_content: str, variable_name: str) -> List[Dict[str, str]]: 322 | """ 323 | 提取变量赋值 324 | 325 | Args: 326 | js_content: JavaScript代码 327 | variable_name: 变量名 328 | 329 | Returns: 330 | 变量赋值列表 331 | """ 332 | assignments = [] 333 | 334 | try: 335 | # 构建变量赋值的正则表达式 336 | pattern_str = rf'(?:var|let|const)?\s*{re.escape(variable_name)}\s*=\s*([^;\n]+)' # 避免 re.escape 对 \ 进行转义 337 | pattern = re.compile(pattern_str, re.DOTALL) 338 | matches = pattern.finditer(js_content) 339 | 340 | for match in matches: 341 | full_assignment = match.group(0) 342 | value = match.group(1) 343 | start_pos = match.start(0) 344 | end_pos = match.end(0) 345 | 346 | assignments.append({ 347 | 'variable': variable_name, 348 | 'value': value, 349 | 'full_assignment': full_assignment, 350 | 'context': 
get_code_context(js_content, start_pos, end_pos), 351 | 'position': (start_pos, end_pos) 352 | }) 353 | 354 | except Exception as e: 355 | logger.error(f"提取变量赋值失败: {str(e)}") 356 | 357 | return assignments 358 | 359 | def extract_comments(js_content: str) -> List[Dict[str, Any]]: 360 | """ 361 | 提取JavaScript注释 362 | 363 | Args: 364 | js_content: JavaScript代码 365 | 366 | Returns: 367 | 注释列表 368 | """ 369 | comments = [] 370 | 371 | try: 372 | # 匹配单行注释 373 | single_line_pattern = re.compile(r'//(.*?)$', re.MULTILINE) 374 | single_line_matches = single_line_pattern.finditer(js_content) 375 | 376 | for match in single_line_matches: 377 | comment_content = match.group(1).strip() 378 | start_pos = match.start(0) 379 | end_pos = match.end(0) 380 | 381 | comments.append({ 382 | 'type': 'single_line', 383 | 'content': comment_content, 384 | 'position': (start_pos, end_pos) 385 | }) 386 | 387 | # 匹配多行注释 388 | multi_line_pattern = re.compile(r'/\*(.*?)\*/', re.DOTALL) 389 | multi_line_matches = multi_line_pattern.finditer(js_content) 390 | 391 | for match in multi_line_matches: 392 | comment_content = match.group(1).strip() 393 | start_pos = match.start(0) 394 | end_pos = match.end(0) 395 | 396 | comments.append({ 397 | 'type': 'multi_line', 398 | 'content': comment_content, 399 | 'position': (start_pos, end_pos) 400 | }) 401 | 402 | except Exception as e: 403 | logger.error(f"提取JavaScript注释失败: {str(e)}") 404 | 405 | return comments 406 | 407 | def strip_comments(js_content: str) -> str: 408 | """ 409 | 移除JavaScript注释 410 | """ 411 | try: 412 | s = js_content 413 | out = [] 414 | i = 0 415 | n = len(s) 416 | in_sq = False 417 | in_dq = False 418 | in_bt = False 419 | while i < n: 420 | ch = s[i] 421 | if not in_sq and not in_dq and not in_bt and ch == '/' and i + 1 < n: 422 | nxt = s[i+1] 423 | if nxt == '/': 424 | j = i + 2 425 | while j < n and s[j] not in '\n\r': 426 | j += 1 427 | i = j 428 | continue 429 | if nxt == '*': 430 | j = i + 2 431 | while j + 1 < n and not (s[j] == '*' and s[j+1] == '/'): 432 | j += 1 433 | i = j + 2 if j + 1 < n else n 434 | continue 435 | out.append(ch) 436 | if ch == "'" and not in_dq and not in_bt: 437 | esc = i > 0 and s[i-1] == '\\' 438 | if not esc: 439 | in_sq = not in_sq 440 | elif ch == '"' and not in_sq and not in_bt: 441 | esc = i > 0 and s[i-1] == '\\' 442 | if not esc: 443 | in_dq = not in_dq 444 | elif ch == '`' and not in_sq and not in_dq: 445 | in_bt = not in_bt 446 | i += 1 447 | return ''.join(out) 448 | except Exception as e: 449 | logger.error(f"移除JavaScript注释失败: {str(e)}") 450 | return js_content 451 | 452 | # 兼容性函数,为了支持js_detector.py中的导入 453 | def identify_obfuscated_code(js_content: str) -> Dict[str, Any]: 454 | """ 455 | 识别混淆代码并返回聚合信息 456 | """ 457 | segments = detect_obfuscated_code(js_content) 458 | is_obf = len(segments) > 0 459 | patterns = [seg.get('type', '') for seg in segments] 460 | sample = segments[0].get('code_segment', '') if segments else '' 461 | return { 462 | 'is_obfuscated': is_obf, 463 | 'detected_patterns': patterns, 464 | 'sample': sample 465 | } 466 | 467 | ## 兼容别名已移除,请使用 detect_document_modification 468 | 469 | ## 兼容别名已移除,请使用 strip_comments 470 | 471 | -------------------------------------------------------------------------------- /utils/network_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | 网络处理工具模块 5 | """ 6 | 7 | import os 8 | import re 9 | import logging 10 | import requests 11 | import ssl 12 | from typing import Dict, List, 
Tuple, Optional, Any 13 | from urllib.parse import urlparse, urljoin 14 | 15 | logger = logging.getLogger('YuanZhao.utils.network') 16 | 17 | # 常见URL模式正则表达式 18 | URL_PATTERNS = [ 19 | # 标准URL 20 | re.compile(r'https?://[\w\-\.]+(?:\.[\w\-]+)+[\w\-\._~:/?#[\]@!\$&\'\(\)\*\+,;=.]+', re.IGNORECASE), 21 | # 协议相对URL 22 | re.compile(r'//[\w\-\.]+(?:\.[\w\-]+)+[\w\-\._~:/?#[\]@!\$&\'\(\)\*\+,;=.]+', re.IGNORECASE), 23 | # 仅域名 24 | re.compile(r'[a-zA-Z0-9][-a-zA-Z0-9]{0,62}(\.[a-zA-Z0-9][-a-zA-Z0-9]{0,62})+\.?', re.IGNORECASE), 25 | # IP地址形式 26 | re.compile(r'\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b(?::\d{1,5})?', re.IGNORECASE), 27 | # JavaScript伪协议 28 | re.compile(r'javascript:[^\s"\'>]+', re.IGNORECASE), 29 | # data URI 30 | re.compile(r'data:[^;]+;base64,[^\s"\'>]+', re.IGNORECASE), 31 | # 相对路径 32 | re.compile(r'/[^\s"\'>]+', re.IGNORECASE), 33 | ] 34 | 35 | 36 | def normalize_url(url: str, base_url: Optional[str] = None) -> str: 37 | """ 38 | 规范化URL 39 | 40 | Args: 41 | url: 原始URL 42 | base_url: 基础URL,用于解析相对路径 43 | 44 | Returns: 45 | 规范化后的URL 46 | """ 47 | try: 48 | # 处理双斜杠开头的URL:优先https,或继承base_url协议 49 | if url.startswith('//'): 50 | if base_url: 51 | base_parsed = urlparse(base_url) 52 | scheme = base_parsed.scheme or 'https' 53 | return f'{scheme}:{url}' 54 | return f'https:{url}' 55 | 56 | # 处理相对路径 57 | if base_url and not (url.startswith('http://') or url.startswith('https://')): 58 | return urljoin(base_url, url) 59 | 60 | # 对于纯域名,默认添加https:// 61 | parsed = urlparse(url) 62 | if not parsed.scheme: 63 | return f'https://{url}' 64 | 65 | return url 66 | 67 | except Exception as e: 68 | logger.error(f"规范化URL失败: {url}, 错误: {str(e)}") 69 | return url 70 | 71 | def get_url_type(url: str) -> str: 72 | """ 73 | 获取URL类型 74 | 75 | Args: 76 | url: URL字符串 77 | 78 | Returns: 79 | URL类型 80 | """ 81 | if url.startswith('http://') or url.startswith('https://'): 82 | return 'absolute' 83 | elif url.startswith('//'): 84 | return 'protocol-relative' 85 | elif url.startswith('/'): 86 | return 'root-relative' 87 | else: 88 | return 'relative' 89 | 90 | def check_url_reachability(url: str, timeout: int = 5, headers: Optional[Dict] = None) -> Tuple[bool, Optional[str]]: 91 | """ 92 | 检查URL是否可达 93 | 94 | Args: 95 | url: 要检查的URL 96 | timeout: 超时时间(秒) 97 | headers: 请求头 98 | 99 | Returns: 100 | (是否可达, 状态码或错误信息) 101 | """ 102 | try: 103 | if headers is None: 104 | headers = { 105 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' 106 | } 107 | 108 | response = requests.head(url, timeout=timeout, headers=headers, allow_redirects=True) 109 | return response.status_code < 400, str(response.status_code) 110 | 111 | except requests.exceptions.RequestException as e: 112 | logger.warning(f"URL检查失败: {url}, 错误: {str(e)}") 113 | return False, str(e) 114 | 115 | def validate_url(url: str) -> bool: 116 | """ 117 | 验证URL格式是否有效 118 | 119 | Args: 120 | url: 要验证的URL 121 | 122 | Returns: 123 | URL是否有效 124 | """ 125 | try: 126 | result = urlparse(url) 127 | 128 | # 对于绝对URL,需要有scheme和netloc 129 | if url.startswith('http://') or url.startswith('https://'): 130 | return all([result.scheme, result.netloc]) 131 | 132 | # 对于相对URL,返回True 133 | return True 134 | 135 | except Exception as e: 136 | logger.error(f"URL验证失败: {url}, 错误: {str(e)}") 137 | return False 138 | 139 | def get_domain(url: str) -> Optional[str]: 140 | """ 141 | 从URL中提取域名 142 | 143 | Args: 144 | url: URL字符串 145 | 146 | Returns: 147 | 域名 
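(即 urlparse 解析得到的 netloc,可能包含端口)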
148 | """ 149 | try: 150 | parsed = urlparse(url) 151 | return parsed.netloc 152 | except Exception as e: 153 | logger.error(f"提取域名失败: {url}, 错误: {str(e)}") 154 | return None 155 | 156 | def is_external_link(url: str, base_domain: Optional[str] = None) -> bool: 157 | """ 158 | 判断是否为外部链接 159 | 160 | Args: 161 | url: 要检查的URL 162 | base_domain: 基础域名 163 | 164 | Returns: 165 | 是否为外部链接 166 | """ 167 | url_domain = get_domain(url) 168 | if not url_domain: 169 | return False 170 | if not base_domain: 171 | # 未提供基础域名时,尽量避免误报:只有显式协议的绝对链接视为外部 172 | return url.startswith(('http://', 'https://')) 173 | # 检查是否为同一域名或子域名 174 | # 同域或子域视为内部,其余为外部 175 | return not (url_domain == base_domain or url_domain.endswith(f'.{base_domain}')) 176 | 177 | # 兼容性函数,用于判断字符串是否为URL 178 | def is_url(text: str) -> bool: 179 | """ 180 | 判断字符串是否为URL 181 | 182 | Args: 183 | text: 要检查的文本 184 | 185 | Returns: 186 | 是否为URL 187 | """ 188 | try: 189 | # 首先检查是否为本地文件,如果是,直接返回False 190 | if os.path.isfile(text) or os.path.isdir(text): 191 | logger.debug(f"{text} 是本地文件或目录,不视为URL") 192 | return False 193 | 194 | # 检查是否以http://或https://开头 195 | if text.startswith(('http://', 'https://')): 196 | return True 197 | 198 | # 过滤典型代码符号,避免误判为URL 199 | if re.search(r"^(document|window|parent|this)\.[A-Za-z_]", text): 200 | return False 201 | if re.search(r"^[A-Za-z_][A-Za-z0-9_]*\s*\(", text): 202 | if not re.search(r"https?://", text): 203 | # 若函数调用前缀,但内容中存在引号包裹的URL片段,视为URL 204 | quoted = re.findall(r'"([^"]+)"|\'([^\']+)' , text) 205 | candidates = [q[0] or q[1] for q in quoted] 206 | if not any((p.search(seg) for seg in URL_PATTERNS for seg in candidates)): 207 | return False 208 | 209 | # 检查是否通过URL格式验证 210 | if not validate_url(text): 211 | return False 212 | 213 | # 检查是否匹配至少一个URL模式 214 | for pattern in URL_PATTERNS: 215 | if pattern.search(text): 216 | return True 217 | 218 | return False 219 | except Exception as e: 220 | logger.error(f"URL检查失败: {text}, 错误: {str(e)}") 221 | return False 222 | 223 | # 兼容性函数,validate_url的别名 224 | def is_valid_url(url: str) -> bool: 225 | """ 226 | 验证URL格式是否有效(validate_url的别名) 227 | 228 | Args: 229 | url: 要验证的URL 230 | 231 | Returns: 232 | URL是否有效 233 | """ 234 | return validate_url(url) 235 | 236 | def get_url_context(text: str, position: Tuple[int, int], context_length: int = 50) -> str: 237 | """ 238 | 获取URL在文本中的上下文 239 | 240 | Args: 241 | text: 原始文本 242 | position: URL在文本中的位置 (start, end) 243 | context_length: 上下文长度 244 | 245 | Returns: 246 | 包含上下文的文本 247 | """ 248 | start_pos, end_pos = position 249 | 250 | # 计算上下文的起始和结束位置 251 | context_start = max(0, start_pos - context_length) 252 | context_end = min(len(text), end_pos + context_length) 253 | 254 | # 提取上下文 255 | context = text[context_start:context_end] 256 | 257 | # 添加省略号 258 | prefix = '...' if context_start > 0 else '' 259 | suffix = '...' 
if context_end < len(text) else '' 260 | 261 | return f"{prefix}{context}{suffix}" 262 | 263 | def build_request_session(proxy: Optional[str] = None, timeout: int = 10) -> requests.Session: 264 | """ 265 | 构建请求会话 266 | 267 | Args: 268 | proxy: 代理设置 269 | timeout: 超时时间 270 | 271 | Returns: 272 | 请求会话对象 273 | """ 274 | session = requests.Session() 275 | 276 | # 设置默认请求头 277 | session.headers.update({ 278 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36', 279 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 280 | 'Accept-Language': 'zh-CN,zh;q=0.9', 281 | }) 282 | 283 | # 设置代理 284 | if proxy: 285 | proxies = { 286 | 'http': proxy, 287 | 'https': proxy 288 | } 289 | session.proxies.update(proxies) 290 | logger.info(f"设置代理: {proxy}") 291 | 292 | # 配置HTTPS适配器,启用兼容旧式TLS重协商 293 | class TLSAdapter(requests.adapters.HTTPAdapter): 294 | def init_poolmanager(self, *args, **kwargs): 295 | ctx = ssl.create_default_context() 296 | try: 297 | ctx.options |= getattr(ssl, 'OP_LEGACY_SERVER_CONNECT', 0) 298 | except Exception: 299 | pass 300 | kwargs['ssl_context'] = ctx 301 | return super().init_poolmanager(*args, **kwargs) 302 | def proxy_manager_for(self, *args, **kwargs): 303 | ctx = ssl.create_default_context() 304 | try: 305 | ctx.options |= getattr(ssl, 'OP_LEGACY_SERVER_CONNECT', 0) 306 | except Exception: 307 | pass 308 | kwargs['ssl_context'] = ctx 309 | return super().proxy_manager_for(*args, **kwargs) 310 | try: 311 | session.mount('https://', TLSAdapter(max_retries=3)) 312 | except Exception: 313 | pass 314 | # 超时需在请求时传递 315 | 316 | return session 317 | 318 | def fetch_url_content(url: str, session: Optional[requests.Session] = None, **kwargs) -> Optional[Tuple[str, dict]]: 319 | """ 320 | 获取URL内容或本地文件内容 321 | 322 | Args: 323 | url: 要获取的URL或本地文件路径 324 | session: 请求会话对象 325 | **kwargs: 其他请求参数 326 | 327 | Returns: 328 | 元组 (内容字符串, 头部信息字典),失败时返回None 329 | """ 330 | try: 331 | # 检查是否为本地文件路径 332 | if not url.startswith(('http://', 'https://')): 333 | # 尝试作为本地文件读取 334 | if os.path.isfile(url): 335 | logger.info(f"读取本地文件: {url}") 336 | with open(url, 'r', encoding='utf-8') as f: 337 | content = f.read() 338 | # 返回内容和模拟的头部信息 339 | return content, {'Content-Type': 'text/html'} 340 | else: 341 | logger.error(f"本地文件不存在: {url}") 342 | return None 343 | 344 | # 添加标准浏览器请求头以避免被反爬机制拦截 345 | default_headers = { 346 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', 347 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8', 348 | 'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2', 349 | 'Accept-Encoding': 'gzip, deflate, br', 350 | 'Connection': 'keep-alive', 351 | 'Upgrade-Insecure-Requests': '1', 352 | 'Cache-Control': 'max-age=0' 353 | } 354 | 355 | # 合并默认请求头和传入的请求头 356 | headers = default_headers.copy() 357 | if 'headers' in kwargs: 358 | headers.update(kwargs['headers']) 359 | kwargs['headers'] = headers 360 | 361 | # 增加重试机制 362 | timeout = kwargs.get('timeout', 10) 363 | if session: 364 | response = session.get(url, timeout=timeout, **kwargs) 365 | else: 366 | # 创建临时会话以设置重试策略 367 | temp_session = requests.Session() 368 | adapter = requests.adapters.HTTPAdapter(max_retries=3) 369 | temp_session.mount('http://', adapter) 370 | try: 371 | class TLSAdapter(requests.adapters.HTTPAdapter): 372 | def init_poolmanager(self, *args, 
**kwargs): 373 | ctx = ssl.create_default_context() 374 | try: 375 | ctx.options |= getattr(ssl, 'OP_LEGACY_SERVER_CONNECT', 0) 376 | except Exception: 377 | pass 378 | kwargs['ssl_context'] = ctx 379 | return super().init_poolmanager(*args, **kwargs) 380 | temp_session.mount('https://', TLSAdapter(max_retries=3)) 381 | except Exception: 382 | temp_session.mount('https://', adapter) 383 | response = temp_session.get(url, timeout=timeout, **kwargs) 384 | 385 | response.raise_for_status() 386 | 387 | # 尝试自动检测编码,并在失败时回退到原始字节解码 388 | enc = response.apparent_encoding or response.encoding or 'utf-8' 389 | try: 390 | response.encoding = enc 391 | text = response.text 392 | except Exception: 393 | try: 394 | text = response.content.decode(enc, errors='replace') 395 | except Exception: 396 | text = response.content.decode('utf-8', errors='replace') 397 | 398 | return text, dict(response.headers) 399 | 400 | except requests.exceptions.RequestException as e: 401 | logger.error(f"获取URL内容失败: {url}, 错误: {str(e)}") 402 | return None 403 | except Exception as e: 404 | logger.error(f"读取内容失败: {url}, 错误: {str(e)}") 405 | return None 406 | 407 | # 兼容性函数,为了支持html_detector.py中的导入 408 | def extract_domain(url: str) -> Optional[str]: 409 | """ 410 | 从URL中提取域名(get_domain的别名) 411 | 412 | Args: 413 | url: URL字符串 414 | 415 | Returns: 416 | 域名 417 | """ 418 | return get_domain(url) 419 | 420 | def analyze_url_risk(url: str) -> Dict[str, Any]: 421 | """ 422 | 评估URL风险等级 423 | Returns: {risk_level: int, reason: str} 424 | """ 425 | try: 426 | risk = 0 427 | reasons = [] 428 | parsed = urlparse(url) 429 | scheme = parsed.scheme.lower() 430 | domain = parsed.netloc.lower() 431 | # 协议风险 432 | if scheme == 'javascript': 433 | risk += 5 434 | reasons.append('JavaScript协议') 435 | elif scheme == 'data': 436 | risk += 4 437 | reasons.append('Data URI') 438 | elif scheme in ('http', 'https'): 439 | risk += 1 440 | # 端口风险 441 | if parsed.port and parsed.port not in [80, 443, 8080, 8443]: 442 | risk += 2 443 | reasons.append('非标准端口') 444 | # 可疑后缀与短链服务 445 | suspicious_tlds = ['pro', 'xyz', 'pw', 'top', 'loan', 'win', 'bid', 'online'] 446 | short_link_domains = ['bit.ly', 'goo.gl', 'tinyurl.com', 't.co', 'ow.ly', 'is.gd', 'adf.ly'] 447 | if any(domain.endswith('.' 
+ tld) for tld in suspicious_tlds): 448 | risk += 2 449 | reasons.append('高风险域名后缀') 450 | if any(domain.endswith(sl) or domain == sl for sl in short_link_domains): 451 | risk += 3 452 | reasons.append('短链接服务') 453 | # 路径随机性 454 | if re.search(r'/[a-zA-Z0-9]{8,}\.(?:js|php)$', parsed.path): 455 | risk += 1 456 | reasons.append('可疑随机路径') 457 | return {'risk_level': min(risk, 10), 'reason': ', '.join(reasons) or '普通URL'} 458 | except Exception as e: 459 | logger.error(f"URL风险评估失败: {url}, 错误: {str(e)}") 460 | return {'risk_level': 0, 'reason': '评估失败'} 461 | 462 | def extract_urls(text: str, context_type: Optional[str] = None) -> List[Dict[str, Any]]: 463 | """ 464 | 从文本中提取所有URL 465 | 466 | Args: 467 | text: 要提取URL的文本 468 | 469 | Returns: 470 | 包含URL和上下文的字典列表 471 | """ 472 | results = [] 473 | urls_set = set() # 用于去重 474 | 475 | # 增加URL模式匹配 476 | url_patterns = [ 477 | re.compile(r'(https?://[\w._~:/?#[\]@!$&\'()*+,-;=]+)', re.IGNORECASE), 478 | re.compile(r'(/[-\w./?%&=]+)', re.IGNORECASE), 479 | re.compile(r'([a-zA-Z0-9][a-zA-Z0-9-]{0,61}[a-zA-Z0-9]\.[a-zA-Z]{2,}(?:/[^\s<>"]*)?)', re.IGNORECASE), 480 | re.compile(r'(javascript:[\w./?%&=;(),\'"`-]+)', re.IGNORECASE), 481 | re.compile(r'(data:[^;]+;base64,[^\s<>"]+)', re.IGNORECASE), 482 | ] 483 | 484 | logger.info(f"开始提取URL,文本长度: {len(text)}") 485 | 486 | for i, pattern in enumerate(url_patterns): 487 | matches = pattern.finditer(text) 488 | match_count = 0 489 | 490 | for match in matches: 491 | match_count += 1 492 | url = match.group(1) 493 | start = max(0, match.start() - 50) 494 | end = min(len(text), match.end() + 50) 495 | context = text[start:end] 496 | 497 | # 清理URL 498 | url = url.strip('"\'') 499 | 500 | # 跳过空URL 501 | if not url or len(url) < 3: 502 | continue 503 | 504 | # 跳过纯数字或不包含有效字符的URL 505 | if re.match(r'^\d+$', url): 506 | continue 507 | 508 | # 基本过滤:非http且非根相对路径、非伪协议时需校验TLD 509 | if not url.lower().startswith(('http://','https://','javascript:','data:')) and not url.startswith('/'): 510 | domain_part = url.split('/', 1)[0] 511 | tld = domain_part.rsplit('.', 1)[-1].lower() if '.' 
in domain_part else '' 512 | allowed_tlds = { 513 | 'com','org','net','cn','cc','io','me','xyz','tk','ga','gq','ml','cf','edu','gov','mil','biz','info' 514 | } 515 | if tld not in allowed_tlds: 516 | continue 517 | # 去重 518 | if url not in urls_set: 519 | urls_set.add(url) 520 | results.append({ 521 | 'url': url, 522 | 'context': context, 523 | 'position': (match.start(), match.end()), 524 | 'context_type': context_type or 'unknown' 525 | }) 526 | 527 | logger.debug(f"模式 {i} 匹配到 {match_count} 个URL") 528 | 529 | logger.debug(f"共提取到 {len(results)} 个唯一URL") 530 | return results 531 | EXTRA_PATTERNS = [ 532 | # 扩展的HTTP/HTTPS URL 533 | re.compile(r'https?://[\w\-\.]+(?:\.[\w\-]+)+[\w\-\._~:/?#[\]@!\$&\'\(\)\*\+,;=.]+', re.IGNORECASE), 534 | # 没有协议的域名 535 | re.compile(r'\b[\w\-\.]+(?:\.[\w\-]+)+\b(?::\d{1,5})?/[\w\-\._~:/?#[\]@!\$&\'\(\)\*\+,;=.]*', re.IGNORECASE), 536 | # JavaScript伪协议 537 | re.compile(r'javascript:[^\s"\'>]+', re.IGNORECASE), 538 | # data URI 539 | re.compile(r'data:[^;]+;base64,[^\s"\'>]+', re.IGNORECASE), 540 | # 相对路径 541 | re.compile(r'\/[^\s"\'>]+', re.IGNORECASE), 542 | ] 543 | # 模式去重:基于正则字符串与flags,避免重复匹配与性能开销 544 | _unique_patterns = [] 545 | _seen = set() 546 | for _pat in URL_PATTERNS: 547 | _key = (_pat.pattern, _pat.flags) 548 | if _key not in _seen: 549 | _seen.add(_key) 550 | _unique_patterns.append(_pat) 551 | URL_PATTERNS = _unique_patterns 552 | -------------------------------------------------------------------------------- /core/detector/headless_browser_detector.py: -------------------------------------------------------------------------------- 1 | """无头浏览器检测器模块 2 | 3 | 用于通过Chrome无头浏览器检测动态生成的暗链和隐藏内容。 4 | 支持检测JavaScript动态生成的内容、DOM操作、iframe内容等。 5 | """ 6 | import logging 7 | from typing import List, Dict, Any 8 | from core.config import Config 9 | 10 | class HeadlessBrowserDetector: 11 | """无头浏览器检测器类""" 12 | 13 | def __init__(self, config: Config): 14 | """初始化无头浏览器检测器 15 | 16 | Args: 17 | config: 配置对象 18 | """ 19 | self.config = config 20 | self.logger = logging.getLogger(__name__) 21 | self.driver = None 22 | self._initialize_driver() 23 | 24 | def _initialize_driver(self): 25 | """初始化Chrome无头浏览器驱动""" 26 | try: 27 | # 动态导入,避免在不使用时产生依赖问题 28 | from selenium import webdriver 29 | from selenium.webdriver.chrome.options import Options 30 | from selenium.webdriver.chrome.service import Service 31 | import os 32 | driver_path = getattr(self.config, 'headless_driver_path', None) 33 | binary_path = getattr(self.config, 'headless_binary', None) 34 | 35 | # 创建Chrome选项 36 | chrome_options = Options() 37 | if binary_path: 38 | chrome_options.binary_location = binary_path 39 | chrome_options.add_argument('--headless') # 无头模式 40 | chrome_options.add_argument('--disable-gpu') # 禁用GPU加速 41 | chrome_options.add_argument('--no-sandbox') # 禁用沙箱 42 | chrome_options.add_argument('--disable-dev-shm-usage') # 解决内存问题 43 | chrome_options.add_argument('--window-size=1920,1080') # 设置窗口大小 44 | chrome_options.add_argument('--log-level=3') # 减少日志输出 45 | 46 | # 选择驱动来源:优先本地路径;否则在允许时自动下载 47 | if driver_path and os.path.exists(driver_path): 48 | service = Service(driver_path) 49 | else: 50 | if getattr(self.config, 'headless_auto_download', False): 51 | from webdriver_manager.chrome import ChromeDriverManager 52 | service = Service(ChromeDriverManager().install()) 53 | else: 54 | self.logger.error("未提供本地驱动路径且未启用自动下载,跳过无头浏览器初始化") 55 | return 56 | 57 | # 创建浏览器驱动 58 | self.driver = webdriver.Chrome(service=service, options=chrome_options) 59 | 60 | # 设置超时时间 61 | 
self.driver.set_page_load_timeout(self.config.headless_timeout) 62 | self.driver.set_script_timeout(self.config.headless_timeout) 63 | 64 | self.logger.info("Chrome无头浏览器初始化成功") 65 | 66 | except ImportError as e: 67 | self.logger.error(f"缺少无头浏览器相关依赖: {str(e)}") 68 | self.logger.error("请安装依赖: pip install selenium webdriver-manager") 69 | except Exception as e: 70 | self.logger.error(f"无头浏览器初始化失败: {str(e)}") 71 | 72 | def close(self): 73 | """释放浏览器驱动资源""" 74 | try: 75 | if self.driver: 76 | self.driver.quit() 77 | self.driver = None 78 | self.logger.info("已释放无头浏览器驱动") 79 | except Exception as e: 80 | self.logger.error(f"释放无头浏览器驱动失败: {str(e)}") 81 | 82 | def __del__(self): 83 | try: 84 | self.close() 85 | except Exception: 86 | pass 87 | 88 | def detect(self, url: str, content: str = None) -> List[Dict[str, Any]]: 89 | """使用无头浏览器检测暗链 90 | 91 | Args: 92 | url: 要检测的URL 93 | content: 可选,页面内容(如果已获取) 94 | 95 | Returns: 96 | 检测结果列表 97 | """ 98 | results = [] 99 | 100 | if not self.driver: 101 | self.logger.error("无头浏览器未初始化,跳过检测") 102 | return results 103 | 104 | try: 105 | from selenium.webdriver.support.ui import WebDriverWait 106 | # 加载页面 107 | self.logger.info(f"无头浏览器正在加载页面: {url}") 108 | self.driver.get(url) 109 | 110 | # 等待JavaScript执行完成 111 | try: 112 | WebDriverWait(self.driver, self.config.js_wait_time).until( 113 | lambda d: d.execute_script("return document.readyState") in ("complete", "interactive") 114 | ) 115 | except Exception: 116 | pass 117 | self.logger.info(f"等待页面加载/JS执行完成 (<= {self.config.js_wait_time}秒)") 118 | 119 | # 执行各项检测 120 | self.logger.info("开始执行动态链接检测") 121 | dynamic_links = self._detect_dynamic_links() 122 | results.extend(dynamic_links) 123 | 124 | self.logger.info("开始执行DOM操作检测") 125 | dom_operations = self._detect_dom_manipulations() 126 | results.extend(dom_operations) 127 | 128 | self.logger.info("开始执行iframe内容检测") 129 | iframe_content = self._detect_iframe_content() 130 | results.extend(iframe_content) 131 | 132 | self.logger.info("开始执行隐藏元素检测") 133 | hidden_elements = self._detect_hidden_elements() 134 | results.extend(hidden_elements) 135 | 136 | self.logger.info(f"无头浏览器检测完成,发现 {len(results)} 个可疑项") 137 | 138 | except Exception as e: 139 | self.logger.error(f"无头浏览器检测过程中出错: {str(e)}") 140 | 141 | return results 142 | 143 | def _detect_dynamic_links(self) -> List[Dict[str, Any]]: 144 | """检测动态生成的链接 145 | 146 | Returns: 147 | 检测到的可疑链接列表 148 | """ 149 | results = [] 150 | 151 | try: 152 | from selenium.webdriver.common.by import By 153 | # 获取所有链接元素 154 | links = self.driver.find_elements(By.TAG_NAME, 'a') 155 | self.logger.info(f"发现 {len(links)} 个链接元素") 156 | 157 | for link in links: 158 | try: 159 | href = link.get_attribute('href') 160 | if href: 161 | # 分析链接风险(使用现有工具类) 162 | from utils.network_utils import analyze_url_risk 163 | risk_info = analyze_url_risk(href) 164 | 165 | if risk_info['risk_level'] > 0: 166 | text = link.text.strip()[:100] # 限制文本长度 167 | results.append({ 168 | 'type': 'suspicious_url', 169 | 'url': href, 170 | 'risk_level': risk_info['risk_level'], 171 | 'context': f"动态生成链接: {text}", 172 | 'detection_method': 'headless_browser', 173 | 'element': 'a', 174 | 'risk_reason': risk_info.get('reason', '未知风险') 175 | }) 176 | except Exception as e: 177 | self.logger.error(f"分析动态链接时出错: {str(e)}") 178 | except Exception as e: 179 | self.logger.error(f"获取链接元素时出错: {str(e)}") 180 | 181 | return results 182 | 183 | def _detect_dom_manipulations(self) -> List[Dict[str, Any]]: 184 | """检测可疑的DOM操作 185 | 186 | Returns: 187 | 检测到的可疑DOM操作列表 188 | """ 189 | results = 
[] 190 | 191 | # 注入JavaScript以检测可疑的DOM操作 192 | monitor_script = r""" 193 | (function() { 194 | const suspiciousPatterns = []; 195 | 196 | // 初始化正则表达式 197 | const eval_pattern = /eval[\s]*\(/; 198 | const doc_write_pattern = /document\.write[\s]*\(/; 199 | const innerhtml_pattern = /innerHTML[\s]*=/; 200 | const base64_pattern = /base64/i; 201 | const fromCharCode_pattern = /fromCharCode/; 202 | const escape_pattern = /escape[\s]*\(/; 203 | const unescape_pattern = /unescape[\s]*\(/; 204 | 205 | // 检测可疑的JavaScript代码模式 206 | const scriptElements = document.querySelectorAll('script'); 207 | scriptElements.forEach(script => { 208 | if (script.textContent) { 209 | const content = script.textContent; 210 | if (eval_pattern.test(content) || 211 | doc_write_pattern.test(content) || 212 | innerhtml_pattern.test(content) || 213 | base64_pattern.test(content) || 214 | fromCharCode_pattern.test(content) || 215 | escape_pattern.test(content) || 216 | unescape_pattern.test(content)) { 217 | suspiciousPatterns.push({ 218 | type: 'suspicious_script', 219 | content: content.substring(0, 200) + '...', 220 | lineCount: content.split('\n').length 221 | }); 222 | } 223 | } 224 | }); 225 | 226 | // 检测动态创建的元素 227 | const dynamicElements = []; 228 | document.querySelectorAll('*').forEach(element => { 229 | if (element.tagName === 'SCRIPT' && element.getAttribute('src') === null && 230 | element.textContent.length > 50) { 231 | dynamicElements.push({tag: element.tagName, type: 'inline_script'}); 232 | } 233 | if (element.tagName === 'IFRAME') { 234 | dynamicElements.push({tag: element.tagName, src: element.getAttribute('src')}); 235 | } 236 | }); 237 | 238 | return {suspiciousPatterns, dynamicElements}; 239 | })(); 240 | """ 241 | 242 | try: 243 | result = self.driver.execute_script(monitor_script) 244 | 245 | # 分析可疑脚本模式 246 | for pattern in result['suspiciousPatterns']: 247 | risk_level = 8 # 较高风险 248 | results.append({ 249 | 'type': 'suspicious_dom_operation', 250 | 'technique': pattern['type'], 251 | 'risk_level': risk_level, 252 | 'context': f"检测到可疑脚本模式: {pattern['content']}", 253 | 'detection_method': 'headless_browser', 254 | 'risk_reason': '包含可疑JavaScript操作函数' 255 | }) 256 | 257 | # 分析动态创建的元素 258 | for element in result['dynamicElements']: 259 | if element['tag'] == 'IFRAME' and element.get('src'): 260 | from utils.network_utils import analyze_url_risk 261 | risk_info = analyze_url_risk(element['src']) 262 | if risk_info['risk_level'] > 0: 263 | results.append({ 264 | 'type': 'suspicious_iframe', 265 | 'url': element['src'], 266 | 'risk_level': risk_info['risk_level'], 267 | 'context': f"动态创建的iframe", 268 | 'detection_method': 'headless_browser', 269 | 'risk_reason': risk_info.get('reason', '可疑iframe') 270 | }) 271 | except Exception as e: 272 | self.logger.error(f"检测DOM操作时出错: {str(e)}") 273 | 274 | return results 275 | 276 | def _detect_iframe_content(self) -> List[Dict[str, Any]]: 277 | """检测iframe中的内容 278 | 279 | Returns: 280 | 检测到的iframe中的可疑内容列表 281 | """ 282 | results = [] 283 | 284 | try: 285 | from selenium.webdriver.common.by import By 286 | # 获取所有iframe 287 | iframes = self.driver.find_elements(By.TAG_NAME, 'iframe') 288 | self.logger.info(f"发现 {len(iframes)} 个iframe元素") 289 | 290 | for index, iframe in enumerate(iframes): 291 | try: 292 | iframe_src = iframe.get_attribute('src') 293 | self.logger.info(f"处理iframe {index + 1}/{len(iframes)}: {iframe_src or '无src属性'}") 294 | 295 | # 分析iframe的src属性 296 | if iframe_src: 297 | from utils.network_utils import analyze_url_risk 298 | risk_info = 
analyze_url_risk(iframe_src) 299 | 300 | if risk_info['risk_level'] > 0: 301 | results.append({ 302 | 'type': 'suspicious_iframe', 303 | 'url': iframe_src, 304 | 'risk_level': risk_info['risk_level'], 305 | 'context': f"iframe中的可疑链接", 306 | 'detection_method': 'headless_browser', 307 | 'risk_reason': risk_info.get('reason', '可疑iframe源') 308 | }) 309 | 310 | # 尝试切换到iframe上下文分析内容 311 | try: 312 | self.driver.switch_to.frame(iframe) 313 | 314 | # 获取iframe中的链接 315 | iframe_links = self.driver.find_elements(By.TAG_NAME, 'a') 316 | for link in iframe_links: 317 | href = link.get_attribute('href') 318 | if href: 319 | from utils.network_utils import analyze_url_risk 320 | risk_info = analyze_url_risk(href) 321 | 322 | if risk_info['risk_level'] > 0: 323 | results.append({ 324 | 'type': 'suspicious_url', 325 | 'url': href, 326 | 'risk_level': risk_info['risk_level'], 327 | 'context': f"iframe内部的可疑链接", 328 | 'detection_method': 'headless_browser', 329 | 'risk_reason': risk_info.get('reason', 'iframe内部链接风险') 330 | }) 331 | except Exception as iframe_e: 332 | self.logger.error(f"分析iframe内容时出错: {str(iframe_e)}") 333 | finally: 334 | # 确保切回主文档 335 | self.driver.switch_to.default_content() 336 | 337 | except Exception as e: 338 | self.logger.error(f"处理iframe时出错: {str(e)}") 339 | 340 | except Exception as e: 341 | self.logger.error(f"获取iframe元素时出错: {str(e)}") 342 | 343 | return results 344 | 345 | def _detect_hidden_elements(self) -> List[Dict[str, Any]]: 346 | """检测视觉上隐藏的元素 347 | 348 | Returns: 349 | 检测到的隐藏元素列表 350 | """ 351 | results = [] 352 | 353 | # 注入JavaScript获取隐藏元素 354 | hidden_elements_script = """ 355 | (function() { 356 | const hiddenElements = []; 357 | 358 | // 获取所有元素 359 | const allElements = document.querySelectorAll('*'); 360 | 361 | allElements.forEach(element => { 362 | const style = window.getComputedStyle(element); 363 | const rect = element.getBoundingClientRect(); 364 | 365 | // 检查各种隐藏技术 366 | const isHidden = 367 | style.display === 'none' || 368 | style.visibility === 'hidden' || 369 | style.opacity === '0' || 370 | rect.width <= 1 || 371 | rect.height <= 1 || 372 | parseInt(style.fontSize) <= 0 || 373 | element.offsetParent === null; 374 | 375 | // 检查绝对定位隐藏 376 | const isAbsPosHidden = 377 | style.position === 'absolute' && 378 | (parseInt(style.left) < -1000 || parseInt(style.top) < -1000 || 379 | parseInt(style.right) < -1000 || parseInt(style.bottom) < -1000); 380 | 381 | // 检查文本颜色与背景色相同 382 | const textColor = style.color; 383 | const bgColor = style.backgroundColor || style.background; 384 | const isSameColor = textColor === bgColor && textColor !== 'rgba(0, 0, 0, 0)'; 385 | 386 | // 检查是否包含链接或文本 387 | const hasLinks = element.querySelector('a') !== null; 388 | const hasText = element.textContent.trim().length > 0; 389 | const hasContent = hasLinks || hasText; 390 | 391 | if ((isHidden || isAbsPosHidden || isSameColor) && hasContent) { 392 | // 获取元素中的链接(如果有) 393 | const links = []; 394 | if (hasLinks) { 395 | const linkElements = element.querySelectorAll('a'); 396 | linkElements.forEach(link => { 397 | const href = link.getAttribute('href'); 398 | if (href) links.push(href); 399 | }); 400 | } 401 | 402 | hiddenElements.push({ 403 | tagName: element.tagName, 404 | id: element.id || '无ID', 405 | classes: element.className || '无类名', 406 | hiddenBy: isSameColor ? 'color_matching' : 407 | isAbsPosHidden ? 
'absolute_position' : 'visibility', 408 | content: element.textContent.trim().substring(0, 200) + '...', 409 | hasLinks: hasLinks, 410 | links: links, 411 | textColor: textColor, 412 | bgColor: bgColor 413 | }); 414 | } 415 | }); 416 | 417 | return hiddenElements; 418 | })(); 419 | """ 420 | 421 | try: 422 | hidden_elements = self.driver.execute_script(hidden_elements_script) 423 | self.logger.info(f"发现 {len(hidden_elements)} 个隐藏元素") 424 | 425 | for elem in hidden_elements: 426 | # 计算风险等级 427 | risk_level = 8 if elem['hasLinks'] else 6 428 | 429 | # 构建风险描述 430 | context = f"隐藏元素 ({elem['tagName']}): {elem['content']}" 431 | if elem['hasLinks']: 432 | context += f" 包含 {len(elem['links'])} 个链接" 433 | 434 | result_item = { 435 | 'type': 'hidden_element', 436 | 'technique': elem['hiddenBy'], 437 | 'risk_level': risk_level, 438 | 'context': context, 439 | 'detection_method': 'headless_browser', 440 | 'risk_reason': '视觉上隐藏的元素可能包含暗链' 441 | } 442 | 443 | # 如果有链接,添加链接信息 444 | if elem['hasLinks'] and elem['links']: 445 | result_item['hidden_links'] = elem['links'] 446 | 447 | results.append(result_item) 448 | 449 | # 对于包含链接的隐藏元素,分别记录每个链接 450 | if elem['hasLinks'] and elem['links']: 451 | for link in elem['links']: 452 | from utils.network_utils import analyze_url_risk 453 | risk_info = analyze_url_risk(link) 454 | results.append({ 455 | 'type': 'suspicious_url', 456 | 'url': link, 457 | 'risk_level': max(risk_level, risk_info['risk_level']), 458 | 'context': f"隐藏元素中的链接: {link}", 459 | 'detection_method': 'headless_browser', 460 | 'risk_reason': f"隐藏在{elem['hiddenBy']}类型的{elem['tagName']}元素中" 461 | }) 462 | 463 | except Exception as e: 464 | self.logger.error(f"检测隐藏元素时出错: {str(e)}") 465 | 466 | return results 467 | 468 | def close(self): 469 | """关闭无头浏览器驱动 470 | 471 | 清理资源,避免内存泄漏 472 | """ 473 | if self.driver: 474 | try: 475 | self.driver.quit() 476 | self.logger.info("无头浏览器已关闭") 477 | except Exception as e: 478 | self.logger.error(f"关闭无头浏览器时出错: {str(e)}") 479 | finally: 480 | self.driver = None 481 | 482 | def __del__(self): 483 | """析构函数,确保资源被释放""" 484 | self.close() 485 | -------------------------------------------------------------------------------- /core/detector/html_detector.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | HTML检测器模块 5 | """ 6 | 7 | import re 8 | from typing import List, Dict, Any 9 | from urllib.parse import urlparse 10 | 11 | from utils.html_utils import ( 12 | extract_script_tags, 13 | extract_link_tags, 14 | extract_meta_tags, 15 | extract_iframe_tags, 16 | find_hidden_elements, 17 | get_dom_structure, 18 | extract_comments 19 | ) 20 | from utils.network_utils import ( 21 | extract_urls, 22 | is_external_link, 23 | extract_domain 24 | ) 25 | from utils.common_utils import ( 26 | extract_text_between_markers, 27 | get_context 28 | ) 29 | 30 | class HTMLDetector: 31 | """ 32 | HTML内容检测器,用于检测HTML文件中的可疑链接和隐藏元素 33 | """ 34 | 35 | def __init__(self, config): 36 | """ 37 | 初始化HTML检测器 38 | 39 | Args: 40 | config: 扫描配置对象 41 | """ 42 | self.config = config 43 | self.logger = config.logger 44 | 45 | # 可疑HTML模式 46 | self.suspicious_patterns = { 47 | 'suspicious_attributes': re.compile(r'\bon\w+\s*=\s*["\']?javascript:', re.IGNORECASE), 48 | 'eval_inline': re.compile(r'\beval\s*\(', re.IGNORECASE), 49 | 'document_write': re.compile(r'\bdocument\.write\s*\(', re.IGNORECASE), 50 | 'base64_decode': re.compile(r'\batob\s*\(|\bfromCharCode\s*\(', re.IGNORECASE), 51 | 'data_uri': 
re.compile(r'data:[^;]+;base64,', re.IGNORECASE), 52 | 'remote_iframe': re.compile(r']+src=["\']?https?://', re.IGNORECASE), 53 | 'hidden_divs': re.compile(r'<(div|span|p|section|article)[^>]+style=["\'][^"\']*(display\s*:\s*none|visibility\s*:\s*hidden)[^"\']*["\']', re.IGNORECASE), 54 | 'obfuscated_attributes': re.compile(r'\b(data-|on)[a-z0-9_-]+\s*=\s*["\']?[^"\']*(\\\\x[0-9a-f]{2}|\\\\u[0-9a-f]{4})[^"\']*["\']?', re.IGNORECASE), 55 | } 56 | 57 | # 可疑域名模式 58 | self.suspicious_domain_patterns = [ 59 | re.compile(r'\b(?:[a-z0-9](?:[a-z0-9-]{0,61}[a-z0-9])?\.)+\b(?:cn|cc|tk|ml|ga|cf|pro|xyz|pw|top|loan|win|bid|online)\b', re.IGNORECASE), 60 | re.compile(r'\b(?:[a-z0-9](?:[a-z0-9-]{0,61}[a-z0-9])?\.)+\b(?:bet|casino|poker|gamble)\b', re.IGNORECASE), 61 | re.compile(r'\b(?:[a-z0-9]{8,}\.)+\b(?:[a-z]{2,})\b', re.IGNORECASE), # 检测8个字符以上的随机域名前缀 62 | ] 63 | 64 | def detect(self, file_path: str, content: str) -> List[Dict[str, Any]]: 65 | """ 66 | 检测HTML内容中的可疑元素 67 | 68 | Args: 69 | file_path: 文件路径或URL 70 | content: HTML内容 71 | 72 | Returns: 73 | 检测结果列表 74 | """ 75 | results = [] 76 | 77 | try: 78 | # 1. 检测可疑URL 79 | url_results = self._detect_suspicious_urls(file_path, content) 80 | results.extend(url_results) 81 | 82 | # 2. 检测可疑模式 83 | pattern_results = self._detect_suspicious_patterns(file_path, content) 84 | results.extend(pattern_results) 85 | 86 | # 3. 检测隐藏元素 87 | hidden_results = self._detect_hidden_elements(file_path, content) 88 | results.extend(hidden_results) 89 | 90 | # 4. 检测可疑注释 91 | comment_results = self._detect_suspicious_comments(file_path, content) 92 | results.extend(comment_results) 93 | 94 | # 5. 检测可疑Meta标签 95 | meta_results = self._detect_suspicious_meta(file_path, content) 96 | results.extend(meta_results) 97 | 98 | except Exception as e: 99 | self.logger.error(f"HTML检测过程中发生错误: {str(e)}", exc_info=True) 100 | 101 | return results 102 | 103 | def _detect_suspicious_urls(self, file_path: str, content: str) -> List[Dict[str, Any]]: 104 | """ 105 | 检测HTML中的可疑URL 106 | 107 | Args: 108 | file_path: 文件路径或URL 109 | content: HTML内容 110 | 111 | Returns: 112 | 可疑URL检测结果 113 | """ 114 | results = [] 115 | 116 | # 提取所有URL 117 | urls = extract_urls(content) 118 | 119 | for url_obj in urls: 120 | url = url_obj['url'] 121 | context = url_obj['context'] 122 | 123 | # 计算风险等级 124 | risk_level, reason = self._calculate_url_risk(url, context, file_path) 125 | 126 | if risk_level > 0: 127 | result = { 128 | 'type': 'suspicious_url', 129 | 'file_path': file_path, 130 | 'url': url, 131 | 'risk_level': risk_level, 132 | 'reason': reason, 133 | 'context': context 134 | } 135 | results.append(result) 136 | 137 | return results 138 | 139 | def _calculate_url_risk(self, url: str, context: str, source: str) -> tuple: 140 | """ 141 | 计算URL的风险等级 142 | 143 | Args: 144 | url: 要评估的URL 145 | context: URL的上下文 146 | 147 | Returns: 148 | (风险等级, 原因) 149 | """ 150 | risk_level = 0 151 | reason = [] 152 | 153 | # 外部/内部链接判断 154 | source_domain = None 155 | try: 156 | if isinstance(source, str) and source.startswith(('http://', 'https://')): 157 | source_domain = extract_domain(source) 158 | except Exception: 159 | source_domain = None 160 | is_abs = url.lower().startswith(('http://', 'https://')) 161 | if is_abs: 162 | # 仅对跨域外部链接提高基础风险 163 | try: 164 | link_domain = extract_domain(url) 165 | except Exception: 166 | link_domain = None 167 | from utils.network_utils import is_external_link 168 | trusted_domains = { 169 | # JS/CSS 通用CDN 170 | 'cdn.jsdelivr.net', 'cdnjs.cloudflare.com', 'code.jquery.com', 
'ajax.googleapis.com', 171 | 'fonts.googleapis.com', 'fonts.gstatic.com', 'unpkg.com', 'www.unpkg.com', 172 | 'lib.baomitu.com', 'cdn.staticfile.org', 'staticfile.org', 'stackpath.bootstrapcdn.com', 173 | 'maxcdn.bootstrapcdn.com', 'bootcss.com', 'cdn.bootcss.com', 'bootcdn.net', 'cdn.bootcdn.net', 174 | # 常见站点/资源域(降低误报) 175 | 'hm.baidu.com', 'www.googletagmanager.com', 'busuanzi.ibruce.info', 176 | 'seccdn.libravatar.org', 'registry.npmmirror.com', 'icp.gov.moe', 177 | 'www.bilibili.com', 'hexo.io' 178 | } 179 | def _is_trusted(domain: str) -> bool: 180 | if not domain: 181 | return False 182 | for td in trusted_domains: 183 | if domain == td or domain.endswith('.' + td): 184 | return True 185 | return False 186 | if _is_trusted(link_domain): 187 | return 0, '可信CDN域名' 188 | elif is_external_link(url, source_domain): 189 | risk_level += 2 190 | reason.append('外部链接') 191 | 192 | # 检测可疑域名后缀(跨域时才计入) 193 | suspicious_tlds = ['pro', 'pw', 'top', 'loan', 'win', 'bid', 'online', 'tk', 'ga', 'gq', 'ml', 'cf'] 194 | parsed_url = urlparse(url) 195 | domain = parsed_url.netloc 196 | if _is_trusted(link_domain): 197 | pass 198 | elif is_external_link(url, source_domain): 199 | for tld in suspicious_tlds: 200 | if domain.endswith('.' + tld): 201 | risk_level += 2 202 | reason.append(f'高风险域名后缀: {tld}') 203 | break 204 | 205 | # 检测短随机字符串域名 206 | domain_parts = domain.split('.') 207 | if len(domain_parts) >= 2 and len(domain_parts[-2]) >= 8 and not any(c.isdigit() for c in domain_parts[-2]): 208 | risk_level += 2 209 | reason.append('可能为随机生成的可疑域名') 210 | 211 | # 检查是否使用了可疑端口 212 | parsed_url = urlparse(url) 213 | if parsed_url.port and parsed_url.port not in [80, 443, 8080, 8443]: 214 | risk_level += 2 215 | reason.append('使用非标准端口') 216 | 217 | # 检查是否包含可疑查询参数 218 | suspicious_params = ['redirect', 'proxy', 'referer', 'origin', 'callback'] 219 | if parsed_url.query: 220 | for param in suspicious_params: 221 | if param in parsed_url.query.lower(): 222 | risk_level += 1 223 | reason.append(f'包含可疑参数: {param}') 224 | break 225 | 226 | # 检查是否使用了短链接服务 227 | short_link_domains = ['bit.ly', 'goo.gl', 'tinyurl.com', 't.co', 'ow.ly', 'is.gd', 'adf.ly'] 228 | domain = extract_domain(url) 229 | if domain in short_link_domains: 230 | risk_level += 3 231 | reason.append('使用短链接服务') 232 | 233 | # 检查是否匹配可疑域名模式 234 | for pattern in self.suspicious_domain_patterns: 235 | if pattern.search(url): 236 | # 同域名不计入可疑域名模式 237 | if is_abs and source_domain and not is_external_link(url, source_domain): 238 | pass 239 | else: 240 | risk_level += 2 241 | reason.append('匹配可疑域名模式') 242 | break 243 | 244 | # 检查上下文是否包含可疑关键词 245 | suspicious_context_keywords = ['hidden', 'display:none', 'visibility:hidden', 'opacity:0'] 246 | for keyword in suspicious_context_keywords: 247 | if keyword.lower() in context.lower(): 248 | risk_level += 2 249 | reason.append('URL位于可疑上下文中') 250 | break 251 | 252 | # 检查是否为JavaScript伪协议 253 | if url.lower().startswith('javascript:'): 254 | risk_level += 4 255 | reason.append('JavaScript伪协议') 256 | 257 | # 对相对路径与同域资源降低风险 258 | if not is_abs: 259 | if url.startswith('/'): 260 | # 同域相对路径,不计风险 261 | return 0, '' 262 | # 非协议/非根路径的文本片段,不计风险 263 | if not url.lower().startswith(('javascript:', 'data:')): 264 | return 0, '' 265 | 266 | return risk_level, ', '.join(reason) 267 | 268 | def _detect_suspicious_patterns(self, file_path: str, content: str) -> List[Dict[str, Any]]: 269 | """ 270 | 检测HTML中的可疑模式 271 | 272 | Args: 273 | file_path: 文件路径或URL 274 | content: HTML内容 275 | 276 | Returns: 277 | 可疑模式检测结果 278 | """ 279 | 
results = [] 280 | 281 | for pattern_name, pattern in self.suspicious_patterns.items(): 282 | for match in pattern.finditer(content): 283 | start_pos = max(0, match.start() - 50) 284 | end_pos = min(len(content), match.end() + 50) 285 | context = get_context(content, match.start(), 50) 286 | 287 | # 计算风险等级 288 | risk_level = self._get_pattern_risk_level(pattern_name) 289 | 290 | result = { 291 | 'type': 'suspicious_pattern', 292 | 'file_path': file_path, 293 | 'pattern': pattern_name, 294 | 'matched_content': match.group(0), 295 | 'risk_level': risk_level, 296 | 'description': self._get_pattern_description(pattern_name), 297 | 'context': context 298 | } 299 | results.append(result) 300 | 301 | # 检测内联脚本 302 | script_tags = extract_script_tags(content) 303 | for script in script_tags: 304 | is_inline = script.get('inline') if 'inline' in script else (not script.get('src') and bool(script.get('content'))) 305 | if is_inline: 306 | script_text = script.get('content', '') or '' 307 | script_length = len(script_text) 308 | # 计算位置,避免缺少start_pos/end_pos导致异常 309 | pos = 0 310 | try: 311 | original = script.get('original_tag', '') or '' 312 | if original: 313 | idx = content.find(original) 314 | if idx >= 0: 315 | pos = idx 316 | else: 317 | # 回退使用脚本内容定位 318 | cidx = content.find(script_text[:50]) if script_text else -1 319 | pos = cidx if cidx >= 0 else 0 320 | else: 321 | # 使用src或部分内容定位 322 | src = script.get('src', '') or '' 323 | if src: 324 | import re as _re 325 | m = _re.search(r']*src=["\']' + _re.escape(src) + r'["\']', content, _re.IGNORECASE) 326 | pos = m.start() if m else 0 327 | else: 328 | cidx = content.find(script_text[:50]) if script_text else -1 329 | pos = cidx if cidx >= 0 else 0 330 | except Exception: 331 | pos = 0 332 | # 检测复杂内联脚本 333 | if script_length > 1000: 334 | context = get_context(content, pos, 100) 335 | result = { 336 | 'type': 'suspicious_pattern', 337 | 'file_path': file_path, 338 | 'pattern': 'large_inline_script', 339 | 'matched_content': script_text[:200] + '...', 340 | 'risk_level': 2, 341 | 'description': '大型内联脚本', 342 | 'context': context 343 | } 344 | results.append(result) 345 | 346 | return results 347 | 348 | def _get_pattern_risk_level(self, pattern_name: str) -> int: 349 | """ 350 | 获取模式的风险等级 351 | 352 | Args: 353 | pattern_name: 模式名称 354 | 355 | Returns: 356 | 风险等级 357 | """ 358 | risk_levels = { 359 | 'suspicious_attributes': 3, 360 | 'eval_inline': 4, 361 | 'document_write': 3, 362 | 'base64_decode': 2, 363 | 'data_uri': 2, 364 | 'remote_iframe': 3, 365 | 'hidden_divs': 2, 366 | 'obfuscated_attributes': 3 367 | } 368 | 369 | return risk_levels.get(pattern_name, 1) 370 | 371 | def _get_pattern_description(self, pattern_name: str) -> str: 372 | """ 373 | 获取模式的描述 374 | 375 | Args: 376 | pattern_name: 模式名称 377 | 378 | Returns: 379 | 描述文本 380 | """ 381 | descriptions = { 382 | 'suspicious_attributes': '可疑的事件属性', 383 | 'eval_inline': '内联eval函数', 384 | 'document_write': 'document.write调用', 385 | 'base64_decode': 'Base64解码操作', 386 | 'data_uri': 'Data URI', 387 | 'remote_iframe': '远程iframe', 388 | 'hidden_divs': '隐藏的div元素', 389 | 'obfuscated_attributes': '混淆的属性' 390 | } 391 | 392 | return descriptions.get(pattern_name, pattern_name) 393 | 394 | def _detect_hidden_elements(self, file_path: str, content: str) -> List[Dict[str, Any]]: 395 | """ 396 | 检测HTML中的隐藏元素 397 | 398 | Args: 399 | file_path: 文件路径或URL 400 | content: HTML内容 401 | 402 | Returns: 403 | 隐藏元素检测结果 404 | """ 405 | results = [] 406 | 407 | hidden_elements = find_hidden_elements(content) 408 | 409 
| for element in hidden_elements: 410 | # 确保元素字典包含必要的键 411 | if not all(key in element for key in ['type', 'method', 'context']): 412 | # 如果缺少必要的键,使用默认值 413 | element_type = element.get('type', 'unknown') 414 | hiding_method = element.get('method', 'unknown') 415 | context = element.get('context', '') 416 | else: 417 | element_type = element['type'] 418 | hiding_method = element['method'] 419 | context = element['context'] 420 | 421 | # 计算风险等级 422 | risk_level = self._calculate_hidden_element_risk(element) 423 | 424 | if risk_level > 0: 425 | result = { 426 | 'type': 'hidden_element', 427 | 'file_path': file_path, 428 | 'element_type': element_type, 429 | 'hiding_method': hiding_method, 430 | 'risk_level': risk_level, 431 | 'context': context, 432 | 'description': f"隐藏的{element_type}元素,使用{hiding_method}技术" 433 | } 434 | results.append(result) 435 | 436 | return results 437 | 438 | def _calculate_hidden_element_risk(self, element: Dict[str, Any]) -> int: 439 | """ 440 | 计算隐藏元素的风险等级 441 | 442 | Args: 443 | element: 隐藏元素信息 444 | 445 | Returns: 446 | 风险等级 447 | """ 448 | # 基础风险 449 | risk_level = 1 450 | 451 | # 根据隐藏方法调整风险,确保'method'键存在 452 | high_risk_methods = ['position:absolute', 'opacity:0', 'clip-path'] 453 | if 'method' in element and any(method in element['method'] for method in high_risk_methods): 454 | risk_level += 1 455 | 456 | # 检查内容长度,如果内容很长,风险更高 457 | if 'context' in element and len(element['context']) > 100: 458 | risk_level += 1 459 | 460 | # 检查是否包含链接 461 | if 'context' in element and ('href=' in element['context'] or 'src=' in element['context']): 462 | risk_level += 2 463 | 464 | return risk_level 465 | 466 | def _detect_suspicious_comments(self, file_path: str, content: str) -> List[Dict[str, Any]]: 467 | """ 468 | 检测HTML中的可疑注释 469 | 470 | Args: 471 | file_path: 文件路径或URL 472 | content: HTML内容 473 | 474 | Returns: 475 | 可疑注释检测结果 476 | """ 477 | results = [] 478 | 479 | comments = extract_comments(content) 480 | 481 | # 可疑注释模式 482 | suspicious_comment_patterns = { 483 | 'hidden_content': re.compile(r'<!--(?:(?!-->)[\s\S])*?(?:password|secret|hidden|private|admin)(?:(?!-->)[\s\S])*?-->'), 484 | 'encoded_content': re.compile(r'<!--(?:(?!-->)[\s\S])*?(?:base64|hex|escape|decodeURI)(?:(?!-->)[\s\S])*?-->'), 485 | 'conditional_comments': re.compile(r'<!--\[if[^\]]*\]>(?:(?!<!\[endif\]-->)[\s\S])*<!\[endif\]-->'), 486 | 'large_comment': re.compile(r'<!--(?:(?!-->)[\s\S]){500,}-->') 487 | } 488 | 489 | for c in comments: 490 | text = c['content'] if isinstance(c, dict) else (c if isinstance(c, str) else '') 491 | if not text: 492 | continue 493 | 494 | for pattern_name, pattern in suspicious_comment_patterns.items(): 495 | if pattern.search(text): 496 | # 计算风险等级 497 | risk_level = self._get_comment_risk_level(pattern_name) 498 | 499 | result = { 500 | 'type': 'suspicious_comment', 501 | 'file_path': file_path, 502 | 'pattern': pattern_name, 503 | 'comment': text[:200] + ('...'
if len(text) > 200 else ''), 504 | 'risk_level': risk_level, 505 | 'description': self._get_comment_description(pattern_name), 506 | 'context': get_context(content, content.find(text), content.find(text) + len(text), 50) 507 | } 508 | results.append(result) 509 | 510 | # 检测注释中的链接 511 | link_pattern = re.compile(r'href=["\'](https?://[^"\']+)') 512 | for c in comments: 513 | text = c['content'] if isinstance(c, dict) else (c if isinstance(c, str) else '') 514 | if not text: 515 | continue 516 | for match in link_pattern.finditer(text): 517 | url = match.group(1) 518 | result = { 519 | 'type': 'suspicious_url', 520 | 'file_path': file_path, 521 | 'url': url, 522 | 'risk_level': 3, 523 | 'reason': '链接位于HTML注释中', 524 | 'context': text[:200] + ('...' if len(text) > 200 else '') 525 | } 526 | results.append(result) 527 | 528 | return results 529 | 530 | def _get_comment_risk_level(self, pattern_name: str) -> int: 531 | """ 532 | 获取注释模式的风险等级 533 | 534 | Args: 535 | pattern_name: 模式名称 536 | 537 | Returns: 538 | 风险等级 539 | """ 540 | risk_levels = { 541 | 'hidden_content': 3, 542 | 'encoded_content': 3, 543 | 'conditional_comments': 1, 544 | 'large_comment': 2 545 | } 546 | 547 | return risk_levels.get(pattern_name, 1) 548 | 549 | def _get_comment_description(self, pattern_name: str) -> str: 550 | """ 551 | 获取注释模式的描述 552 | 553 | Args: 554 | pattern_name: 模式名称 555 | 556 | Returns: 557 | 描述文本 558 | """ 559 | descriptions = { 560 | 'hidden_content': '包含敏感信息的注释', 561 | 'encoded_content': '包含编码内容的注释', 562 | 'conditional_comments': '条件注释', 563 | 'large_comment': '大型注释' 564 | } 565 | 566 | return descriptions.get(pattern_name, pattern_name) 567 | 568 | def _detect_suspicious_meta(self, file_path: str, content: str) -> List[Dict[str, Any]]: 569 | """ 570 | 检测HTML中的可疑Meta标签 571 | 572 | Args: 573 | file_path: 文件路径或URL 574 | content: HTML内容 575 | 576 | Returns: 577 | 可疑Meta标签检测结果 578 | """ 579 | results = [] 580 | 581 | meta_tags = extract_meta_tags(content) 582 | 583 | # 检测可疑的refresh或redirect Meta标签 584 | for meta in meta_tags: 585 | http_equiv = meta.get('http-equiv', '').lower() 586 | content_attr = meta.get('content', '').lower() 587 | 588 | if http_equiv in ['refresh', 'redirect'] and 'url=' in content_attr: 589 | # 提取URL 590 | url_match = re.search(r'url=(\S+)', content_attr) 591 | if url_match: 592 | url = url_match.group(1) 593 | result = { 594 | 'type': 'suspicious_url', 595 | 'file_path': file_path, 596 | 'url': url, 597 | 'risk_level': 3, 598 | 'reason': '通过Meta标签重定向', 599 | 'context': meta.get('raw', '') 600 | } 601 | results.append(result) 602 | 603 | # 检测包含可疑内容的Meta标签 604 | suspicious_meta_keywords = ['bot', 'spider', 'crawler', 'nofollow', 'noindex'] 605 | for meta in meta_tags: 606 | name = meta.get('name', '').lower() 607 | content_attr = meta.get('content', '').lower() 608 | 609 | if name in ['robots', 'keywords', 'description']: 610 | for keyword in suspicious_meta_keywords: 611 | if keyword in content_attr: 612 | result = { 613 | 'type': 'suspicious_meta', 614 | 'file_path': file_path, 615 | 'meta_name': name, 616 | 'suspicious_keyword': keyword, 617 | 'risk_level': 1, 618 | 'description': f"包含可疑关键词'{keyword}'的Meta标签", 619 | 'context': meta.get('raw', '') 620 | } 621 | results.append(result) 622 | 623 | return results 624 | 625 | -------------------------------------------------------------------------------- /core/detector/js_detector.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | JavaScript检测器模块 5 | """ 6 | 
7 | import re 8 | from typing import List, Dict, Any 9 | 10 | from utils.js_utils import ( 11 | extract_suspicious_patterns, 12 | extract_function_calls, 13 | detect_dynamic_urls, 14 | identify_obfuscated_code, 15 | detect_document_modification, 16 | extract_variable_assignments, 17 | extract_comments as js_extract_comments, 18 | strip_comments as js_remove_comments 19 | ) 20 | from utils.common_utils import ( 21 | get_context, 22 | calculate_entropy, 23 | clean_text 24 | ) 25 | from utils.network_utils import ( 26 | extract_urls, 27 | is_external_link, 28 | get_domain 29 | ) 30 | 31 | class JSDetector: 32 | """ 33 | JavaScript代码检测器,用于检测JS中的恶意代码和可疑行为 34 | """ 35 | 36 | def __init__(self, config): 37 | """ 38 | 初始化JavaScript检测器 39 | 40 | Args: 41 | config: 扫描配置对象 42 | """ 43 | self.config = config 44 | self.logger = config.logger 45 | 46 | # 高危JavaScript函数和方法 47 | self.high_risk_functions = { 48 | 'eval': 5, 49 | 'Function': 4, 50 | 'setTimeout': 3, 51 | 'setInterval': 3, 52 | 'document.write': 4, 53 | 'document.writeln': 4, 54 | 'innerHTML': 4, 55 | 'outerHTML': 4, 56 | 'execScript': 5, 57 | 'XMLHttpRequest': 3, 58 | 'fetch': 3, 59 | 'WebSocket': 3, 60 | 'navigator.sendBeacon': 3, 61 | 'window.open': 3, 62 | 'unescape': 3, 63 | 'escape': 3, 64 | 'decodeURI': 2, 65 | 'decodeURIComponent': 2, 66 | 'document.createElement': 3, # 提升DOM创建的风险等级 67 | 'document.createElementNS': 3, 68 | 'appendChild': 3, 69 | 'insertBefore': 3 70 | } 71 | 72 | # 可疑的DOM操作 73 | self.suspicious_dom_operations = { 74 | 'appendChild': 3, 75 | 'insertBefore': 3, 76 | 'replaceChild': 3, 77 | 'createElement': 2, 78 | 'createTextNode': 2, 79 | 'createDocumentFragment': 2, 80 | 'querySelector': 2, 81 | 'querySelectorAll': 2, 82 | 'getElementById': 2, 83 | 'getElementsByClassName': 2, 84 | 'getElementsByTagName': 2 85 | } 86 | 87 | # 混淆代码特征 88 | self.obfuscation_patterns = { 89 | 'hex_encoding': re.compile(r'\\x[0-9a-fA-F]{2}'), 90 | 'unicode_encoding': re.compile(r'\\u[0-9a-fA-F]{4}'), 91 | 'string_concatenation': re.compile(r'["\'][^"\']*["\']\s*\+\s*["\'][^"\']*["\']'), 92 | 'array_manipulation': re.compile(r'\[.*\]\.join\s*\(\s*["\']'), 93 | 'eval_with_arguments': re.compile(r'eval\s*\(\s*[a-zA-Z0-9_$\[\]]+\s*\+'), 94 | 'reversed_string': re.compile(r'\.split\(\s*["\']\s*\)\s*\.reverse\(\)\s*\.join'), 95 | 'base64_like': re.compile(r'[A-Za-z0-9+/=]{20,}'), 96 | 'unusual_variable_names': re.compile(r'[a-zA-Z_$][a-zA-Z0-9_$]{15,}'), 97 | 'suspicious_domain_pattern': re.compile(r'https?://[a-zA-Z0-9]{8,}\.(?:pro|xyz|pw|top|loan|win|bid|online)', re.IGNORECASE) 98 | } 99 | 100 | # 可疑的代码模式 101 | self.suspicious_patterns = { 102 | 'self_executing': re.compile(r'(function\s*\(\s*\)\s*\{[^\}]*\}\s*\(\s*\))|\(([^\)]+)\)\(\)'), 103 | 'conditional_eval': re.compile(r'if\s*\([^\)]*\)\s*\{[^\}]*eval\s*\('), 104 | 'try_catch_eval': re.compile(r'try\s*\{[^\}]*eval\s*\([^\)]*\)[^\}]*\}\s*catch'), 105 | 'hidden_eval': re.compile(r'[a-zA-Z_$][a-zA-Z0-9_$]*\s*=\s*["\']eval["\'].*;.*\[.*\]\s*\('), 106 | 'document_manipulation_with_eval': re.compile(r'document\.(body|documentElement|head)\.(appendChild|innerHTML)\s*=\s*eval\s*\('), 107 | 'url_to_eval': re.compile(r'(document\.location|window\.location|location)\.(href|search|hash)\s*.*eval\s*\('), 108 | 'cookie_manipulation': re.compile(r'document\.cookie'), 109 | 'user_agent_check': re.compile(r'navigator\.userAgent'), 110 | 'referrer_check': re.compile(r'document\.referrer') 111 | } 112 | 113 | def detect(self, file_path: str, content: str) -> List[Dict[str, Any]]: 114 | """ 115 | 
检测JavaScript代码中的恶意内容 116 | 117 | Args: 118 | file_path: 文件路径 119 | content: JavaScript代码内容 120 | 121 | Returns: 122 | 检测结果列表 123 | """ 124 | results = [] 125 | 126 | try: 127 | # 预处理代码,清理空白字符等 128 | clean_content = clean_text(content) 129 | 130 | # 1. 检测高危函数调用 131 | high_risk_results = self._detect_high_risk_functions(file_path, content, clean_content) 132 | results.extend(high_risk_results) 133 | 134 | # 2. 检测混淆代码 135 | obfuscation_results = self._detect_obfuscation(file_path, content, clean_content) 136 | results.extend(obfuscation_results) 137 | 138 | # 3. 检测可疑代码模式 139 | pattern_results = self._detect_suspicious_patterns(file_path, content) 140 | results.extend(pattern_results) 141 | 142 | # 4. 检测动态URL和网络请求 143 | url_results = self._detect_dynamic_urls(file_path, content) 144 | results.extend(url_results) 145 | 146 | # 5. 检测DOM修改操作 147 | dom_results = self._detect_dom_manipulations(file_path, content) 148 | results.extend(dom_results) 149 | 150 | # 6. 检测可疑注释 151 | comment_results = self._detect_suspicious_comments(file_path, content) 152 | results.extend(comment_results) 153 | 154 | # 7. 代码复杂度和熵分析 155 | complexity_results = self._analyze_code_complexity(file_path, content) 156 | results.extend(complexity_results) 157 | 158 | except Exception as e: 159 | self.logger.error(f"JavaScript检测过程中发生错误: {str(e)}", exc_info=True) 160 | 161 | return results 162 | 163 | def _detect_high_risk_functions(self, file_path: str, content: str, clean_content: str) -> List[Dict[str, Any]]: 164 | """ 165 | 检测高危函数调用 166 | 167 | Args: 168 | file_path: 文件路径 169 | content: JavaScript代码内容 170 | clean_content: 清理后的代码内容 171 | 172 | Returns: 173 | 高危函数检测结果 174 | """ 175 | results = [] 176 | 177 | for func_name in self.high_risk_functions.keys(): 178 | calls = extract_function_calls(content, func_name) 179 | for func_call in calls: 180 | args_str = func_call.get('arguments', '') 181 | pos = func_call.get('position', (0, 0)) 182 | risk_level = self.high_risk_functions[func_name] 183 | if any(pattern.search(args_str) for pattern in self.obfuscation_patterns.values()): 184 | risk_level = min(5, risk_level + 1) 185 | context = get_context(content, pos[0], pos[1], 100) 186 | result = { 187 | 'type': 'high_risk_function', 188 | 'file_path': file_path, 189 | 'function_name': func_name, 190 | 'arguments': args_str, 191 | 'risk_level': risk_level, 192 | 'description': f"调用高危函数 {func_name}", 193 | 'context': context 194 | } 195 | results.append(result) 196 | 197 | return results 198 | 199 | def _detect_obfuscation(self, file_path: str, content: str, clean_content: str) -> List[Dict[str, Any]]: 200 | """ 201 | 检测混淆代码 202 | 203 | Args: 204 | file_path: 文件路径 205 | content: JavaScript代码内容 206 | clean_content: 清理后的代码内容 207 | 208 | Returns: 209 | 混淆代码检测结果 210 | """ 211 | results = [] 212 | 213 | # 使用js_utils中的函数识别混淆代码 214 | obfuscation_info = identify_obfuscated_code(content) 215 | 216 | if obfuscation_info['is_obfuscated']: 217 | # 基于混淆特征计算风险等级 218 | risk_level = 3 + len(obfuscation_info['detected_patterns']) 219 | risk_level = min(5, risk_level) 220 | 221 | result = { 222 | 'type': 'obfuscated_code', 223 | 'file_path': file_path, 224 | 'risk_level': risk_level, 225 | 'detected_patterns': obfuscation_info['detected_patterns'], 226 | 'description': "代码疑似被混淆,可能隐藏恶意行为", 227 | 'context': obfuscation_info.get('sample', '')[:200] + '...' 
if len(obfuscation_info.get('sample', '')) > 200 else obfuscation_info.get('sample', '') 228 | } 229 | results.append(result) 230 | 231 | # 额外检测特定混淆模式 232 | for pattern_name, pattern in self.obfuscation_patterns.items(): 233 | matches = list(pattern.finditer(content)) 234 | if matches: 235 | # 统计匹配次数 236 | match_count = len(matches) 237 | 238 | # 根据匹配次数确定风险等级 239 | if match_count >= 10: 240 | risk_level = 4 241 | elif match_count >= 5: 242 | risk_level = 3 243 | else: 244 | risk_level = 2 245 | 246 | # 获取第一个匹配的上下文 247 | first_match = matches[0] 248 | context = get_context(content, first_match.start(), first_match.end(), 100) 249 | 250 | result = { 251 | 'type': 'obfuscation_pattern', 252 | 'file_path': file_path, 253 | 'pattern': pattern_name, 254 | 'match_count': match_count, 255 | 'risk_level': risk_level, 256 | 'description': f"检测到{self._get_pattern_description(pattern_name)},匹配{match_count}次", 257 | 'context': context 258 | } 259 | results.append(result) 260 | 261 | return results 262 | 263 | def _detect_suspicious_patterns(self, file_path: str, content: str) -> List[Dict[str, Any]]: 264 | """ 265 | 检测可疑代码模式 266 | 267 | Args: 268 | file_path: 文件路径 269 | content: JavaScript代码内容 270 | 271 | Returns: 272 | 可疑代码模式检测结果 273 | """ 274 | results = [] 275 | 276 | for pattern_name, pattern in self.suspicious_patterns.items(): 277 | matches = list(pattern.finditer(content)) 278 | if matches: 279 | for match in matches: 280 | # 确定风险等级 281 | risk_level = self._get_pattern_risk_level(pattern_name) 282 | 283 | # 获取上下文 284 | context = get_context(content, match.start(), match.end(), 100) 285 | 286 | result = { 287 | 'type': 'suspicious_pattern', 288 | 'file_path': file_path, 289 | 'pattern': pattern_name, 290 | 'matched_content': match.group(0), 291 | 'risk_level': risk_level, 292 | 'description': self._get_pattern_description(pattern_name), 293 | 'context': context 294 | } 295 | results.append(result) 296 | 297 | # 使用js_utils中的函数提取可疑模式 298 | suspicious_patterns = extract_suspicious_patterns(content) 299 | for pattern_info in suspicious_patterns: 300 | result = { 301 | 'type': 'suspicious_pattern', 302 | 'file_path': file_path, 303 | 'pattern': pattern_info['type'], 304 | 'matched_content': pattern_info['content'], 305 | 'risk_level': pattern_info['risk_level'], 306 | 'description': pattern_info['description'], 307 | 'context': pattern_info.get('context', '') 308 | } 309 | results.append(result) 310 | 311 | return results 312 | 313 | def _detect_dynamic_urls(self, file_path: str, content: str) -> List[Dict[str, Any]]: 314 | """ 315 | 检测动态URL和网络请求 316 | 317 | Args: 318 | file_path: 文件路径 319 | content: JavaScript代码内容 320 | 321 | Returns: 322 | 动态URL检测结果 323 | """ 324 | results = [] 325 | 326 | # 使用js_utils中的函数检测动态URL 327 | dynamic_urls = detect_dynamic_urls(content) 328 | 329 | for url_info in dynamic_urls: 330 | risk_level = url_info.get('risk_level', 3) 331 | raw_url = url_info.get('url') 332 | expr = url_info.get('expression', '') 333 | if raw_url: 334 | if ('+' in raw_url) or ('[' in raw_url): 335 | risk_level = min(5, risk_level + 1) 336 | result = { 337 | 'type': 'dynamic_url', 338 | 'file_path': file_path, 339 | 'url': raw_url, 340 | 'risk_level': risk_level, 341 | 'reason': url_info.get('reason', '动态构建的URL'), 342 | 'context': url_info.get('context', '') 343 | } 344 | else: 345 | result = { 346 | 'type': 'dynamic_expression', 347 | 'file_path': file_path, 348 | 'expression': expr, 349 | 'risk_level': risk_level, 350 | 'reason': url_info.get('reason', '动态构建的URL'), 351 | 'context': 
url_info.get('context', '') 352 | } 353 | results.append(result) 354 | 355 | # 提取所有URL并检测可疑URL 356 | urls = extract_urls(content, context_type='js') 357 | for url_obj in urls: 358 | url = url_obj['url'] 359 | context = url_obj['context'] 360 | 361 | # 检测可疑URL 362 | # 传入基准域用于外链判断 363 | base_domain = None 364 | if isinstance(file_path, str) and file_path.startswith(('http://', 'https://')): 365 | base_domain = get_domain(file_path) 366 | if is_external_link(url, base_domain): 367 | risk_level = 3 368 | reasons = [] 369 | 370 | # 1. 检查URL是否匹配可疑域名模式 371 | if self.obfuscation_patterns['suspicious_domain_pattern'].search(url): 372 | risk_level = 5 373 | reasons.append('URL匹配可疑域名模式') 374 | 375 | # 2. 检查URL是否在可疑上下文中 376 | if any(keyword in context.lower() for keyword in ['eval', 'exec', 'decode', 'base64']): 377 | risk_level = min(5, risk_level + 1) 378 | reasons.append('URL在可疑上下文中') 379 | 380 | # 3. 检查URL域名是否使用了可疑后缀 381 | suspicious_suffixes = ['.pro', '.xyz', '.pw', '.top', '.loan', '.win', '.bid', '.online'] 382 | for suffix in suspicious_suffixes: 383 | if url.endswith(suffix): 384 | risk_level = min(5, risk_level + 1) 385 | reasons.append(f'使用了高风险域名后缀{suffix}') 386 | break 387 | 388 | # 4. 检查URL路径是否包含随机字符串 389 | if re.search(r'/[a-zA-Z0-9]{8,}\.js$', url): 390 | risk_level = min(5, risk_level + 1) 391 | reasons.append('URL路径包含长随机字符串') 392 | 393 | # 如果有任何风险因素,添加结果 394 | if risk_level >= 3 or reasons: 395 | result = { 396 | 'type': 'suspicious_url', 397 | 'file_path': file_path, 398 | 'url': url, 399 | 'risk_level': risk_level, 400 | 'reason': '; '.join(reasons) if reasons else '外部URL', 401 | 'context': context 402 | } 403 | results.append(result) 404 | 405 | return results 406 | 407 | def _detect_dom_manipulations(self, file_path: str, content: str) -> List[Dict[str, Any]]: 408 | """ 409 | 检测DOM修改操作 410 | 411 | Args: 412 | file_path: 文件路径 413 | content: JavaScript代码内容 414 | 415 | Returns: 416 | DOM操作检测结果 417 | """ 418 | results = [] 419 | 420 | # 使用js_utils中的函数检测文档修改 421 | modifications = detect_document_modification(content) 422 | 423 | for mod_info in modifications: 424 | # 确定风险等级 425 | risk_level = mod_info.get('risk_level', 3) 426 | 427 | # 检查是否包含可疑内容 428 | target = mod_info.get('target', '') 429 | value = mod_info.get('value', '') 430 | 431 | if 'innerHTML' in target or 'outerHTML' in target: 432 | risk_level = min(5, risk_level + 1) 433 | 434 | if any(pattern.search(value) for pattern in self.obfuscation_patterns.values()): 435 | risk_level = min(5, risk_level + 1) 436 | 437 | result = { 438 | 'type': 'dom_manipulation', 439 | 'file_path': file_path, 440 | 'target': target, 441 | 'value': value[:200] + ('...' 
if len(value) > 200 else ''), 442 | 'risk_level': risk_level, 443 | 'description': mod_info.get('description', 'DOM修改操作'), 444 | 'context': mod_info.get('context', '') 445 | } 446 | results.append(result) 447 | 448 | # 检测可疑的DOM操作函数 449 | for op_name, base_risk in self.suspicious_dom_operations.items(): 450 | pattern = re.compile(r'\b(?:document|window|this)\b[^\n;]*?\b' + re.escape(op_name) + r'\s*\(') 451 | matches = list(pattern.finditer(content)) 452 | 453 | for match in matches: 454 | # 获取上下文 455 | context = get_context(content, match.start(), match.end(), 100) 456 | 457 | # 检查是否与可疑内容组合使用 458 | risk_level = base_risk 459 | if any(keyword in context.lower() for keyword in ['eval', 'decode', 'base64', 'fromcharcode']): 460 | risk_level = min(5, risk_level + 2) 461 | 462 | result = { 463 | 'type': 'dom_operation', 464 | 'file_path': file_path, 465 | 'operation': op_name, 466 | 'risk_level': risk_level, 467 | 'description': f"可疑的DOM操作: {op_name}", 468 | 'context': context 469 | } 470 | results.append(result) 471 | 472 | return results 473 | 474 | def _detect_suspicious_comments(self, file_path: str, content: str) -> List[Dict[str, Any]]: 475 | """ 476 | 检测可疑注释 477 | 478 | Args: 479 | file_path: 文件路径 480 | content: JavaScript代码内容 481 | 482 | Returns: 483 | 可疑注释检测结果 484 | """ 485 | results = [] 486 | 487 | # 使用js_utils中的函数提取注释 488 | comments = js_extract_comments(content) 489 | 490 | # 可疑注释关键词 491 | suspicious_keywords = ['hack', 'exploit', 'backdoor', 'trojan', 'malware', 'keylogger', 'cracker', 492 | 'steal', 'inject', 'redirect', 'obfuscate', 'encrypt', 'decrypt', 'hidden', 493 | 'admin', 'password', 'credential', 'phish', 'spy', 'tracking'] 494 | 495 | for comment in comments: 496 | text = comment.get('content', '') 497 | comment_lower = text.lower() 498 | 499 | # 检查是否包含可疑关键词 500 | for keyword in suspicious_keywords: 501 | if keyword in comment_lower: 502 | result = { 503 | 'type': 'suspicious_comment', 504 | 'file_path': file_path, 505 | 'keyword': keyword, 506 | 'risk_level': 3, 507 | 'description': f"注释中包含可疑关键词: {keyword}", 508 | 'context': text[:200] + ('...' if len(text) > 200 else '') 509 | } 510 | results.append(result) 511 | break 512 | 513 | # 检查注释中是否包含Base64编码内容 514 | base64_pattern = re.compile(r'[A-Za-z0-9+/=]{32,}') 515 | if base64_pattern.search(text) and len(text) > 50: 516 | result = { 517 | 'type': 'suspicious_comment', 518 | 'file_path': file_path, 519 | 'risk_level': 4, 520 | 'description': "注释中包含疑似Base64编码的长字符串", 521 | 'context': text[:200] + ('...' if len(text) > 200 else '') 522 | } 523 | results.append(result) 524 | 525 | return results 526 | 527 | def _analyze_code_complexity(self, file_path: str, content: str) -> List[Dict[str, Any]]: 528 | """ 529 | 分析代码复杂度和熵 530 | 531 | Args: 532 | file_path: 文件路径 533 | content: JavaScript代码内容 534 | 535 | Returns: 536 | 代码复杂度分析结果 537 | """ 538 | results = [] 539 | 540 | # 移除注释后的代码用于熵计算 541 | code_without_comments = js_remove_comments(content) 542 | 543 | # 计算代码熵 544 | entropy = calculate_entropy(code_without_comments) 545 | 546 | # 如果熵值过高,可能是混淆代码 547 | if entropy > 4.5: 548 | result = { 549 | 'type': 'code_complexity', 550 | 'file_path': file_path, 551 | 'entropy': round(entropy, 2), 552 | 'risk_level': 4, 553 | 'description': f"代码熵值过高 ({round(entropy, 2)}),疑似经过混淆", 554 | 'context': code_without_comments[:200] + ('...' 
if len(code_without_comments) > 200 else '') 555 | } 556 | results.append(result) 557 | elif entropy > 3.8: 558 | result = { 559 | 'type': 'code_complexity', 560 | 'file_path': file_path, 561 | 'entropy': round(entropy, 2), 562 | 'risk_level': 2, 563 | 'description': f"代码熵值较高 ({round(entropy, 2)}),可能包含复杂逻辑", 564 | 'context': code_without_comments[:200] + ('...' if len(code_without_comments) > 200 else '') 565 | } 566 | results.append(result) 567 | 568 | # 分析变量命名模式 569 | var_pattern = re.compile(r'\bvar\s+([a-zA-Z_$][a-zA-Z0-9_$]*)\b|\blet\s+([a-zA-Z_$][a-zA-Z0-9_$]*)\b|\bconst\s+([a-zA-Z_$][a-zA-Z0-9_$]*)\b') 570 | var_names = [] 571 | 572 | for match in var_pattern.finditer(content): 573 | for group in match.groups(): 574 | if group: 575 | var_names.append(group) 576 | 577 | # 检查是否有大量的短变量名(可能是混淆特征) 578 | short_vars = [name for name in var_names if len(name) <= 2] 579 | if len(short_vars) > 30 and len(var_names) > 50: 580 | short_var_ratio = len(short_vars) / len(var_names) 581 | if short_var_ratio > 0.4: 582 | result = { 583 | 'type': 'code_complexity', 584 | 'file_path': file_path, 585 | 'short_var_count': len(short_vars), 586 | 'total_var_count': len(var_names), 587 | 'risk_level': 3, 588 | 'description': f"存在大量短变量名 ({len(short_vars)}/{len(var_names)}),可能是混淆特征", 589 | 'context': ', '.join(short_vars[:10]) + ('...' if len(short_vars) > 10 else '') 590 | } 591 | results.append(result) 592 | 593 | # 分析函数调用密度 594 | function_call_pattern = re.compile(r'\b[a-zA-Z_$][a-zA-Z0-9_$]*\s*\(') 595 | function_calls = len(list(function_call_pattern.finditer(content))) 596 | 597 | code_length = len(content) 598 | calls_per_1000 = (function_calls / code_length * 1000) if code_length > 0 else 0 599 | 600 | if calls_per_1000 > 50: 601 | result = { 602 | 'type': 'code_complexity', 603 | 'file_path': file_path, 604 | 'function_call_density': round(calls_per_1000, 2), 605 | 'risk_level': 2, 606 | 'description': f"函数调用密度较高 ({round(calls_per_1000, 2)} 次/1000字符),可能包含复杂逻辑", 607 | 'context': content[:200] + ('...' 
if len(content) > 200 else '') 608 | } 609 | results.append(result) 610 | 611 | return results 612 | 613 | def _get_pattern_description(self, pattern_name: str) -> str: 614 | """ 615 | 获取模式的描述 616 | 617 | Args: 618 | pattern_name: 模式名称 619 | 620 | Returns: 621 | 描述文本 622 | """ 623 | descriptions = { 624 | # 混淆模式描述 625 | 'hex_encoding': '十六进制编码', 626 | 'unicode_encoding': 'Unicode编码', 627 | 'string_concatenation': '字符串拼接', 628 | 'array_manipulation': '数组操作混淆', 629 | 'eval_with_arguments': '带参数的eval调用', 630 | 'reversed_string': '反转字符串', 631 | 'base64_like': '疑似Base64编码', 632 | 'unusual_variable_names': '异常变量名', 633 | 634 | # 可疑模式描述 635 | 'self_executing': '自执行函数', 636 | 'conditional_eval': '条件eval调用', 637 | 'try_catch_eval': 'try-catch中的eval', 638 | 'hidden_eval': '隐藏的eval调用', 639 | 'document_manipulation_with_eval': '使用eval操作DOM', 640 | 'url_to_eval': '从URL提取数据执行eval', 641 | 'cookie_manipulation': 'Cookie操作', 642 | 'user_agent_check': 'User-Agent检查', 643 | 'referrer_check': 'Referrer检查' 644 | } 645 | 646 | return descriptions.get(pattern_name, pattern_name) 647 | 648 | def _get_pattern_risk_level(self, pattern_name: str) -> int: 649 | """ 650 | 获取模式的风险等级 651 | 652 | Args: 653 | pattern_name: 模式名称 654 | 655 | Returns: 656 | 风险等级 657 | """ 658 | risk_levels = { 659 | 'self_executing': 3, 660 | 'conditional_eval': 4, 661 | 'try_catch_eval': 4, 662 | 'hidden_eval': 5, 663 | 'document_manipulation_with_eval': 5, 664 | 'url_to_eval': 5, 665 | 'cookie_manipulation': 3, 666 | 'user_agent_check': 1, 667 | 'referrer_check': 1 668 | } 669 | 670 | return risk_levels.get(pattern_name, 2) 671 | 672 | --------------------------------------------------------------------------------
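The entropy heuristic in `_analyze_code_complexity` above compares the score returned by `calculate_entropy` against the 3.8 and 4.5 thresholds, but the actual `calculate_entropy` implementation lives in the project's utils modules and is not shown in this snapshot. The sketch below is a minimal stand-in assuming the conventional character-level Shannon entropy in bits per character; the helper name `shannon_entropy` and the two sample strings are hypothetical and for illustration only, not part of the project.

    # Minimal sketch, assuming character-level Shannon entropy (bits per character).
    # The project's real calculate_entropy (in utils) is not shown here and may differ.
    import math
    from collections import Counter

    def shannon_entropy(text: str) -> float:
        """Return the Shannon entropy of text, in bits per character."""
        if not text:
            return 0.0
        total = len(text)
        counts = Counter(text)
        return -sum((n / total) * math.log2(n / total) for n in counts.values())

    # Hypothetical samples: plain JavaScript versus a packed/encoded payload.
    sample_plain = "var total = 0; for (var i = 0; i < 10; i++) { total += i; }"
    sample_packed = "eval(atob('dmFyIF8weGFiMTI9WyJceDY4XHg2NSJd'))"

    print(round(shannon_entropy(sample_plain), 2))   # readable code sits well below 4.5
    print(round(shannon_entropy(sample_packed), 2))  # encoded blobs score noticeably higher

Under this definition, ordinary readable JavaScript tends to land in the 3.5-4.0 range, while base64 blobs and packed strings push the score higher, which is consistent with the 3.8 "elevated" and 4.5 "likely obfuscated" cut-offs used in the detector above.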