├── .github
│   └── workflows
│       └── ci.yml
├── Dockerfile
├── LICENSE
├── README.md
├── clone
│   └── Clone_Repo.py
├── config.yaml
├── core
│   ├── __init__.py
│   ├── cache.py
│   ├── config_manager.py
│   ├── logger.py
│   ├── memory_optimizer.py
│   ├── parallel_manager.py
│   ├── performance_monitor.py
│   └── resource_manager.py
├── detector
│   ├── Detector.py
│   ├── __init__.py
│   ├── ast_analyzer.py
│   ├── clone_detector.py
│   ├── metrics.py
│   ├── semantic_analyzer.py
│   └── version_predictor.py
├── docker-compose.yml
├── osscollector
│   ├── collector.py
│   └── sample
├── preprocessor
│   ├── Preprocessor_full.py
│   ├── Preprocessor_lite.py
│   ├── __init__.py
│   ├── language_processors
│   │   ├── cpp_processor.py
│   │   └── java_processor.py
│   └── preprocessor.py
├── prometheus.yml
├── re-centris-go
│   ├── cmd
│   │   └── re-centris
│   │       └── main.go
│   ├── config.yaml
│   ├── go.mod
│   ├── internal
│   │   ├── analyzer
│   │   │   ├── analyzer.go
│   │   │   ├── parser
│   │   │   │   ├── cpp
│   │   │   │   │   ├── parser.go
│   │   │   │   │   └── parser_test.go
│   │   │   │   └── parser.go
│   │   │   └── tlsh
│   │   │       ├── errors.go
│   │   │       ├── tlsh.go
│   │   │       └── tlsh_test.go
│   │   ├── cmd
│   │   │   ├── analyze.go
│   │   │   ├── clone.go
│   │   │   ├── detect.go
│   │   │   └── root.go
│   │   ├── collector
│   │   │   └── clone
│   │   │       └── clone.go
│   │   ├── common
│   │   │   ├── cache
│   │   │   │   └── cache.go
│   │   │   ├── logger
│   │   │   │   └── logger.go
│   │   │   └── monitor
│   │   │       └── monitor.go
│   │   ├── config
│   │   │   └── config.go
│   │   ├── detector
│   │   │   └── detector.go
│   │   └── preprocessor
│   │       └── preprocessor.go
│   └── tests
│       ├── integration
│       │   └── clone_analyze_test.go
│       └── security
│           └── security_test.go
├── requirements.txt
├── scripts
│   └── deploy.sh
└── tests
    ├── __init__.py
    ├── core
    │   ├── test_cache.py
    │   ├── test_config_manager.py
    │   ├── test_memory_optimizer.py
    │   ├── test_parallel_manager.py
    │   ├── test_performance_monitor.py
    │   └── test_resource_manager.py
    ├── detector
    │   ├── test_detector.py
    │   └── test_version_predictor.py
    ├── integration
    │   ├── test_clone_detection.py
    │   └── test_integration.py
    ├── preprocessor
    │   ├── test_cpp_processor.py
    │   ├── test_java_processor.py
    │   └── test_preprocessor.py
    ├── run_tests.py
    └── security
        └── test_security.py
--------------------------------------------------------------------------------
/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
name: Re-Centris CI/CD

on:
  push:
    branches: [ main, develop ]
  pull_request:
    branches: [ main, develop ]

jobs:
  test:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: [3.8, 3.9, "3.10"]

    steps:
      - uses: actions/checkout@v3

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python-version }}

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt
          pip install -r requirements-dev.txt

      - name: Run linting
        run: |
          flake8 .
          black . --check
          isort . --check-only
          mypy .

      - name: Run tests
        run: |
          pytest --cov=. --cov-report=xml
      - name: Upload coverage
        uses: codecov/codecov-action@v3
        with:
          file: ./coverage.xml

  build:
    needs: test
    runs-on: ubuntu-latest
    if: github.event_name == 'push' && github.ref == 'refs/heads/main'

    steps:
      - uses: actions/checkout@v3

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: "3.10"

      - name: Build package
        run: |
          pip install build
          python -m build

      - name: Upload artifact
        uses: actions/upload-artifact@v3
        with:
          name: dist
          path: dist/

  deploy:
    needs: build
    runs-on: ubuntu-latest
    if: github.event_name == 'push' && github.ref == 'refs/heads/main'

    steps:
      - uses: actions/download-artifact@v3
        with:
          name: dist
          path: dist/

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: "3.10"

      - name: Install twine
        run: pip install twine

      - name: Publish to PyPI
        env:
          TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
          TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
        run: twine upload dist/*
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
# Use the official Python image as the base image
FROM python:3.10-slim

# Set the working directory
WORKDIR /app

# Set environment variables
ENV PYTHONUNBUFFERED=1 \
    PYTHONDONTWRITEBYTECODE=1 \
    PIP_NO_CACHE_DIR=1 \
    PIP_DISABLE_PIP_VERSION_CHECK=1

# Install system dependencies (curl is required by the HEALTHCHECK below)
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    libclang-dev \
    git \
    curl \
    && rm -rf /var/lib/apt/lists/*

# Copy project files
COPY . .

# Install Python dependencies
RUN pip install --no-cache-dir -r requirements.txt

# Expose the service port
EXPOSE 8000

# Configure the health check
HEALTHCHECK --interval=30s --timeout=30s --start-period=5s --retries=3 \
    CMD curl -f http://localhost:8000/health || exit 1

# Set the startup command
CMD ["gunicorn", "--bind", "0.0.0.0:8000", "app:app"]
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2024 byRen2002

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Re-Centris

Re-Centris is a high-performance code similarity analysis tool built on the TLSH (Trend Micro Locality Sensitive Hash) algorithm. It focuses on code clone detection, open-source component identification, and dependency analysis, and supports multiple programming languages.

## Key Features

- **High-precision code similarity analysis**
  - Fuzzy hash matching based on the TLSH algorithm
  - Detects refactored code and variants
  - Fine-grained analysis at the function level

- **Multi-language support**
  - Python version supports: C/C++, Java, Python
  - Go version currently supports: C/C++ (support for more languages is being added)

- **Performance-oriented design**
  - Multi-process / goroutine parallel processing
  - Memory-mapped I/O
  - Intelligent caching
  - Optimized resource usage

- **Rich analysis capabilities**
  - Open-source component identification
  - Code clone detection
  - Dependency analysis
  - Version information extraction

## Choosing a Version

### Python version
- Suitable when:
  - You need to analyze multiple programming languages
  - You need more flexible extensibility
  - Ease of use is a priority

### Go version
- Suitable when:
  - Analyzing large-scale codebases
  - Performance is critical
  - Primarily analyzing C/C++ code

## Quick Start

### Python version installation

```bash
# 1. Clone the repository
git clone https://github.com/xxx/xxx.git
cd re-centris

# 2. Create and activate a virtual environment
python -m venv venv
source venv/bin/activate  # Linux/Mac
venv\Scripts\activate     # Windows

# 3. Install dependencies
pip install -r requirements.txt
```

### Go version installation

```bash
# 1. Clone the repository
git clone https://github.com/yourusername/re-centris-go.git
cd re-centris-go

# 2. Build the project
go build -o re-centris ./cmd/re-centris

# 3. (Optional) system-wide install
go install ./cmd/re-centris
```

## Usage Examples

### Python version

```bash
# 1. Collect open-source code information
python -m osscollector.collector -c config.yaml

# 2. Preprocess the code
python -m preprocessor.preprocessor -c config.yaml

# 3. Run similarity detection
python -m detector.detector -c config.yaml -i path/to/input/code
```

### Go version

```bash
# 1. Clone and collect code
re-centris clone repo-list.txt -o ./repos

# 2. Analyze the code
re-centris analyze ./source-code -o ./analysis

# 3. Run similarity detection
re-centris detect target-file.cpp -k ./known-files -o results.json
```

## Configuration

The configuration file uses YAML format and supports the following main options:

```yaml
paths:
  repo_path: "./repos"
  tag_date_path: "./data/repo_date"
  result_path: "./data/repo_functions"

performance:
  max_workers: 0    # Automatically use the number of available CPU cores
  cache_size: 1000
  memory_limit: 0.8 # Maximum memory usage ratio

languages:
  cpp:
    enabled: true
    extensions: [".c", ".cc", ".cpp", ".cxx", ".h", ".hpp"]
  java:
    enabled: false
    extensions: [".java"]
  python:
    enabled: false
    extensions: [".py"]
```

## Project Layout

### Python version
```
re-centris/
├── core/           # Core functionality modules
├── osscollector/   # Open-source code collection
├── preprocessor/   # Code preprocessing
├── detector/       # Similarity detection
├── config.yaml
└── requirements.txt
```

### Go version
```
re-centris-go/
├── cmd/            # CLI entry point
├── internal/       # Core implementation
│   ├── analyzer/      # Code analysis
│   ├── collector/     # Code collection
│   ├── detector/      # Similarity detection
│   └── preprocessor/  # Preprocessing
└── config.yaml
```

## Output

Analysis results are emitted as JSON and include:
- Similarity scores
- Function-level match information
- A dependency graph
- Version tracking information
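The exact schema depends on the detector configuration; a single match entry might look like this (field names and values are purely illustrative, not a fixed format):

```json
{
  "component": "openssl",
  "predicted_version": "1.1.1k",
  "similarity": 0.93,
  "matched_functions": 128,
  "total_functions": 154
}
```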
## Contributing

Pull requests are welcome! Please make sure that:

1. The code passes all tests
2. Necessary test cases are added
3. Related documentation is updated
4. The project coding conventions are followed

## License

MIT License - see the LICENSE file for details.

## About

Developed and maintained by byRen2002. Please report issues via GitHub Issues.
--------------------------------------------------------------------------------
/clone/Clone_Repo.py:
--------------------------------------------------------------------------------
import os
import subprocess
import re
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import logging
from typing import List, Tuple

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('clone.log'),
        logging.StreamHandler()
    ]
)

def parse_repo_url(repo_url: str) -> Tuple[str, str, str]:
    """Parse a GitHub repository URL into its author and repository name.

    Args:
        repo_url: GitHub repository URL

    Returns:
        Tuple[str, str, str]: author name, repository name, and the original URL
    """
    match = re.search(r'github\.com/([^/]+)/([^/]+)', repo_url)
    if not match:
        raise ValueError(f"Unable to parse repository URL: {repo_url}")

    author, repo_name = match.groups()
    repo_name = repo_name[:-4] if repo_name.endswith('.git') else repo_name

    return author, repo_name, repo_url

def clone_single_repo(repo_info: Tuple[str, str, str], clone_path: str) -> bool:
    """Clone a single repository.

    Args:
        repo_info: tuple of (author name, repository name, URL)
        clone_path: target path for the clone

    Returns:
        bool: whether the clone succeeded
    """
    author, repo_name, repo_url = repo_info
    folder_name = f"{author}%{repo_name}"
    target_path = os.path.join(clone_path, folder_name)

    if os.path.exists(target_path):
        logging.info(f"Repository {folder_name} already exists, skipping clone")
        return True

    try:
        # Optimized git clone command
        cmd = [
            'git', 'clone',
            '--depth', '1',      # Clone only the latest revision
            '--single-branch',   # Clone only the default branch
            '--no-tags',         # Do not clone tags
            repo_url,
            target_path
        ]

        subprocess.run(
            cmd,
            check=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE
        )

        logging.info(f"Successfully cloned repository {folder_name}")
        return True

    except subprocess.CalledProcessError as e:
        logging.error(f"Failed to clone repository {repo_url}: {e.stderr.decode()}")
        return False
    except Exception as e:
        logging.error(f"Error while processing repository {repo_url}: {str(e)}")
        return False

def clone_repositories(repo_list_file: str, clone_path: str, max_workers: int = 5):
    """Clone multiple GitHub repositories in parallel.

    Args:
        repo_list_file: path to a file containing GitHub repository URLs
        clone_path: target path for the cloned repositories
        max_workers: maximum number of parallel worker threads
    """
    # Make sure the target directory exists
    os.makedirs(clone_path, exist_ok=True)

    # Read the list of repository URLs
    try:
        with open(repo_list_file, 'r', buffering=8192) as f:
            repo_urls = [url.strip() for url in f if url.strip()]
    except Exception as e:
        logging.error(f"Failed to read the repository list file: {str(e)}")
        return

    if not repo_urls:
        logging.warning("The repository list is empty")
        return

    # Parse all repository URLs
    repo_infos = []
    for url in repo_urls:
        try:
            repo_infos.append(parse_repo_url(url))
        except ValueError as e:
            logging.error(str(e))
            continue

    # Clone in parallel using a thread pool
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Submit all clone tasks
        future_to_repo = {
            executor.submit(clone_single_repo, repo_info, clone_path): repo_info
            for repo_info in repo_infos
        }

        # Show progress with tqdm
        with tqdm(total=len(repo_infos), desc="Cloning progress") as pbar:
            for future in as_completed(future_to_repo):
                repo_info = future_to_repo[future]
                try:
                    success = future.result()
                    if success:
                        pbar.set_description(f"Cloned {repo_info[1]}")
                    else:
                        pbar.set_description(f"Failed to clone {repo_info[1]}")
                except Exception as e:
                    logging.error(f"Error while processing repository {repo_info[1]}: {str(e)}")
                finally:
                    pbar.update(1)

    logging.info("All repositories cloned")

if __name__ == "__main__":
    clone_repositories(
        '/home/rby/Project/project-file/dependency_analysis/sample',
        '/home/rby/Project/project-file/dependency_analysis/repo_src'
    )
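A minimal usage sketch for the module above (the file names and paths here are illustrative assumptions, not part of the repository):

```python
# Clone a small list of repositories; repo_list.txt holds one GitHub URL per
# line, e.g. https://github.com/redis/redis.git
from clone.Clone_Repo import clone_repositories

clone_repositories(
    repo_list_file="repo_list.txt",
    clone_path="./repo_src",
    max_workers=5,
)
```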
--------------------------------------------------------------------------------
/config.yaml:
--------------------------------------------------------------------------------
# Re-Centris configuration file

# Path settings
paths:
  # Repository path
  repo_path: "./repos"
  # Tag date path
  tag_date_path: "./osscollector/repo_date"
  # Result path
  result_path: "./osscollector/repo_functions"
  # Log path
  log_path: "./logs"
  # Version index path
  ver_idx_path: "./preprocessor/verIDX"
  # Initial signature database path
  initial_db_path: "./preprocessor/initialSigs"
  # Final component database path
  final_db_path: "./preprocessor/componentDB"
  # Meta-information path
  meta_path: "./preprocessor/metaInfos"
  # Weights path
  weight_path: "./preprocessor/metaInfos/weights"
  # Function date path
  func_date_path: "./preprocessor/funcDate"
  # Cache path
  cache_path: "./cache"

# Performance settings
performance:
  # Maximum number of worker processes; defaults to the number of CPU cores
  max_workers: null
  # Cache size
  cache_size: 1000
  # Cache expiration time (seconds)
  cache_expire: 3600
  # Memory usage limit (0.0-1.0)
  memory_limit: 0.9
  # Timeout (seconds)
  timeout: 30
  # Batch size
  batch_size: 1000

# Logging settings
logging:
  # Log level: DEBUG, INFO, WARNING, ERROR, CRITICAL
  level: "INFO"
  # Maximum log file size (bytes)
  max_size: 10485760  # 10MB
  # Number of log file backups
  backup_count: 5

# Analysis settings
analysis:
  # Similarity threshold
  theta: 0.1
  # TLSH difference threshold
  tlsh_threshold: 30

# External tool settings
external_tools:
  # ctags path
  ctags_path: "ctags"

# Supported language settings
languages:
  # C/C++
  cpp:
    enabled: true
    extensions: [".c", ".cc", ".cpp", ".cxx", ".h", ".hpp"]
  # Java
  java:
    enabled: false
    extensions: [".java"]
  # Python
  python:
    enabled: false
    extensions: [".py"]
  # JavaScript
  javascript:
    enabled: false
    extensions: [".js", ".jsx", ".ts", ".tsx"]
  # Go
  go:
    enabled: false
    extensions: [".go"]
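A quick sanity check of this file from Python, assuming PyYAML is installed (the project's requirements already depend on YAML support):

```python
# Load config.yaml and print a few effective settings.
import yaml

with open("config.yaml", "r", encoding="utf-8") as f:
    cfg = yaml.safe_load(f)

enabled = [name for name, lang in cfg["languages"].items() if lang["enabled"]]
print("Enabled languages:", enabled)              # e.g. ['cpp']
print("Similarity threshold:", cfg["analysis"]["theta"])
```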
--------------------------------------------------------------------------------
/core/__init__.py:
--------------------------------------------------------------------------------
"""Re-Centris core module

This module provides the core functionality and utility classes of the
Re-Centris project, including:
- Cache management
- Resource management
- Memory optimization
- Performance monitoring
- Configuration management
- Logging

Author: Re-Centris team
Version: 1.0.0
License: MIT
"""

from .cache import Cache
from .resource_manager import ResourceManager
from .memory_optimizer import MemoryOptimizer
from .performance_monitor import PerformanceMonitor
from .config_manager import ConfigManager
from .logger import setup_logger

__all__ = [
    'Cache',
    'ResourceManager',
    'MemoryOptimizer',
    'PerformanceMonitor',
    'ConfigManager',
    'setup_logger'
]
--------------------------------------------------------------------------------
/core/cache.py:
--------------------------------------------------------------------------------
"""Cache module

This module provides unified cache management, supporting both in-memory and
persistent caches, with LRU eviction, expiration times, and size limits.

Author: Re-Centris team
Version: 1.0.0
License: MIT
"""

import os
import time
import pickle
import threading
import logging
from typing import Dict, Any, Optional, Tuple, List, Callable
from functools import wraps

# Module-level logger
logger = logging.getLogger("re-centris.cache")


class Cache:
    """Generic cache with LRU eviction, expiration times, and size limits."""

    def __init__(
        self,
        max_size: int = 1000,
        expire_time: int = 3600,
        persistent: bool = False,
        cache_dir: Optional[str] = None
    ):
        """Initialize the cache.

        Args:
            max_size: maximum number of cache entries
            expire_time: cache expiration time (seconds)
            persistent: whether to persist the cache
            cache_dir: cache directory; only used when persistent=True
        """
        self.max_size = max_size
        self.expire_time = expire_time
        self.persistent = persistent
        self.cache_dir = cache_dir

        if persistent and not cache_dir:
            raise ValueError("A persistent cache requires a cache directory")

        if persistent and not os.path.exists(cache_dir):
            os.makedirs(cache_dir)

        self._cache: Dict[str, Any] = {}
        self._access_times: Dict[str, float] = {}
        self._lock = threading.Lock()

    def get(self, key: str) -> Optional[Any]:
        """Get a cached value.

        Args:
            key: cache key

        Returns:
            The cached value, or None if it does not exist or has expired
        """
        with self._lock:
            # Check the in-memory cache
            if key in self._cache:
                access_time = self._access_times[key]
                if time.time() - access_time <= self.expire_time:
                    # Refresh the access time
                    self._access_times[key] = time.time()
                    return self._cache[key]
                else:
                    # Entry has expired; remove it
                    del self._cache[key]
                    del self._access_times[key]

            # If persistence is enabled, try loading from file
            if self.persistent:
                cache_file = self._get_cache_file(key)
                if os.path.exists(cache_file):
                    try:
                        with open(cache_file, 'rb') as f:
                            data = pickle.load(f)
                            timestamp, value = data

                            if time.time() - timestamp <= self.expire_time:
                                # Load into the in-memory cache
                                self._cache[key] = value
                                self._access_times[key] = time.time()
                                return value
                            else:
                                # Entry has expired; remove the file
                                os.remove(cache_file)
                    except Exception as e:
                        logger.warning(f"Failed to load from persistent cache: {e}")

            return None

    def put(self, key: str, value: Any) -> None:
        """Store a value in the cache.

        Args:
            key: cache key
            value: cache value
        """
        with self._lock:
            # If the key already exists, refresh the access time
            if key in self._cache:
                self._access_times[key] = time.time()
                self._cache[key] = value
            else:
                # If the cache is full, evict the least recently used entry
                if len(self._cache) >= self.max_size:
                    self._evict_lru()

                # Add the new entry
                self._cache[key] = value
                self._access_times[key] = time.time()

            # If persistence is enabled, save to file
            if self.persistent:
                self._save_to_file(key, value)

    def _evict_lru(self) -> None:
        """Evict the least recently used cache entry."""
        if not self._access_times:
            return

        # Find the key with the oldest access time
        oldest_key = min(self._access_times.items(), key=lambda x: x[1])[0]

        # Remove it from the in-memory cache
        del self._cache[oldest_key]
        del self._access_times[oldest_key]

        # If persistence is enabled, remove the file
        if self.persistent:
            cache_file = self._get_cache_file(oldest_key)
            if os.path.exists(cache_file):
                try:
                    os.remove(cache_file)
                except Exception as e:
                    logger.warning(f"Failed to remove cache file: {e}")
    def _get_cache_file(self, key: str) -> str:
        """Get the cache file path for a key.

        Args:
            key: cache key

        Returns:
            Cache file path
        """
        # Use an MD5 hash as the file name to avoid invalid characters
        import hashlib
        key_hash = hashlib.md5(key.encode()).hexdigest()
        return os.path.join(self.cache_dir, f"{key_hash}.cache")

    def _save_to_file(self, key: str, value: Any) -> None:
        """Save a cache entry to file.

        Args:
            key: cache key
            value: cache value
        """
        if not self.persistent:
            return

        cache_file = self._get_cache_file(key)
        try:
            with open(cache_file, 'wb') as f:
                # Store the timestamp together with the value
                data = (time.time(), value)
                pickle.dump(data, f)
        except Exception as e:
            logger.warning(f"Failed to save cache entry to file: {e}")

    def clear(self) -> None:
        """Clear the cache."""
        with self._lock:
            self._cache.clear()
            self._access_times.clear()

            # If persistence is enabled, remove all cache files
            if self.persistent and os.path.exists(self.cache_dir):
                for filename in os.listdir(self.cache_dir):
                    if filename.endswith(".cache"):
                        try:
                            os.remove(os.path.join(self.cache_dir, filename))
                        except Exception as e:
                            logger.warning(f"Failed to remove cache file: {e}")

    def remove(self, key: str) -> bool:
        """Remove a cache entry.

        Args:
            key: cache key

        Returns:
            Whether the entry was removed
        """
        with self._lock:
            if key in self._cache:
                del self._cache[key]
                del self._access_times[key]

                # If persistence is enabled, remove the file
                if self.persistent:
                    cache_file = self._get_cache_file(key)
                    if os.path.exists(cache_file):
                        try:
                            os.remove(cache_file)
                        except Exception as e:
                            logger.warning(f"Failed to remove cache file: {e}")

                return True
            return False

    def keys(self) -> List[str]:
        """Get all cache keys.

        Returns:
            List of cache keys
        """
        with self._lock:
            return list(self._cache.keys())

    def size(self) -> int:
        """Get the cache size.

        Returns:
            Number of cache entries
        """
        with self._lock:
            return len(self._cache)

    def has_key(self, key: str) -> bool:
        """Check whether a cache key exists.

        Args:
            key: cache key

        Returns:
            Whether the cache key exists
        """
        with self._lock:
            return key in self._cache


def cached(cache: Cache, key_func: Optional[Callable] = None):
    """Decorator that caches function results.

    Args:
        cache: cache object
        key_func: cache-key function; if None, the key is built from the
            function name and its arguments

    Returns:
        Decorator function
    """
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            # Build the cache key
            if key_func:
                key = key_func(*args, **kwargs)
            else:
                # By default, build the key from the function name and arguments
                key = f"{func.__module__}.{func.__name__}:{str(args)}:{str(kwargs)}"

            # Try the cache first
            result = cache.get(key)
            if result is not None:
                return result

            # Call the function
            result = func(*args, **kwargs)

            # Store the result; note that None results are effectively never
            # cached, because a cache miss is also reported as None
            cache.put(key, result)

            return result
        return wrapper
    return decorator
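A minimal usage sketch for the module above (the import root is an assumption; it presumes the project root is on `sys.path`):

```python
# Memoize an expensive function with the LRU cache above.
from core.cache import Cache, cached

cache = Cache(max_size=100, expire_time=60)

@cached(cache)
def expensive(x: int) -> int:
    print("computing", x)
    return x * x

expensive(3)  # computes and stores the result
expensive(3)  # served from the cache; no "computing" output
```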
--------------------------------------------------------------------------------
/core/config_manager.py:
--------------------------------------------------------------------------------
"""Configuration management module

This module provides unified configuration management, supporting settings
loaded from configuration files, environment variables, and command-line
arguments. Settings include paths, performance parameters, logging options,
and more.

Author: Re-Centris team
Version: 1.0.0
License: MIT
"""

import os
import json
import yaml
import logging
from pathlib import Path
from typing import Dict, Any, Optional, Union


class ConfigManager:
    """Configuration manager responsible for loading, validating, and serving settings."""

    def __init__(self, config_file: Optional[str] = None):
        """Initialize the configuration manager.

        Args:
            config_file: configuration file path; if None, default locations are tried
        """
        self.config: Dict[str, Any] = {}
        self.config_file = config_file
        self._load_default_config()

        if config_file:
            self.load_config(config_file)
        else:
            # Try loading configuration from default locations
            default_locations = [
                "./config.yaml",
                "./config.json",
                os.path.expanduser("~/.re-centris/config.yaml"),
                os.path.expanduser("~/.re-centris/config.json"),
                "/etc/re-centris/config.yaml",
                "/etc/re-centris/config.json"
            ]

            for location in default_locations:
                if os.path.exists(location):
                    self.load_config(location)
                    break

        # Load configuration from environment variables
        self._load_from_env()

    def _load_default_config(self) -> None:
        """Load the default configuration."""
        # Get the current working directory
        current_dir = os.getcwd()

        self.config = {
            "paths": {
                "current_path": current_dir,
                "repo_path": os.path.join(current_dir, "repos"),
                "tag_date_path": os.path.join(current_dir, "osscollector", "repo_date"),
                "result_path": os.path.join(current_dir, "osscollector", "repo_functions"),
                "log_path": os.path.join(current_dir, "logs"),
                "ver_idx_path": os.path.join(current_dir, "preprocessor", "verIDX"),
                "initial_db_path": os.path.join(current_dir, "preprocessor", "initialSigs"),
                "final_db_path": os.path.join(current_dir, "preprocessor", "componentDB"),
                "meta_path": os.path.join(current_dir, "preprocessor", "metaInfos"),
                "weight_path": os.path.join(current_dir, "preprocessor", "metaInfos", "weights"),
                "func_date_path": os.path.join(current_dir, "preprocessor", "funcDate"),
                "cache_path": os.path.join(current_dir, "cache")
            },
            "performance": {
                "max_workers": os.cpu_count(),
                "cache_size": 1000,
                "cache_expire": 3600,  # 1 hour
                "memory_limit": 0.9,   # 90%
                "timeout": 30,         # 30 seconds
                "batch_size": 1000
            },
            "logging": {
                "level": "INFO",
                "max_size": 10 * 1024 * 1024,  # 10MB
                "backup_count": 5
            },
            "analysis": {
                "theta": 0.1,          # Similarity threshold
                "tlsh_threshold": 30   # TLSH difference threshold
            },
            "external_tools": {
                "ctags_path": "ctags"  # Looked up on PATH by default
            }
        }

    def load_config(self, config_file: str) -> None:
        """Load configuration from a file.

        Args:
            config_file: configuration file path

        Raises:
            FileNotFoundError: the configuration file does not exist
            ValueError: the configuration file format is not supported
        """
        if not os.path.exists(config_file):
            raise FileNotFoundError(f"Configuration file does not exist: {config_file}")

        try:
            ext = os.path.splitext(config_file)[1].lower()

            if ext == '.json':
                with open(config_file, 'r', encoding='utf-8') as f:
                    file_config = json.load(f)
            elif ext in ['.yaml', '.yml']:
                with open(config_file, 'r', encoding='utf-8') as f:
                    file_config = yaml.safe_load(f)
            else:
                raise ValueError(f"Unsupported configuration file format: {ext}")

            # Recursively merge the configuration
            self._update_config(self.config, file_config)

            logging.info(f"Configuration loaded from {config_file}")

        except Exception as e:
            logging.error(f"Failed to load configuration file: {e}")
            raise

    def _update_config(self, target: Dict, source: Dict) -> None:
        """Recursively update a configuration dictionary.

        Args:
            target: target configuration dictionary
            source: source configuration dictionary
        """
        for key, value in source.items():
            if key in target and isinstance(target[key], dict) and isinstance(value, dict):
                self._update_config(target[key], value)
            else:
                target[key] = value

    def _load_from_env(self) -> None:
        """Load configuration from environment variables.

        Environment variable format: RECENTRIS_SECTION_KEY=value
        For example: RECENTRIS_PATHS_REPO_PATH=/path/to/repos
        """
        prefix = "RECENTRIS_"

        for key, value in os.environ.items():
            if key.startswith(prefix):
                parts = key[len(prefix):].lower().split('_')

                if len(parts) >= 2:
                    section = parts[0]
                    subkey = '_'.join(parts[1:])

                    if section in self.config:
                        if subkey in self.config[section]:
                            # Try to convert the value to the existing setting's type
                            orig_value = self.config[section][subkey]
                            if isinstance(orig_value, bool):
                                self.config[section][subkey] = value.lower() in ['true', '1', 'yes']
                            elif isinstance(orig_value, int):
                                self.config[section][subkey] = int(value)
                            elif isinstance(orig_value, float):
                                self.config[section][subkey] = float(value)
                            else:
                                self.config[section][subkey] = value

    def get(self, section: str, key: str, default: Any = None) -> Any:
        """Get a configuration value.

        Args:
            section: configuration section
            key: configuration key
            default: value returned when the setting does not exist

        Returns:
            Configuration value
        """
        if section in self.config and key in self.config[section]:
            return self.config[section][key]
        return default

    def set(self, section: str, key: str, value: Any) -> None:
        """Set a configuration value.

        Args:
            section: configuration section
            key: configuration key
            value: configuration value
        """
        if section not in self.config:
            self.config[section] = {}

        self.config[section][key] = value

    def get_path(self, key: str) -> str:
        """Get a path setting.

        Args:
            key: path key name

        Returns:
            Path string

        Raises:
            KeyError: the path key is not configured
        """
        path = self.get("paths", key)
        if not path:
            raise KeyError(f"Path not configured: {key}")
        # Make sure the directory exists
        os.makedirs(path, exist_ok=True)
        return path

    def save_config(self, config_file: Optional[str] = None) -> None:
        """Save the configuration to a file.

        Args:
            config_file: configuration file path; if None, the path given at
                initialization is used
        """
        if config_file is None:
            config_file = self.config_file

        if not config_file:
            raise ValueError("No configuration file path specified")

        try:
            # Make sure the directory exists
            os.makedirs(os.path.dirname(os.path.abspath(config_file)), exist_ok=True)

            ext = os.path.splitext(config_file)[1].lower()

            if ext == '.json':
                with open(config_file, 'w', encoding='utf-8') as f:
                    json.dump(self.config, f, indent=2, ensure_ascii=False)
            elif ext in ['.yaml', '.yml']:
                with open(config_file, 'w', encoding='utf-8') as f:
                    yaml.dump(self.config, f, default_flow_style=False, allow_unicode=True)
            else:
                raise ValueError(f"Unsupported configuration file format: {ext}")

            logging.info(f"Configuration saved to {config_file}")

        except Exception as e:
            logging.error(f"Failed to save configuration file: {e}")
            raise

    def create_required_directories(self) -> None:
        """Create all required directories."""
        for key, path in self.config["paths"].items():
            if isinstance(path, str) and not os.path.exists(path):
                try:
                    os.makedirs(path)
                    logging.info(f"Created directory: {path}")
                except Exception as e:
                    logging.error(f"Failed to create directory {path}: {e}")
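A minimal usage sketch for the module above (assumes a `config.yaml` in the working directory and the project root on `sys.path`):

```python
# Load settings, read a value with a fallback, and resolve a path.
from core.config_manager import ConfigManager

cfg = ConfigManager("config.yaml")
print(cfg.get("analysis", "theta", default=0.1))

repo_path = cfg.get_path("repo_path")  # also creates the directory if missing
print(repo_path)
```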
--------------------------------------------------------------------------------
/core/logger.py:
--------------------------------------------------------------------------------
"""Logging module

This module provides unified logging configuration and management, supporting
file and console logging, as well as log rotation, level control, and
formatting.

Author: Re-Centris team
Version: 1.0.0
License: MIT
"""

import os
import sys
import logging
import datetime
from logging.handlers import RotatingFileHandler
from typing import Optional, Dict, Any, Union


def setup_logger(
    name: str = "re-centris",
    log_file: Optional[str] = None,
    log_level: Union[int, str] = logging.INFO,
    max_size: int = 10 * 1024 * 1024,  # 10MB
    backup_count: int = 5,
    console: bool = True,
    format_str: Optional[str] = None
) -> logging.Logger:
    """Set up a logger.

    Args:
        name: logger name
        log_file: log file path; if None, no file logging is performed
        log_level: log level, as an integer or a string
        max_size: maximum log file size (bytes)
        backup_count: number of log file backups
        console: whether to also log to the console
        format_str: log format string; if None, a default format is used

    Returns:
        The configured logger
    """
    # Convert the log level
    if isinstance(log_level, str):
        log_level = getattr(logging, log_level.upper(), logging.INFO)

    # Create the logger
    logger = logging.getLogger(name)
    logger.setLevel(log_level)

    # Remove existing handlers
    for handler in logger.handlers[:]:
        logger.removeHandler(handler)

    # Set the log format
    if format_str is None:
        format_str = "%(asctime)s - %(levelname)s - %(name)s - %(message)s"
    formatter = logging.Formatter(format_str)

    # Add a file handler
    if log_file:
        # Make sure the log directory exists
        log_dir = os.path.dirname(log_file)
        if log_dir and not os.path.exists(log_dir):
            os.makedirs(log_dir)

        file_handler = RotatingFileHandler(
            log_file,
            maxBytes=max_size,
            backupCount=backup_count,
            encoding="utf-8"
        )
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)

    # Add a console handler
    if console:
        console_handler = logging.StreamHandler()
        console_handler.setFormatter(formatter)
        logger.addHandler(console_handler)

    return logger


def get_module_logger(module_name: str) -> logging.Logger:
    """Get a module logger.

    Args:
        module_name: module name

    Returns:
        Module logger
    """
    return logging.getLogger(f"re-centris.{module_name}")


class LoggerAdapter(logging.LoggerAdapter):
    """Logger adapter used to attach context information."""

    def __init__(self, logger: logging.Logger, extra: Optional[Dict[str, Any]] = None):
        """Initialize the logger adapter.

        Args:
            logger: logger
            extra: extra context information
        """
        super().__init__(logger, extra or {})

    def process(self, msg: str, kwargs: Dict[str, Any]) -> tuple:
        """Process a log message, attaching context information.

        Args:
            msg: log message
            kwargs: keyword arguments

        Returns:
            The processed message and keyword arguments
        """
        context_str = " ".join(f"[{k}={v}]" for k, v in self.extra.items())
        if context_str:
            msg = f"{context_str} {msg}"
        return msg, kwargs


def create_context_logger(
    logger: logging.Logger,
    context: Dict[str, Any]
) -> LoggerAdapter:
    """Create a logger with attached context.

    Args:
        logger: base logger
        context: context information

    Returns:
        Logger adapter with attached context
    """
    return LoggerAdapter(logger, context)
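A minimal usage sketch for the module above (the log file path is an illustrative assumption):

```python
# Configure the root project logger and derive a context-aware one.
from core.logger import setup_logger, create_context_logger

logger = setup_logger("re-centris", log_file="logs/app.log", log_level="DEBUG")
logger.info("logger ready")

ctx_logger = create_context_logger(logger, {"repo": "redis"})
ctx_logger.info("cloning started")  # -> "[repo=redis] cloning started"
```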
--------------------------------------------------------------------------------
/core/memory_optimizer.py:
--------------------------------------------------------------------------------
"""Memory optimizer module

This module provides memory usage optimization, including memory monitoring,
batched data processing, automatic garbage collection, and memory limits.

Author: Re-Centris team
Version: 1.0.0
License: MIT
"""

import os
import gc
import sys
import time
import logging
import threading
from typing import List, Any, Callable, Generator, TypeVar, Generic, Optional

# Module-level logger
logger = logging.getLogger("re-centris.memory_optimizer")

# Generic type variables
T = TypeVar('T')
R = TypeVar('R')


class MemoryOptimizer:
    """Memory optimizer providing memory usage optimization."""

    def __init__(
        self,
        target_memory_usage: float = 0.8,
        initial_batch_size: int = 1000,
        min_batch_size: int = 100,
        max_batch_size: int = 10000,
        check_interval: int = 10
    ):
        """Initialize the memory optimizer.

        Args:
            target_memory_usage: target memory usage ratio (0.0-1.0)
            initial_batch_size: initial batch size
            min_batch_size: minimum batch size
            max_batch_size: maximum batch size
            check_interval: memory check interval (seconds)
        """
        self.target_memory_usage = target_memory_usage
        self.current_batch_size = initial_batch_size
        self.min_batch_size = min_batch_size
        self.max_batch_size = max_batch_size
        self.check_interval = check_interval
        self._lock = threading.Lock()
        self._last_check_time = 0
        self._last_gc_time = 0

    def get_memory_usage(self) -> float:
        """Get the current memory usage ratio.

        Returns:
            Memory usage ratio (0.0-1.0)
        """
        try:
            import psutil
            process = psutil.Process()
            return process.memory_percent() / 100
        except ImportError:
            # Without psutil, fall back to a rough memory estimate
            if hasattr(sys, 'getsizeof'):
                # Estimate the memory used by the Python interpreter
                memory_used = 0
                for obj in gc.get_objects():
                    try:
                        memory_used += sys.getsizeof(obj)
                    except Exception:
                        pass
                # Estimate the total memory
                try:
                    with open('/proc/meminfo', 'r') as f:
                        for line in f:
                            if 'MemTotal' in line:
                                total_memory = int(line.split()[1]) * 1024
                                return memory_used / total_memory
                except Exception:
                    pass
            return 0.5  # Default to a medium memory usage ratio
    def should_gc(self) -> bool:
        """Decide whether garbage collection should run.

        Returns:
            Whether garbage collection should run
        """
        current_time = time.time()

        # Check at most once per check interval
        if current_time - self._last_check_time < self.check_interval:
            return False

        self._last_check_time = current_time

        # Check the memory usage ratio
        memory_usage = self.get_memory_usage()

        # If usage exceeds the target, run garbage collection
        if memory_usage > self.target_memory_usage:
            # Run garbage collection at most once every 30 seconds
            if current_time - self._last_gc_time >= 30:
                self._last_gc_time = current_time
                return True

        return False

    def optimize(self) -> None:
        """Run memory optimization."""
        if self.should_gc():
            logger.debug("Running garbage collection")
            gc.collect()

    def adjust_batch_size(self) -> int:
        """Adjust the batch size based on current memory usage.

        Returns:
            The adjusted batch size
        """
        with self._lock:
            memory_usage = self.get_memory_usage()

            if memory_usage > self.target_memory_usage:
                # Memory usage is too high; shrink the batch size
                self.current_batch_size = max(
                    self.min_batch_size,
                    int(self.current_batch_size * 0.8)
                )
            elif memory_usage < self.target_memory_usage * 0.7:
                # Memory usage is low; grow the batch size
                self.current_batch_size = min(
                    self.max_batch_size,
                    int(self.current_batch_size * 1.2)
                )

            return self.current_batch_size

    def batch_items(self, items: List[T]) -> Generator[List[T], None, None]:
        """Yield the data in batches.

        Args:
            items: list of data items

        Yields:
            Batches of data
        """
        for i in range(0, len(items), self.current_batch_size):
            batch = items[i:i + self.current_batch_size]
            yield batch

            # Optimize memory
            self.optimize()

            # Adjust the batch size
            self.adjust_batch_size()

    def process_in_batches(
        self,
        items: List[T],
        processor: Callable[[List[T]], List[R]]
    ) -> List[R]:
        """Process the data in batches and merge the results.

        Args:
            items: list of data items
            processor: processing function taking one batch and returning results

        Returns:
            The merged results of all batches
        """
        results = []

        for batch in self.batch_items(items):
            batch_results = processor(batch)
            results.extend(batch_results)

        return results

    def monitor_memory(
        self,
        interval: int = 60,
        callback: Optional[Callable[[float], None]] = None
    ) -> threading.Thread:
        """Start a memory monitoring thread.

        Args:
            interval: monitoring interval (seconds)
            callback: callback receiving the current memory usage ratio

        Returns:
            The monitoring thread
        """
        def _monitor():
            while True:
                try:
                    memory_usage = self.get_memory_usage()

                    if callback:
                        callback(memory_usage)
                    else:
                        logger.info(f"Current memory usage: {memory_usage:.2%}")

                    # If memory usage is too high, run garbage collection
                    if memory_usage > self.target_memory_usage:
                        logger.warning(f"Memory usage too high: {memory_usage:.2%}")
                        gc.collect()

                    time.sleep(interval)
                except Exception as e:
                    logger.error(f"Memory monitoring error: {e}")
                    time.sleep(interval)

        thread = threading.Thread(target=_monitor, daemon=True)
        thread.start()
        return thread
class BatchProcessor(Generic[T, R]):
    """Batch processor for efficiently handling large amounts of data."""

    def __init__(
        self,
        processor: Callable[[T], R],
        batch_size: int = 1000,
        memory_optimizer: Optional[MemoryOptimizer] = None
    ):
        """Initialize the batch processor.

        Args:
            processor: processing function taking one item and returning a result
            batch_size: batch size
            memory_optimizer: memory optimizer; a new one is created if None
        """
        self.processor = processor
        self.batch_size = batch_size
        self.memory_optimizer = memory_optimizer or MemoryOptimizer()

    def process(self, items: List[T]) -> List[R]:
        """Process a list of data items.

        Args:
            items: list of data items

        Returns:
            List of results
        """
        results = []

        for batch in self.memory_optimizer.batch_items(items):
            batch_results = [self.processor(item) for item in batch]
            results.extend(batch_results)

            # Optimize memory
            self.memory_optimizer.optimize()

        return results

    def process_generator(self, items_generator: Generator[T, None, None]) -> Generator[R, None, None]:
        """Process a generator of data items.

        Args:
            items_generator: generator of data items

        Yields:
            Results
        """
        batch = []

        for item in items_generator:
            batch.append(item)

            if len(batch) >= self.batch_size:
                for result in self._process_batch(batch):
                    yield result
                batch = []

                # Optimize memory
                self.memory_optimizer.optimize()

        # Process the remaining items
        if batch:
            for result in self._process_batch(batch):
                yield result

    def _process_batch(self, batch: List[T]) -> List[R]:
        """Process a single batch.

        Args:
            batch: batch of data

        Returns:
            Results
        """
        return [self.processor(item) for item in batch]
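A minimal usage sketch for the module above (psutil is optional; without it the fallback estimate is used):

```python
# Process a large list in adaptive batches, then per-item via BatchProcessor.
from core.memory_optimizer import MemoryOptimizer, BatchProcessor

optimizer = MemoryOptimizer(target_memory_usage=0.8, initial_batch_size=500)

items = list(range(10_000))
doubled = optimizer.process_in_batches(items, lambda batch: [x * 2 for x in batch])

processor = BatchProcessor(lambda x: x + 1, batch_size=1000, memory_optimizer=optimizer)
print(len(processor.process(items)))  # 10000
```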
--------------------------------------------------------------------------------
/core/parallel_manager.py:
--------------------------------------------------------------------------------
import os
import logging
import multiprocessing
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
from typing import List, Callable, Any, Dict, Optional, Union
from functools import partial

logger = logging.getLogger(__name__)

class ParallelManager:
    """Parallel processing manager."""

    def __init__(self, max_workers: Optional[int] = None):
        """Initialize the parallel processing manager.

        Args:
            max_workers: maximum number of workers; defaults to the CPU core count
        """
        self.max_workers = max_workers or multiprocessing.cpu_count()
        self._process_pools: Dict[str, ProcessPoolExecutor] = {}
        self._thread_pools: Dict[str, ThreadPoolExecutor] = {}

    def _get_executor(self, pool_name: str, use_threads: bool):
        """Get or create the executor for a pool name."""
        executor_cls = ThreadPoolExecutor if use_threads else ProcessPoolExecutor
        executor_dict = self._thread_pools if use_threads else self._process_pools

        if pool_name not in executor_dict:
            executor_dict[pool_name] = executor_cls(max_workers=self.max_workers)
        return executor_dict[pool_name]

    def process_items(self,
                     items: List[Any],
                     process_func: Callable,
                     pool_name: str = "default",
                     chunk_size: Optional[int] = None,
                     use_threads: bool = False,
                     **kwargs) -> List[Any]:
        """Process a list of items in parallel.

        Args:
            items: items to process
            process_func: processing function (receives one chunk of items)
            pool_name: pool name
            chunk_size: chunk size
            use_threads: whether to use a thread pool
            **kwargs: extra arguments passed to the processing function

        Returns:
            List of results
        """
        if not items:
            return []

        # Determine the chunk size
        if chunk_size is None:
            chunk_size = max(1, len(items) // (self.max_workers * 4))

        # Prepare the tasks
        chunked_items = [items[i:i + chunk_size] for i in range(0, len(items), chunk_size)]
        partial_func = partial(process_func, **kwargs)

        executor = self._get_executor(pool_name, use_threads)

        results = []
        try:
            # Submit the tasks
            futures = [
                executor.submit(partial_func, chunk)
                for chunk in chunked_items
            ]

            # Collect the results
            for future in as_completed(futures):
                try:
                    result = future.result()
                    if isinstance(result, list):
                        results.extend(result)
                    else:
                        results.append(result)
                except Exception as e:
                    logger.error(f"Task failed: {str(e)}")

        except Exception as e:
            logger.error(f"Parallel processing failed: {str(e)}")

        return results

    def process_items_with_progress(self,
                                  items: List[Any],
                                  process_func: Callable,
                                  progress_callback: Callable[[int, int], None],
                                  pool_name: str = "default",
                                  chunk_size: Optional[int] = None,
                                  use_threads: bool = False,
                                  **kwargs) -> List[Any]:
        """Parallel processing with a progress callback.

        The callback is invoked as each chunk completes, not after the whole
        run, so it can drive a live progress display.

        Args:
            items: items to process
            process_func: processing function (receives one chunk of items)
            progress_callback: progress callback, called as (processed, total)
            pool_name: pool name
            chunk_size: chunk size
            use_threads: whether to use a thread pool
            **kwargs: extra arguments passed to the processing function

        Returns:
            List of results
        """
        if not items:
            return []

        if chunk_size is None:
            chunk_size = max(1, len(items) // (self.max_workers * 4))

        chunked_items = [items[i:i + chunk_size] for i in range(0, len(items), chunk_size)]
        partial_func = partial(process_func, **kwargs)

        executor = self._get_executor(pool_name, use_threads)

        total_items = len(items)
        processed_items = 0
        results = []

        futures = [executor.submit(partial_func, chunk) for chunk in chunked_items]

        for future in as_completed(futures):
            try:
                result = future.result()
                if isinstance(result, list):
                    results.extend(result)
                    processed_items += len(result)
                else:
                    results.append(result)
                    processed_items += 1
                progress_callback(processed_items, total_items)
            except Exception as e:
                logger.error(f"Task failed: {str(e)}")

        return results

    def close_pool(self, pool_name: str, use_threads: bool = False):
        """Close the specified process or thread pool.

        Args:
            pool_name: pool name
            use_threads: whether this is a thread pool
        """
        pool_dict = self._thread_pools if use_threads else self._process_pools
        if pool_name in pool_dict:
            pool_dict[pool_name].shutdown()
            del pool_dict[pool_name]

    def close_all(self):
        """Close all process and thread pools."""
        for pool in list(self._process_pools.values()):
            pool.shutdown()
        self._process_pools.clear()

        for pool in list(self._thread_pools.values()):
            pool.shutdown()
        self._thread_pools.clear()
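A minimal usage sketch for the module above. Note that `process_func` receives a *chunk* (a list of items); threads are used here so the worker can be a local function without pickling concerns:

```python
# Square 1000 numbers in parallel chunks.
from core.parallel_manager import ParallelManager

def square_chunk(chunk):
    return [x * x for x in chunk]

manager = ParallelManager(max_workers=4)
try:
    results = manager.process_items(list(range(1000)), square_chunk, use_threads=True)
    print(len(results))  # 1000
finally:
    manager.close_all()
```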
--------------------------------------------------------------------------------
/core/resource_manager.py:
--------------------------------------------------------------------------------
"""Resource manager module

This module provides unified resource management for file handles, process
pools, thread pools, and other resources, making sure they are released after
use and avoiding resource leaks.

Author: Re-Centris team
Version: 1.0.0
License: MIT
"""

import os
import logging
import threading
import multiprocessing
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
from typing import Dict, Any, Optional, Tuple, Union, Set

# Module-level logger
logger = logging.getLogger("re-centris.resource_manager")


class ResourceManager:
    """Resource manager responsible for managing and releasing resources."""

    def __init__(self):
        """Initialize the resource manager."""
        self._file_handles: Dict[Tuple[str, str], Any] = {}
        self._process_pools: Dict[str, ProcessPoolExecutor] = {}
        self._thread_pools: Dict[str, ThreadPoolExecutor] = {}
        self._resources: Dict[str, Any] = {}
        self._lock = threading.Lock()

    def __enter__(self):
        """Context manager entry."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Release all resources on context manager exit."""
        self.close_all()

    def get_file_handle(self, path: str, mode: str = 'r', encoding: Optional[str] = None) -> Any:
        """Get a file handle.

        Args:
            path: file path
            mode: open mode
            encoding: file encoding

        Returns:
            File handle
        """
        with self._lock:
            key = (path, mode)
            if key not in self._file_handles:
                try:
                    if encoding:
                        self._file_handles[key] = open(path, mode, encoding=encoding)
                    else:
                        self._file_handles[key] = open(path, mode)
                except Exception as e:
                    logger.error(f"Failed to open file {path}: {e}")
                    raise
            return self._file_handles[key]

    def close_file(self, path: str, mode: str = 'r') -> None:
        """Close a file handle.

        Args:
            path: file path
            mode: open mode
        """
        with self._lock:
            key = (path, mode)
            if key in self._file_handles:
                try:
                    self._file_handles[key].close()
                except Exception as e:
                    logger.warning(f"Failed to close file {path}: {e}")
                finally:
                    del self._file_handles[key]

    def get_process_pool(self, name: str = "default", max_workers: Optional[int] = None) -> ProcessPoolExecutor:
        """Get a process pool.

        Args:
            name: process pool name
            max_workers: maximum number of worker processes; defaults to the CPU core count

        Returns:
            Process pool
        """
        with self._lock:
            if name not in self._process_pools:
                if max_workers is None:
                    max_workers = multiprocessing.cpu_count()
                self._process_pools[name] = ProcessPoolExecutor(max_workers=max_workers)
            return self._process_pools[name]

    def get_thread_pool(self, name: str = "default", max_workers: Optional[int] = None) -> ThreadPoolExecutor:
        """Get a thread pool.

        Args:
            name: thread pool name
            max_workers: maximum number of worker threads; defaults to 5x the CPU core count

        Returns:
            Thread pool
        """
        with self._lock:
            if name not in self._thread_pools:
                if max_workers is None:
                    max_workers = multiprocessing.cpu_count() * 5
                self._thread_pools[name] = ThreadPoolExecutor(max_workers=max_workers)
            return self._thread_pools[name]

    def register_resource(self, name: str, resource: Any, close_method: str = "close") -> None:
        """Register a custom resource.

        Args:
            name: resource name
            resource: resource object
            close_method: name of the method that closes the resource
        """
        with self._lock:
            if name in self._resources:
                logger.warning(f"Resource {name} already exists and will be overwritten")

            self._resources[name] = (resource, close_method)

    def get_resource(self, name: str) -> Optional[Any]:
"""获取自定义资源 132 | 133 | Args: 134 | name: 资源名称 135 | 136 | Returns: 137 | 资源对象,如果不存在则返回None 138 | """ 139 | with self._lock: 140 | if name in self._resources: 141 | return self._resources[name][0] 142 | return None 143 | 144 | def close_resource(self, name: str) -> bool: 145 | """关闭自定义资源 146 | 147 | Args: 148 | name: 资源名称 149 | 150 | Returns: 151 | 是否成功关闭 152 | """ 153 | with self._lock: 154 | if name in self._resources: 155 | resource, close_method = self._resources[name] 156 | try: 157 | getattr(resource, close_method)() 158 | del self._resources[name] 159 | return True 160 | except Exception as e: 161 | logger.warning(f"关闭资源 {name} 失败: {e}") 162 | return False 163 | 164 | def close_all(self) -> None: 165 | """关闭所有资源""" 166 | with self._lock: 167 | # 关闭文件句柄 168 | for key, handle in list(self._file_handles.items()): 169 | try: 170 | handle.close() 171 | except Exception as e: 172 | logger.warning(f"关闭文件失败 {key[0]}: {e}") 173 | self._file_handles.clear() 174 | 175 | # 关闭进程池 176 | for name, pool in list(self._process_pools.items()): 177 | try: 178 | pool.shutdown() 179 | except Exception as e: 180 | logger.warning(f"关闭进程池 {name} 失败: {e}") 181 | self._process_pools.clear() 182 | 183 | # 关闭线程池 184 | for name, pool in list(self._thread_pools.items()): 185 | try: 186 | pool.shutdown() 187 | except Exception as e: 188 | logger.warning(f"关闭线程池 {name} 失败: {e}") 189 | self._thread_pools.clear() 190 | 191 | # 关闭自定义资源 192 | for name, (resource, close_method) in list(self._resources.items()): 193 | try: 194 | getattr(resource, close_method)() 195 | except Exception as e: 196 | logger.warning(f"关闭资源 {name} 失败: {e}") 197 | self._resources.clear() 198 | 199 | def __del__(self): 200 | """析构时关闭所有资源""" 201 | self.close_all() 202 | 203 | 204 | class SafeFileHandler: 205 | """安全的文件处理器,自动处理文件打开和关闭""" 206 | 207 | def __init__(self, path: str, mode: str = 'r', encoding: Optional[str] = None): 208 | """初始化安全文件处理器 209 | 210 | Args: 211 | path: 文件路径 212 | mode: 打开模式 213 | encoding: 文件编码 214 | """ 215 | self.path = path 216 | self.mode = mode 217 | self.encoding = encoding 218 | self.file = None 219 | 220 | def __enter__(self): 221 | """上下文管理器入口""" 222 | try: 223 | if self.encoding: 224 | self.file = open(self.path, self.mode, encoding=self.encoding) 225 | else: 226 | self.file = open(self.path, self.mode) 227 | return self.file 228 | except Exception as e: 229 | logger.error(f"打开文件失败 {self.path}: {e}") 230 | raise 231 | 232 | def __exit__(self, exc_type, exc_val, exc_tb): 233 | """上下文管理器退出时关闭文件""" 234 | if self.file: 235 | try: 236 | self.file.close() 237 | except Exception as e: 238 | logger.warning(f"关闭文件失败 {self.path}: {e}") 239 | 240 | 241 | def safe_open(path: str, mode: str = 'r', encoding: Optional[str] = None) -> SafeFileHandler: 242 | """安全打开文件 243 | 244 | Args: 245 | path: 文件路径 246 | mode: 打开模式 247 | encoding: 文件编码 248 | 249 | Returns: 250 | 安全文件处理器 251 | """ 252 | return SafeFileHandler(path, mode, encoding) -------------------------------------------------------------------------------- /detector/__init__.py: -------------------------------------------------------------------------------- 1 | """Re-Centris 检测器包 - 基于TLSH的代码克隆和依赖关系检测工具。 2 | 3 | 主要功能: 4 | 1. 代码克隆检测 - 使用TLSH算法检测代码克隆 5 | 2. 依赖关系分析 - 分析组件间的依赖关系 6 | 3. 
--------------------------------------------------------------------------------
/detector/__init__.py:
--------------------------------------------------------------------------------
"""Re-Centris detector package - TLSH-based code clone and dependency detection tool.

Main features:
1. Code clone detection - detect code clones using the TLSH algorithm
2. Dependency analysis - analyze dependencies between components
3. Version prediction - predict the component versions in use

Author: byRen2002
Modified: March 2025
License: MIT License
"""

from .Detector import Detector

__all__ = ['Detector']
--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
version: '3.8'

services:
  web:
    build: .
    ports:
      - "8000:8000"
    volumes:
      - .:/app
    environment:
      - FLASK_ENV=production
      - FLASK_APP=app.py
    depends_on:
      - redis
      - postgres
    networks:
      - re-centris-network
    deploy:
      replicas: 2
      restart_policy:
        condition: on-failure
        max_attempts: 3
      resources:
        limits:
          cpus: '1'
          memory: 1G
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 40s

  redis:
    image: redis:alpine
    ports:
      - "6379:6379"
    volumes:
      - redis-data:/data
    networks:
      - re-centris-network
    deploy:
      resources:
        limits:
          cpus: '0.5'
          memory: 500M

  postgres:
    image: postgres:13-alpine
    ports:
      - "5432:5432"
    volumes:
      - postgres-data:/var/lib/postgresql/data
    environment:
      - POSTGRES_DB=re_centris
      - POSTGRES_USER=re_centris
      - POSTGRES_PASSWORD=${DB_PASSWORD}
    networks:
      - re-centris-network
    deploy:
      resources:
        limits:
          cpus: '1'
          memory: 1G

  prometheus:
    image: prom/prometheus
    ports:
      - "9090:9090"
    volumes:
      - ./prometheus.yml:/etc/prometheus/prometheus.yml
      - prometheus-data:/prometheus
    networks:
      - re-centris-network
    deploy:
      resources:
        limits:
          cpus: '0.5'
          memory: 500M

  grafana:
    image: grafana/grafana
    ports:
      - "3000:3000"
    volumes:
      - grafana-data:/var/lib/grafana
    depends_on:
      - prometheus
    networks:
      - re-centris-network
    deploy:
      resources:
        limits:
          cpus: '0.5'
          memory: 500M

volumes:
  redis-data:
  postgres-data:
  prometheus-data:
  grafana-data:

networks:
  re-centris-network:
    driver: bridge
--------------------------------------------------------------------------------
/osscollector/sample:
--------------------------------------------------------------------------------
git clone https://github.com/redis/redis.git
git clone https://github.com/torvalds/linux.git
git clone https://github.com/git/git.git
git clone https://github.com/openssl/openssl.git
git clone https://github.com/tensorflow/tensorflow.git
git clone https://github.com/electron/electron.git
git clone https://github.com/microsoft/terminal.git
git clone https://github.com/apple/swift.git
git clone https://github.com/opencv/opencv.git
git clone https://github.com/bitcoin/bitcoin.git
--------------------------------------------------------------------------------
/preprocessor/__init__.py:
--------------------------------------------------------------------------------
"""Re-Centris preprocessor package

This package provides functionality for preprocessing the function information
collected from open-source code repositories.

Author: byRen2002
Modified: March 2025
License: MIT
"""

from .preprocessor import Preprocessor

__all__ = ['Preprocessor']
--------------------------------------------------------------------------------
/preprocessor/language_processors/java_processor.py:
--------------------------------------------------------------------------------
"""Java language processor

This module implements parsing and processing of Java code.

Author: byRen2002
Modified: March 2025
License: MIT License
"""

import os
import re
import javalang
from typing import Dict, List, Tuple, Optional, Any
import logging

class JavaProcessor:
    """Java code processor."""

    def __init__(self):
        """Initialize the Java processor."""
        self.method_pattern = re.compile(
            r'(?:public|private|protected|static|\s) +[\w\<\>\[\]]+\s+(\w+) *\([^\)]*\) *\{?[^\{]*$'
        )

    def extract_methods(self, file_path: str) -> List[Dict[str, str]]:
        """Extract the methods from a Java file.

        Args:
            file_path: Java file path

        Returns:
            List of methods; each entry contains the name, content, start line, etc.
        """
        methods = []
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()

            # Parse the Java code with javalang
            tree = javalang.parse.parse(content)

            for _, node in tree.filter(javalang.tree.MethodDeclaration):
                method = {
                    'name': node.name,
                    'content': self._get_method_content(content, node),
                    'start_line': node.position.line if node.position else 0,
                    'modifiers': [str(mod) for mod in node.modifiers],
                    'return_type': self._get_return_type(node),
                    'parameters': self._get_parameters(node)
                }
                methods.append(method)

        except Exception as e:
            logging.error(f"Error while processing Java file {file_path}: {e}")

        return methods

    def _get_method_content(self, content: str, node: javalang.tree.MethodDeclaration) -> str:
        """Get the full content of a method."""
        try:
            lines = content.splitlines()
            start_line = node.position.line - 1

            # Find the end of the method body
            end_line = start_line
            brace_count = 0
            found_first_brace = False

            for i, line in enumerate(lines[start_line:], start_line):
                if '{' in line:
                    brace_count += line.count('{')
                    found_first_brace = True
                if '}' in line:
                    brace_count -= line.count('}')

                if found_first_brace and brace_count == 0:
                    end_line = i
                    break

            return '\n'.join(lines[start_line:end_line + 1])

        except Exception as e:
            logging.error(f"Error while extracting method content: {e}")
            return ""

    def _get_return_type(self, node: javalang.tree.MethodDeclaration) -> str:
        """Get the return type of a method."""
        try:
            return str(node.return_type.name) if node.return_type else "void"
        except Exception:
            return "void"

    def _get_parameters(self, node: javalang.tree.MethodDeclaration) -> List[Dict[str, str]]:
        """Get the parameter list of a method."""
        params = []
        try:
            for param in node.parameters:
                params.append({
                    'name': param.name,
                    'type': str(param.type.name)
                })
        except Exception:
            pass
        return params
'{' in line: 140 | current_depth += 1 141 | max_depth = max(max_depth, current_depth) 142 | if '}' in line: 143 | current_depth -= 1 144 | 145 | metrics['nesting_depth'] = max_depth 146 | 147 | # 计算认知复杂度 148 | metrics['cognitive_complexity'] = ( 149 | metrics['cyclomatic_complexity'] + 150 | metrics['nesting_depth'] 151 | ) 152 | 153 | except Exception as e: 154 | logging.error(f"分析方法复杂度时出错: {e}") 155 | 156 | return metrics 157 | 158 | def extract_class_info(self, file_path: str) -> Dict[str, any]: 159 | """提取类信息 160 | 161 | 参数: 162 | file_path: Java文件路径 163 | 164 | 返回: 165 | 包含类名、包名、导入等信息的字典 166 | """ 167 | class_info = { 168 | 'name': '', 169 | 'package': '', 170 | 'imports': [], 171 | 'extends': None, 172 | 'implements': [], 173 | 'modifiers': [] 174 | } 175 | 176 | try: 177 | with open(file_path, 'r', encoding='utf-8') as f: 178 | content = f.read() 179 | 180 | tree = javalang.parse.parse(content) 181 | 182 | # 获取包名 183 | if tree.package: 184 | class_info['package'] = str(tree.package.name) 185 | 186 | # 获取导入 187 | class_info['imports'] = [ 188 | str(imp.path) for imp in tree.imports 189 | ] 190 | 191 | # 获取类信息 192 | for path, node in tree.filter(javalang.tree.ClassDeclaration): 193 | class_info['name'] = node.name 194 | class_info['modifiers'] = [str(mod) for mod in node.modifiers] 195 | 196 | if node.extends: 197 | class_info['extends'] = str(node.extends.name) 198 | 199 | if node.implements: 200 | class_info['implements'] = [ 201 | str(impl.name) for impl in node.implements 202 | ] 203 | break # 只处理第一个类 204 | 205 | except Exception as e: 206 | logging.error(f"提取类信息时出错: {e}") 207 | 208 | return class_info 209 | 210 | def get_method_signature(self, method: Dict[str, str]) -> str: 211 | """生成方法签名 212 | 213 | 参数: 214 | method: 方法信息字典 215 | 216 | 返回: 217 | 标准化的方法签名 218 | """ 219 | try: 220 | modifiers = ' '.join(method.get('modifiers', [])) 221 | return_type = method.get('return_type', 'void') 222 | name = method.get('name', '') 223 | 224 | params = [] 225 | for param in method.get('parameters', []): 226 | params.append(f"{param['type']} {param['name']}") 227 | 228 | signature = f"{modifiers} {return_type} {name}({', '.join(params)})" 229 | return signature.strip() 230 | 231 | except Exception as e: 232 | logging.error(f"生成方法签名时出错: {e}") 233 | return "" 234 | 235 | def normalize_code(self, code: str) -> str: 236 | """规范化代码 237 | 238 | 参数: 239 | code: 源代码 240 | 241 | 返回: 242 | 规范化后的代码 243 | """ 244 | try: 245 | # 移除注释 246 | code = re.sub(r'//.*?\n|/\*.*?\*/', '', code, flags=re.DOTALL) 247 | 248 | # 移除空行 249 | code = '\n'.join( 250 | line for line in code.splitlines() 251 | if line.strip() 252 | ) 253 | 254 | # 规范化空白字符 255 | code = re.sub(r'\s+', ' ', code) 256 | 257 | # 规范化字符串字面量 258 | code = re.sub(r'"[^"]*"', '""', code) 259 | 260 | return code.strip() 261 | 262 | except Exception as e: 263 | logging.error(f"规范化代码时出错: {e}") 264 | return code -------------------------------------------------------------------------------- /prometheus.yml: -------------------------------------------------------------------------------- 1 | global: 2 | scrape_interval: 15s 3 | evaluation_interval: 15s 4 | 5 | alerting: 6 | alertmanagers: 7 | - static_configs: 8 | - targets: 9 | # - alertmanager:9093 10 | 11 | rule_files: 12 | # - "first_rules.yml" 13 | # - "second_rules.yml" 14 | 15 | scrape_configs: 16 | - job_name: 're-centris' 17 | static_configs: 18 | - targets: ['web:8000'] 19 | metrics_path: '/metrics' 20 | scrape_interval: 5s 21 | 22 | - job_name: 'prometheus' 23 | static_configs: 24 | - targets: 
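The complexity metrics above are simple keyword-counting heuristics: cyclomatic complexity starts at 1 and grows by one per branch keyword or short-circuit operator, and cognitive complexity is approximated as cyclomatic complexity plus the maximum brace-nesting depth. A minimal Go sketch of the same counting rule (illustrative helper, not part of the repository):

    package main

    import (
        "fmt"
        "strings"
    )

    // cyclomaticEstimate mirrors JavaProcessor.analyze_complexity: base
    // complexity 1, plus one per branch keyword or short-circuit operator.
    func cyclomaticEstimate(method string) int {
        complexity := 1
        for _, kw := range []string{"if ", "while ", "for ", "case ", "catch ", "&&", "||"} {
            complexity += strings.Count(method, kw)
        }
        return complexity
    }

    func main() {
        body := "if (a && b) { for (int i = 0; i < n; i++) { sum += i; } }"
        fmt.Println(cyclomaticEstimate(body)) // 1 + if + && + for = 4
    }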
/prometheus.yml: -------------------------------------------------------------------------------- 1 | global: 2 | scrape_interval: 15s 3 | evaluation_interval: 15s 4 | 5 | alerting: 6 | alertmanagers: 7 | - static_configs: 8 | - targets: 9 | # - alertmanager:9093 10 | 11 | rule_files: 12 | # - "first_rules.yml" 13 | # - "second_rules.yml" 14 | 15 | scrape_configs: 16 | - job_name: 're-centris' 17 | static_configs: 18 | - targets: ['web:8000'] 19 | metrics_path: '/metrics' 20 | scrape_interval: 5s 21 | 22 | - job_name: 'prometheus' 23 | static_configs: 24 | - targets: ['localhost:9090'] 25 | 26 | - job_name: 'redis' 27 | static_configs: 28 | - targets: ['redis:6379'] 29 | 30 | - job_name: 'postgres' 31 | static_configs: 32 | - targets: ['postgres:5432'] -------------------------------------------------------------------------------- /re-centris-go/cmd/re-centris/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "log" 5 | 6 | "github.com/re-centris/re-centris-go/internal/cmd" 7 | ) 8 | 9 | func main() { 10 | if err := cmd.Execute(); err != nil { 11 | log.Fatal(err) 12 | } 13 | } -------------------------------------------------------------------------------- /re-centris-go/config.yaml: -------------------------------------------------------------------------------- 1 | # Re-Centris Configuration 2 | 3 | # Path configurations 4 | paths: 5 | repo_path: "./repos" 6 | tag_date_path: "./data/repo_date" 7 | result_path: "./data/repo_functions" 8 | 9 | # Performance settings 10 | performance: 11 | max_workers: 0 # 0 means use number of CPU cores 12 | cache_size: 1000 13 | memory_limit: 0.8 # Maximum memory usage (80%) 14 | 15 | # Language settings 16 | languages: 17 | cpp: 18 | enabled: true 19 | extensions: 20 | - ".c" 21 | - ".cc" 22 | - ".cpp" 23 | - ".cxx" 24 | - ".h" 25 | - ".hpp" 26 | java: 27 | enabled: false 28 | extensions: 29 | - ".java" 30 | python: 31 | enabled: false 32 | extensions: 33 | - ".py" 34 | 35 | # Clone settings 36 | clone: 37 | output: "./repos" 38 | workers: 5 39 | 40 | # Analysis settings 41 | analyze: 42 | output: "./analysis" 43 | workers: 5 44 | 45 | # Detection settings 46 | detect: 47 | known_files: "./known-files" 48 | output: "detection-results.json" 49 | workers: 5 50 | threshold: 0.8 # Similarity threshold (0.0-1.0) -------------------------------------------------------------------------------- /re-centris-go/go.mod: -------------------------------------------------------------------------------- 1 | module github.com/re-centris/re-centris-go 2 | 3 | go 1.21 4 | 5 | require ( 6 | github.com/spf13/cobra v1.8.0 7 | github.com/spf13/viper v1.18.2 8 | go.uber.org/zap v1.27.0 9 | golang.org/x/sync v0.6.0 10 | gopkg.in/yaml.v3 v3.0.1 11 | ) -------------------------------------------------------------------------------- /re-centris-go/internal/analyzer/analyzer.go: -------------------------------------------------------------------------------- 1 | package analyzer 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "io" 7 | "os" 8 | "path/filepath" 9 | "strings" 10 | "sync" 11 | 12 | "github.com/re-centris/re-centris-go/internal/analyzer/tlsh" 13 | "github.com/re-centris/re-centris-go/internal/common/logger" 14 | "go.uber.org/zap" 15 | "golang.org/x/sync/errgroup" 16 | ) 17 | 18 | // FileInfo represents information about an analyzed file 19 | type FileInfo struct { 20 | Path string 21 | Language string 22 | Hash *tlsh.TLSH 23 | Size int64 24 | } 25 | 26 | // AnalyzerOptions contains options for the analyzer 27 | type AnalyzerOptions struct { 28 | MaxWorkers int 29 | Languages map[string][]string // map of language to file extensions 30 | } 31 | 32 | // Analyzer handles code analysis 33 | type Analyzer struct { 34 | opts AnalyzerOptions 35 | } 36 | 37 | // New creates a new Analyzer 38 | func New(opts AnalyzerOptions) *Analyzer { 39 | return &Analyzer{opts: opts} 40 | } 41 | 42 | // AnalyzeFile analyzes a single file and returns its FileInfo 43 | func (a *Analyzer) AnalyzeFile(ctx context.Context, path string) (*FileInfo, error) { 44 | // Get file extension 45 | ext := 
strings.ToLower(filepath.Ext(path)) 46 | 47 | // Find language for this extension 48 | var language string 49 | for lang, exts := range a.opts.Languages { 50 | for _, e := range exts { 51 | if e == ext { 52 | language = lang 53 | break 54 | } 55 | } 56 | if language != "" { 57 | break 58 | } 59 | } 60 | 61 | if language == "" { 62 | return nil, fmt.Errorf("unsupported file extension: %s", ext) 63 | } 64 | 65 | // Open and read file 66 | file, err := os.Open(path) 67 | if err != nil { 68 | return nil, fmt.Errorf("failed to open file: %v", err) 69 | } 70 | defer file.Close() 71 | 72 | // Get file size 73 | stat, err := file.Stat() 74 | if err != nil { 75 | return nil, fmt.Errorf("failed to get file stats: %v", err) 76 | } 77 | 78 | // Read file content 79 | content, err := io.ReadAll(file) 80 | if err != nil { 81 | return nil, fmt.Errorf("failed to read file: %v", err) 82 | } 83 | 84 | // Calculate TLSH hash 85 | hash, err := tlsh.New(content) 86 | if err != nil { 87 | return nil, fmt.Errorf("failed to calculate TLSH hash: %v", err) 88 | } 89 | 90 | return &FileInfo{ 91 | Path: path, 92 | Language: language, 93 | Hash: hash, 94 | Size: stat.Size(), 95 | }, nil 96 | } 97 | 98 | // AnalyzeDirectory analyzes all files in a directory and its subdirectories 99 | func (a *Analyzer) AnalyzeDirectory(ctx context.Context, dir string) ([]*FileInfo, error) { 100 | var ( 101 | files []*FileInfo 102 | filesMux sync.Mutex 103 | ) 104 | 105 | // Create error group with context and worker limit 106 | g, ctx := errgroup.WithContext(ctx) 107 | g.SetLimit(a.opts.MaxWorkers) 108 | 109 | // Walk through directory 110 | err := filepath.Walk(dir, func(path string, info os.FileInfo, err error) error { 111 | if err != nil { 112 | return err 113 | } 114 | 115 | // Skip directories 116 | if info.IsDir() { 117 | return nil 118 | } 119 | 120 | // Check if context is cancelled 121 | select { 122 | case <-ctx.Done(): 123 | return ctx.Err() 124 | default: 125 | } 126 | 127 | // Process file in goroutine 128 | g.Go(func() error { 129 | fileInfo, err := a.AnalyzeFile(ctx, path) 130 | if err != nil { 131 | if err == tlsh.ErrDataTooSmall { 132 | // Skip files that are too small 133 | return nil 134 | } 135 | logger.Error("Failed to analyze file", 136 | zap.String("path", path), 137 | zap.Error(err)) 138 | return err 139 | } 140 | 141 | // Add file info to results 142 | filesMux.Lock() 143 | files = append(files, fileInfo) 144 | filesMux.Unlock() 145 | 146 | return nil 147 | }) 148 | 149 | return nil 150 | }) 151 | 152 | if err != nil { 153 | return nil, fmt.Errorf("failed to walk directory: %v", err) 154 | } 155 | 156 | // Wait for all goroutines to complete 157 | if err := g.Wait(); err != nil { 158 | return nil, fmt.Errorf("error while analyzing files: %v", err) 159 | } 160 | 161 | return files, nil 162 | } 163 | 164 | // FindSimilarFiles finds files similar to the target file 165 | func (a *Analyzer) FindSimilarFiles(target *FileInfo, candidates []*FileInfo, threshold int) []*FileInfo { 166 | var similar []*FileInfo 167 | 168 | for _, candidate := range candidates { 169 | // Skip same file 170 | if target.Path == candidate.Path { 171 | continue 172 | } 173 | 174 | // Skip files with different languages 175 | if target.Language != candidate.Language { 176 | continue 177 | } 178 | 179 | // Calculate distance 180 | distance := target.Hash.Distance(candidate.Hash) 181 | if distance <= threshold { 182 | similar = append(similar, candidate) 183 | } 184 | } 185 | 186 | return similar 187 | } 
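analyzer.go wires these pieces into the basic pipeline: AnalyzeDirectory hashes every supported file under a directory, and FindSimilarFiles ranks candidates against a target by TLSH distance. A minimal sketch of driving it (illustrative only: the directory path, worker count, and distance cutoff of 30 are assumptions, not repository defaults):

    package main

    import (
        "context"
        "fmt"

        "github.com/re-centris/re-centris-go/internal/analyzer"
        "github.com/re-centris/re-centris-go/internal/common/logger"
        "go.uber.org/zap"
    )

    func main() {
        logger.Init(false) // the analyzer logs through the shared zap logger

        a := analyzer.New(analyzer.AnalyzerOptions{
            MaxWorkers: 4,
            Languages:  map[string][]string{"cpp": {".c", ".cc", ".cpp", ".cxx", ".h", ".hpp"}},
        })

        files, err := a.AnalyzeDirectory(context.Background(), "./repos")
        if err != nil {
            logger.Fatal("analysis failed", zap.Error(err))
        }

        if len(files) > 1 {
            // Use the first file as the target; 30 is an illustrative TLSH
            // distance cutoff (lower distance means more similar).
            for _, match := range a.FindSimilarFiles(files[0], files[1:], 30) {
                fmt.Println(match.Path)
            }
        }
    }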
-------------------------------------------------------------------------------- /re-centris-go/internal/analyzer/parser/cpp/parser.go: -------------------------------------------------------------------------------- 1 | package cpp 2 | 3 | import ( 4 | "bufio" 5 | "fmt" 6 | "io" 7 | "regexp" 8 | "strings" 9 | 10 | "github.com/re-centris/re-centris-go/internal/analyzer/parser" 11 | "github.com/re-centris/re-centris-go/internal/analyzer/tlsh" 12 | ) 13 | 14 | var ( 15 | // Function declaration pattern 16 | funcPattern = regexp.MustCompile(`^[\s]*(?:virtual\s+)?(?:static\s+)?(?:inline\s+)?(?:explicit\s+)?(?:[\w:]+[\s*&]+)?[\w:~]+[\s*&]*\s*[\w:]+\s*\([^)]*\)\s*(?:const\s*)?(?:noexcept\s*)?(?:override\s*)?(?:final\s*)?(?:=\s*0\s*)?(?:=\s*default\s*)?(?:=\s*delete\s*)?(?:\s*{\s*)?$`) 17 | 18 | // Class declaration pattern 19 | classPattern = regexp.MustCompile(`^[\s]*(?:class|struct)\s+\w+(?:\s*:\s*(?:public|protected|private)\s+\w+(?:\s*,\s*(?:public|protected|private)\s+\w+)*)?(?:\s*{\s*)?$`) 20 | ) 21 | 22 | // CPPParser implements the Parser interface for C/C++ 23 | type CPPParser struct{} 24 | 25 | // New creates a new C/C++ parser 26 | func New() *CPPParser { 27 | return &CPPParser{} 28 | } 29 | 30 | // GetLanguage returns the language name 31 | func (p *CPPParser) GetLanguage() string { 32 | return "cpp" 33 | } 34 | 35 | // GetExtensions returns supported file extensions 36 | func (p *CPPParser) GetExtensions() []string { 37 | return []string{".c", ".cc", ".cpp", ".cxx", ".h", ".hpp"} 38 | } 39 | 40 | // Parse parses C/C++ source code and extracts functions 41 | func (p *CPPParser) Parse(reader io.Reader) ([]parser.Function, error) { 42 | var ( 43 | functions []parser.Function 44 | scanner = bufio.NewScanner(reader) 45 | lineNum = 0 46 | inFunc = false 47 | inClass = false 48 | curFunc parser.Function 49 | content strings.Builder 50 | ) 51 | 52 | // Running count of open braces 53 | braceCount := 0 54 | 55 | for scanner.Scan() { 56 | lineNum++ 57 | line := scanner.Text() 58 | trimmedLine := strings.TrimSpace(line) 59 | 60 | // Skip empty lines and comments 61 | if trimmedLine == "" || strings.HasPrefix(trimmedLine, "//") { 62 | continue 63 | } 64 | 65 | // Handle multi-line comments 66 | if strings.HasPrefix(trimmedLine, "/*") { 67 | for scanner.Scan() { 68 | lineNum++ 69 | if strings.Contains(scanner.Text(), "*/") { 70 | break 71 | } 72 | } 73 | continue 74 | } 75 | 76 | // Track braces 77 | braceCount += strings.Count(line, "{") - strings.Count(line, "}") 78 | 79 | // Check for class/struct declarations 80 | if classPattern.MatchString(line) { 81 | inClass = true 82 | continue 83 | } 84 | 85 | // Check for function declarations 86 | if !inFunc && funcPattern.MatchString(line) { 87 | inFunc = true 88 | curFunc = parser.Function{ 89 | Name: extractFunctionName(line), 90 | StartLine: lineNum, 91 | Content: line + "\n", 92 | } 93 | continue 94 | } 95 | 96 | // Inside function 97 | if inFunc { 98 | content.WriteString(line) 99 | content.WriteString("\n") 100 | 101 | // Function ends when braces are balanced 102 | if braceCount == 0 { 103 | curFunc.EndLine = lineNum 104 | curFunc.Content += content.String() // append the body to the declaration line 105 | 106 | // Calculate hash 107 | hash, err := tlsh.New([]byte(curFunc.Content)) 108 | if err == nil { 109 | curFunc.Hash = hash.String() 110 | } 111 | 112 | functions = append(functions, curFunc) 113 | inFunc = false 114 | content.Reset() 115 | } 116 | } 117 | 118 | // Reset class state when closing brace is found 119 | if inClass && braceCount == 0 { 120 | inClass = false 121 | } 122 | } 123 | 124 | if err := scanner.Err(); err != nil { 125 | return nil, fmt.Errorf("error scanning C/C++ code: %v", err) 126 | } 127 | 128 | return functions, nil 129 | } 130 | 131 | // extractFunctionName extracts the function name from the declaration 132 | func extractFunctionName(line string) string { 133 | // Remove return type and parameters 134 | line = strings.TrimSpace(line) 135 | if idx := strings.Index(line, "("); idx > 0 { 136 | line = strings.TrimSpace(line[:idx]) 137 | } 138 | 139 | // Get the last word before parameters 140 | parts := strings.Fields(line) 141 | if len(parts) > 0 { 142 | return parts[len(parts)-1] 143 | } 144 | 145 | return "" 146 | } --------------------------------------------------------------------------------
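The parser operates on any io.Reader and returns one parser.Function per extracted function, carrying its name, line range, and a TLSH hash of its text. A short usage sketch (illustrative, not part of the repository):

    package main

    import (
        "fmt"
        "strings"

        "github.com/re-centris/re-centris-go/internal/analyzer/parser/cpp"
    )

    func main() {
        src := "int add(int a, int b) {\n    return a + b;\n}\n"
        functions, err := cpp.New().Parse(strings.NewReader(src))
        if err != nil {
            panic(err)
        }
        for _, fn := range functions {
            // Hash may be empty here: TLSH needs at least 50 bytes of input.
            fmt.Printf("%s lines %d-%d hash=%q\n", fn.Name, fn.StartLine, fn.EndLine, fn.Hash)
        }
    }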
/re-centris-go/internal/analyzer/parser/cpp/parser_test.go: -------------------------------------------------------------------------------- 1 | package cpp 2 | 3 | import ( 4 | "strings" 5 | "testing" 6 | ) 7 | 8 | func TestCPPParser_GetLanguage(t *testing.T) { 9 | parser := New() 10 | if lang := parser.GetLanguage(); lang != "cpp" { 11 | t.Errorf("GetLanguage() = %v, want cpp", lang) 12 | } 13 | } 14 | 15 | func TestCPPParser_GetExtensions(t *testing.T) { 16 | parser := New() 17 | exts := parser.GetExtensions() 18 | expected := []string{".c", ".cc", ".cpp", ".cxx", ".h", ".hpp"} 19 | 20 | if len(exts) != len(expected) { 21 | t.Errorf("GetExtensions() returned %d extensions, want %d", len(exts), len(expected)) 22 | } 23 | 24 | for i, ext := range expected { 25 | if exts[i] != ext { 26 | t.Errorf("GetExtensions()[%d] = %v, want %v", i, exts[i], ext) 27 | } 28 | } 29 | } 30 | 31 | func TestCPPParser_Parse(t *testing.T) { 32 | tests := []struct { 33 | name string 34 | code string 35 | wantFunctions int 36 | wantNames []string 37 | }{ 38 | { 39 | name: "simple function", 40 | code: ` 41 | int add(int a, int b) { 42 | return a + b; 43 | } 44 | `, 45 | wantFunctions: 1, 46 | wantNames: []string{"add"}, 47 | }, 48 | { 49 | name: "class method", 50 | code: ` 51 | class Calculator { 52 | public: 53 | int add(int a, int b) { 54 | return a + b; 55 | } 56 | virtual void process() = 0; 57 | }; 58 | `, 59 | wantFunctions: 2, 60 | wantNames: []string{"add", "process"}, 61 | }, 62 | { 63 | name: "multiple functions", 64 | code: ` 65 | void init() {} 66 | int calculate(double x) { 67 | return static_cast<int>(x); 68 | } 69 | namespace test { 70 | void helper() {} 71 | } 72 | `, 73 | wantFunctions: 3, 74 | wantNames: []string{"init", "calculate", "helper"}, 75 | }, 76 | { 77 | name: "complex function", 78 | code: ` 79 | template <typename T> 80 | static inline T* createObject(const std::string& name) noexcept { 81 | return new T(name); 82 | } 83 | `, 84 | wantFunctions: 1, 85 | wantNames: []string{"createObject"}, 86 | }, 87 | } 88 | 89 | parser := New() 90 | for _, tt := range tests { 91 | t.Run(tt.name, func(t *testing.T) { 92 | reader := strings.NewReader(tt.code) 93 | functions, err := parser.Parse(reader) 94 | 95 | if err != nil { 96 | t.Errorf("Parse() error = %v", err) 97 | return 98 | } 99 | 100 | if len(functions) != tt.wantFunctions { 101 | t.Errorf("Parse() got %v functions, want %v", len(functions), tt.wantFunctions) 102 | return 103 | } 104 | 105 | for i, wantName := range tt.wantNames { 106 | if i >= len(functions) { 107 | t.Errorf("Missing function %v", wantName) 108 | continue 109 | } 110 | if functions[i].Name != wantName { 111 | t.Errorf("Function[%d].Name = %v, want %v", i, functions[i].Name, wantName) 112 | } 113 | if functions[i].Hash == "" { 114 | t.Errorf("Function[%d].Hash is empty", i) 115 | } 116 | } 117 | }) 118 | } 119 | } 120 | 121 | func TestCPPParser_ParseEdgeCases(t *testing.T) { 122 | tests := []struct { 123 | name string 124 | code string 125 | wantErr bool 126 | }{ 127 | { 128 | name: "empty code", 129 | code: "", 130 | wantErr: false, 131 | }, 132 | { 133 | name: "only comments", 134 | code: ` 135 | // This is a comment 136 | /* This is a 137 | multi-line comment */ 138 | `, 139 | wantErr: false, 140 | }, 141 | { 142 | name: "incomplete function", 143 | code: ` 144 | int add(int a, int b) { 145 | return a + b; 146 | // missing closing brace 147 | `, 148 | wantErr: false, // parser should handle this gracefully 149 | }, 150 | { 151 | name: "nested functions", 152 | code: ` 153 | void outer() { 154 | void inner() { 155 | // nested function (invalid in C++) 156 | } 157 | } 158 | `, 159 | wantErr: false, 160 | }, 161 | } 162 | 163 | parser := New() 164 | for _, tt := range tests { 165 | t.Run(tt.name, func(t *testing.T) { 166 | reader := strings.NewReader(tt.code) 167 | _, err := parser.Parse(reader) 168 | if (err != nil) != tt.wantErr { 169 | t.Errorf("Parse() error = %v, wantErr %v", err, tt.wantErr) 170 | } 171 | }) 172 | } 173 | } 174 | 175 | func BenchmarkCPPParser_Parse(b *testing.B) { 176 | code := ` 177 | class Example { 178 | public: 179 | void method1() { } 180 | int method2(int x) { return x * 2; } 181 | virtual void method3() = 0; 182 | }; 183 | 184 | namespace test { 185 | void function1() { 186 | // some code 187 | } 188 | 189 | int function2(double x) { 190 | return static_cast<int>(x); 191 | } 192 | } 193 | ` 194 | 195 | parser := New() 196 | b.ResetTimer() 197 | 198 | for i := 0; i < b.N; i++ { 199 | reader := strings.NewReader(code) 200 | _, _ = parser.Parse(reader) 201 | } 202 | } -------------------------------------------------------------------------------- /re-centris-go/internal/analyzer/parser/parser.go: -------------------------------------------------------------------------------- 1 | package parser 2 | 3 | import ( 4 | "io" 5 | ) 6 | 7 | // Function represents a parsed function 8 | type Function struct { 9 | Name string 10 | StartLine int 11 | EndLine int 12 | Content string 13 | Hash string 14 | } 15 | 16 | // Parser defines the interface for language-specific parsers 17 | type Parser interface { 18 | // Parse parses the source code and returns extracted functions 19 | Parse(reader io.Reader) ([]Function, error) 20 | 21 | // GetLanguage returns the language name 22 | GetLanguage() string 23 | 24 | // GetExtensions returns supported file extensions 25 | GetExtensions() []string 26 | } 27 | 28 | // Registry maintains a map of language parsers 29 | type Registry struct { 30 | parsers map[string]Parser 31 | } 32 | 33 | // NewRegistry creates a new parser registry 34 | func NewRegistry() *Registry { 35 | return &Registry{ 36 | parsers: make(map[string]Parser), 37 | } 38 | } 39 | 40 | // Register registers a parser for a language 41 | func (r *Registry) Register(parser Parser) { 42 | r.parsers[parser.GetLanguage()] = parser 43 | } 44 | 45 | // Get returns a parser for the given language 46 | func (r *Registry) Get(language string) (Parser, bool) { 47 | parser, ok := r.parsers[language] 48 | return parser, ok 49 | } 50 | 51 | // GetByExtension returns a parser for the given file extension 52 | func (r *Registry) GetByExtension(ext string) (Parser, bool) { 53 | for _, parser := range r.parsers { 54 | for _, e := range parser.GetExtensions() { 55 | if e == ext { 56 | return parser, true 57 | } 58 | } 59 | }
60 | return nil, false 61 | } -------------------------------------------------------------------------------- /re-centris-go/internal/analyzer/tlsh/errors.go: -------------------------------------------------------------------------------- 1 | package tlsh 2 | 3 | import "errors" 4 | 5 | var ( 6 | // ErrDataTooSmall is returned when input data is too small for TLSH calculation 7 | ErrDataTooSmall = errors.New("input data must be at least 50 bytes") 8 | 9 | // ErrInvalidHash is returned when trying to parse an invalid TLSH hash string 10 | ErrInvalidHash = errors.New("invalid TLSH hash format") 11 | 12 | // ErrNilHash is returned when trying to operate on a nil TLSH hash 13 | ErrNilHash = errors.New("nil TLSH hash") 14 | ) -------------------------------------------------------------------------------- /re-centris-go/internal/analyzer/tlsh/tlsh.go: -------------------------------------------------------------------------------- 1 | package tlsh 2 | 3 | import ( 4 | "crypto/sha256" 5 | "encoding/hex" 6 | "math" 7 | "sort" 8 | ) 9 | 10 | const ( 11 | bucketCount = 256 12 | windowSize = 5 13 | minDataLength = 50 14 | ) 15 | 16 | // TLSH represents a Trend Micro Locality Sensitive Hash 17 | type TLSH struct { 18 | Checksum byte 19 | LValue byte 20 | Q1Ratio byte 21 | Q2Ratio byte 22 | QRatios [2]byte 23 | Buckets [bucketCount]byte 24 | DataLength int 25 | } 26 | 27 | // New creates a new TLSH hash from a byte slice 28 | func New(data []byte) (*TLSH, error) { 29 | if len(data) < minDataLength { 30 | return nil, ErrDataTooSmall 31 | } 32 | 33 | tlsh := &TLSH{ 34 | DataLength: len(data), 35 | } 36 | 37 | // Calculate sliding window 38 | buckets := make([]int, bucketCount) 39 | for i := 0; i < len(data)-windowSize; i++ { 40 | window := data[i : i+windowSize] 41 | triplet := (int(window[0]) << 16) | (int(window[2]) << 8) | int(window[4]) 42 | bucket := triplet % bucketCount 43 | buckets[bucket]++ 44 | } 45 | 46 | // Calculate quartiles 47 | sortedBuckets := make([]int, len(buckets)) 48 | copy(sortedBuckets, buckets) 49 | sort.Ints(sortedBuckets) 50 | 51 | q1Pos := len(sortedBuckets) / 4 52 | q2Pos := len(sortedBuckets) / 2 53 | q3Pos := (3 * len(sortedBuckets)) / 4 54 | 55 | q1 := sortedBuckets[q1Pos] 56 | q2 := sortedBuckets[q2Pos] 57 | q3 := sortedBuckets[q3Pos] 58 | 59 | // Calculate ratios; guard against q3 == 0 (highly repetitive input), which would divide by zero 60 | if q3 > 0 { 61 | tlsh.Q1Ratio = byte((float64(q1) / float64(q3)) * 16) 62 | tlsh.Q2Ratio = byte((float64(q2) / float64(q3)) * 16) 63 | } 64 | 65 | // Calculate final bucket values 66 | for i := 0; i < bucketCount; i++ { 67 | if buckets[i] <= q1 { 68 | tlsh.Buckets[i] = 0 69 | } else if buckets[i] <= q2 { 70 | tlsh.Buckets[i] = 1 71 | } else if buckets[i] <= q3 { 72 | tlsh.Buckets[i] = 2 73 | } else { 74 | tlsh.Buckets[i] = 3 75 | } 76 | } 77 | 78 | // Calculate checksum 79 | h := sha256.New() 80 | h.Write(data) 81 | tlsh.Checksum = h.Sum(nil)[0] 82 | 83 | // Calculate L-Value (log base 2 of the file size) 84 | tlsh.LValue = byte(math.Log2(float64(len(data)))) 85 | 86 | return tlsh, nil 87 | } 88 | 89 | // Distance calculates the distance between two TLSH hashes 90 | func (t *TLSH) Distance(other *TLSH) int { 91 | if t == nil || other == nil { 92 | return -1 93 | } 94 | 95 | // Calculate L-Value difference (convert to float64 before subtracting: unsigned byte arithmetic would wrap around) 96 | lDiff := math.Abs(float64(t.LValue) - float64(other.LValue)) 97 | 98 | // Calculate bucket difference 99 | bucketDiff := 0 100 | for i := 0; i < bucketCount; i++ { 101 | bucketDiff += int(math.Abs(float64(t.Buckets[i]) - float64(other.Buckets[i]))) 102 | } 103 | 104 | // Calculate quartile ratio difference 105 | q1Diff := math.Abs(float64(t.Q1Ratio) - float64(other.Q1Ratio)) 106 | q2Diff := math.Abs(float64(t.Q2Ratio) - float64(other.Q2Ratio)) 107 | 108 | // Weighted sum of differences 109 | return int(lDiff*12 + float64(bucketDiff) + (q1Diff+q2Diff)*12) 110 | } 111 | 112 | // String returns the hex representation of the TLSH hash 113 | func (t *TLSH) String() string { 114 | if t == nil { 115 | return "" 116 | } 117 | 118 | result := make([]byte, bucketCount/2+4) 119 | result[0] = t.Checksum 120 | result[1] = t.LValue 121 | result[2] = t.Q1Ratio 122 | result[3] = t.Q2Ratio 123 | 124 | // Pack buckets (2 buckets per byte) 125 | for i := 0; i < bucketCount/2; i++ { 126 | result[i+4] = (t.Buckets[i*2] << 4) | t.Buckets[i*2+1] 127 | } 128 | 129 | return hex.EncodeToString(result) 130 | } --------------------------------------------------------------------------------
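Because the hash is locality-sensitive, nearly identical inputs should land at small distances while unrelated inputs drift apart, and inputs shorter than 50 bytes are rejected with ErrDataTooSmall. A short sketch of the comparison workflow (illustrative, not part of the repository):

    package main

    import (
        "fmt"

        "github.com/re-centris/re-centris-go/internal/analyzer/tlsh"
    )

    func main() {
        a, err := tlsh.New([]byte("This is a test string that is long enough to generate a TLSH hash"))
        if err != nil {
            panic(err) // tlsh.ErrDataTooSmall for inputs under 50 bytes
        }
        b, _ := tlsh.New([]byte("This is a test string that is long enough to generate a TLSH hash!"))

        fmt.Println(a.String())    // hex-encoded digest
        fmt.Println(a.Distance(b)) // typically small for near-identical inputs
        fmt.Println(a.Distance(a)) // 0
    }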
/re-centris-go/internal/analyzer/tlsh/tlsh_test.go: -------------------------------------------------------------------------------- 1 | package tlsh 2 | 3 | import ( 4 | "testing" 5 | ) 6 | 7 | func TestTLSH(t *testing.T) { 8 | tests := []struct { 9 | name string 10 | data []byte 11 | wantErr bool 12 | distance int // distance with itself should be 0 13 | }{ 14 | { 15 | name: "normal text", 16 | data: []byte("This is a test string that is long enough to generate a TLSH hash"), 17 | wantErr: false, 18 | distance: 0, 19 | }, 20 | { 21 | name: "too short", 22 | data: []byte("too short"), 23 | wantErr: true, 24 | distance: -1, 25 | }, 26 | { 27 | name: "repeated content", 28 | data: []byte("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"), 29 | wantErr: false, 30 | distance: 0, 31 | }, 32 | } 33 | 34 | for _, tt := range tests { 35 | t.Run(tt.name, func(t *testing.T) { 36 | hash1, err1 := New(tt.data) 37 | if (err1 != nil) != tt.wantErr { 38 | t.Errorf("New() error = %v, wantErr %v", err1, tt.wantErr) 39 | return 40 | } 41 | if tt.wantErr { 42 | return 43 | } 44 | 45 | // Test distance with itself 46 | if dist := hash1.Distance(hash1); dist != tt.distance { 47 | t.Errorf("Distance with itself = %v, want %v", dist, tt.distance) 48 | } 49 | 50 | // Test string representation 51 | if str := hash1.String(); str == "" { 52 | t.Error("String() returned empty string") 53 | } 54 | 55 | // Test with modified data 56 | modifiedData := make([]byte, len(tt.data)) 57 | copy(modifiedData, tt.data) 58 | modifiedData[len(modifiedData)-1]++ // modify last byte 59 | hash2, _ := New(modifiedData) 60 | 61 | // Distance should be non-zero for different data 62 | if dist := hash1.Distance(hash2); dist == 0 { 63 | t.Error("Distance should be non-zero for different data") 64 | } 65 | }) 66 | } 67 | } 68 | 69 | func TestTLSHEdgeCases(t *testing.T) { 70 | tests := []struct { 71 | name string 72 | data []byte 73 | wantErr bool 74 | }{ 75 | { 76 | name: "nil data", 77 | data: nil, 78 | wantErr: true, 79 | }, 80 | { 81 | name: "empty data", 82 | data: []byte{}, 83 | wantErr: true, 84 | }, 85 | { 86 | name: "exactly minimum length", 87 | data: make([]byte, minDataLength), 88 | wantErr: false, 89 | }, 90 | { 91 | name: "one byte less than minimum", 92 | data: make([]byte, minDataLength-1), 93 | wantErr: true, 94 | }, 95 | } 96 | 97 | for _, tt := range tests { 98 | t.Run(tt.name, func(t *testing.T) { 99 | _, err := New(tt.data) 100 | if (err != nil) != tt.wantErr { 101 | t.Errorf("New() error = %v, wantErr %v", err, tt.wantErr) 102 | } 103 | }) 104 | } 105 | } 106 | 107 | func BenchmarkTLSH(b *testing.B) { 108 | data := []byte(`This is a test string that is long enough to generate a TLSH hash. 109 | We need to make it even longer to ensure we have enough data for meaningful benchmarks. 110 | Adding more text to make it more realistic and provide better performance measurements.`) 111 | 112 | b.ResetTimer() 113 | for i := 0; i < b.N; i++ { 114 | _, _ = New(data) 115 | } 116 | } -------------------------------------------------------------------------------- /re-centris-go/internal/cmd/analyze.go: -------------------------------------------------------------------------------- 1 | package cmd 2 | 3 | import ( 4 | "context" 5 | 6 | "github.com/re-centris/re-centris-go/internal/analyzer" 7 | "github.com/re-centris/re-centris-go/internal/common/logger" 8 | "github.com/spf13/cobra" 9 | "github.com/spf13/viper" 10 | "go.uber.org/zap" 11 | ) 12 | 13 | var analyzeCmd = &cobra.Command{ 14 | Use: "analyze [directory]", 15 | Short: "Analyze source code files", 16 | Long: `Analyze source code files in a directory to calculate TLSH hashes 17 | and extract function information.`, 18 | Args: cobra.ExactArgs(1), 19 | RunE: runAnalyze, 20 | } 21 | 22 | func init() { 23 | rootCmd.AddCommand(analyzeCmd) 24 | 25 | analyzeCmd.Flags().StringP("output", "o", "./analysis", "Output directory for analysis results") 26 | analyzeCmd.Flags().IntP("workers", "w", 5, "Number of parallel workers") 27 | 28 | viper.BindPFlag("analyze.output", analyzeCmd.Flags().Lookup("output")) 29 | viper.BindPFlag("analyze.workers", analyzeCmd.Flags().Lookup("workers")) 30 | } 31 | 32 | func runAnalyze(cmd *cobra.Command, args []string) error { 33 | // Get target directory 34 | targetDir := args[0] 35 | 36 | // Create analyzer options 37 | opts := analyzer.AnalyzerOptions{ 38 | MaxWorkers: viper.GetInt("analyze.workers"), 39 | Languages: map[string][]string{ 40 | "cpp": {".c", ".cc", ".cpp", ".cxx", ".h", ".hpp"}, 41 | "java": {".java"}, 42 | "python": {".py"}, 43 | }, 44 | } 45 | 46 | // Create analyzer 47 | a := analyzer.New(opts) 48 | 49 | // Analyze directory 50 | logger.Info("Starting code analysis", 51 | zap.String("directory", targetDir)) 52 | 53 | files, err := a.AnalyzeDirectory(context.Background(), targetDir) 54 | if err != nil { 55 | return err 56 | } 57 | 58 | logger.Info("Code analysis completed", 59 | zap.Int("total_files", len(files))) 60 | 61 | return nil 62 | } -------------------------------------------------------------------------------- /re-centris-go/internal/cmd/clone.go: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /re-centris-go/internal/cmd/detect.go: -------------------------------------------------------------------------------- 1 | package cmd 2 | 3 | import ( 4 | "context" 5 | "github.com/re-centris/re-centris-go/internal/detector" 6 | "github.com/re-centris/re-centris-go/internal/common/logger" 7 | "github.com/spf13/cobra" 8 | "github.com/spf13/viper" 9 | "go.uber.org/zap" 10 | ) 11 | 12 | var detectCmd = &cobra.Command{ 13 | Use: "detect [target-files...]", 14 | Short: "Detect code similarities", 15 | Long: `Detect code similarities between target files and known files 16 | using TLSH hash comparison.`, 17 | Args: cobra.MinimumNArgs(1), 18 | RunE: runDetect, 19 | } 20 | 21 | func init() { 22 | rootCmd.AddCommand(detectCmd) 23 | 24 | detectCmd.Flags().StringP("known-files", "k", "./known-files", "Directory containing known files") 25 | detectCmd.Flags().StringP("output", "o", "detection-results.json", "Output file for detection results") 26 | detectCmd.Flags().IntP("workers", "w", 5, 
"Number of parallel workers") 27 | detectCmd.Flags().Float64P("threshold", "t", 0.8, "Similarity threshold (0.0-1.0)") 28 | 29 | viper.BindPFlag("detect.known_files", detectCmd.Flags().Lookup("known-files")) 30 | viper.BindPFlag("detect.output", detectCmd.Flags().Lookup("output")) 31 | viper.BindPFlag("detect.workers", detectCmd.Flags().Lookup("workers")) 32 | viper.BindPFlag("detect.threshold", detectCmd.Flags().Lookup("threshold")) 33 | } 34 | 35 | func runDetect(cmd *cobra.Command, args []string) error { 36 | // Create detector options 37 | opts := detector.DetectorOptions{ 38 | MaxWorkers: viper.GetInt("detect.workers"), 39 | SimilarityThreshold: viper.GetFloat64("detect.threshold"), 40 | KnownFilesDir: viper.GetString("detect.known_files"), 41 | Languages: map[string][]string{ 42 | "cpp": {".c", ".cc", ".cpp", ".cxx", ".h", ".hpp"}, 43 | "java": {".java"}, 44 | "python": {".py"}, 45 | }, 46 | } 47 | 48 | // Create detector 49 | d := detector.New(opts) 50 | 51 | // Detect similarities 52 | logger.Info("Starting similarity detection", 53 | zap.Int("target_files", len(args)), 54 | zap.String("known_files_dir", opts.KnownFilesDir)) 55 | 56 | results, err := d.DetectSimilarity(context.Background(), args) 57 | if err != nil { 58 | return err 59 | } 60 | 61 | // Save results 62 | outputFile := viper.GetString("detect.output") 63 | if err := d.SaveResults(results, outputFile); err != nil { 64 | return err 65 | } 66 | 67 | logger.Info("Similarity detection completed", 68 | zap.String("output_file", outputFile)) 69 | 70 | return nil 71 | } -------------------------------------------------------------------------------- /re-centris-go/internal/cmd/root.go: -------------------------------------------------------------------------------- 1 | package cmd 2 | 3 | import ( 4 | "fmt" 5 | "os" 6 | 7 | "github.com/spf13/cobra" 8 | "github.com/spf13/viper" 9 | ) 10 | 11 | var ( 12 | cfgFile string 13 | rootCmd = &cobra.Command{ 14 | Use: "re-centris", 15 | Short: "Re-Centris is a code analysis and dependency detection tool", 16 | Long: `Re-Centris is a tool based on TLSH (Trend Micro Locality Sensitive Hash) 17 | for analyzing source code and detecting dependencies. It can identify open source 18 | components used in codebases, detect code clones, and analyze dependencies.`, 19 | } 20 | ) 21 | 22 | // Execute adds all child commands to the root command and sets flags appropriately. 
23 | func Execute() error { 24 | return rootCmd.Execute() 25 | } 26 | 27 | func init() { 28 | cobra.OnInitialize(initConfig) 29 | 30 | rootCmd.PersistentFlags().StringVar(&cfgFile, "config", "", "config file (default is $HOME/.re-centris.yaml)") 31 | } 32 | 33 | func initConfig() { 34 | if cfgFile != "" { 35 | viper.SetConfigFile(cfgFile) 36 | } else { 37 | home, err := os.UserHomeDir() 38 | cobra.CheckErr(err) 39 | 40 | viper.AddConfigPath(home) 41 | viper.SetConfigType("yaml") 42 | viper.SetConfigName(".re-centris") 43 | } 44 | 45 | viper.AutomaticEnv() 46 | 47 | if err := viper.ReadInConfig(); err == nil { 48 | fmt.Fprintln(os.Stderr, "Using config file:", viper.ConfigFileUsed()) 49 | } 50 | } -------------------------------------------------------------------------------- /re-centris-go/internal/collector/clone/clone.go: -------------------------------------------------------------------------------- 1 | package clone 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "os" 7 | "os/exec" 8 | "path/filepath" 9 | "strings" 10 | 11 | "github.com/re-centris/re-centris-go/internal/common/logger" 12 | "go.uber.org/zap" 13 | "golang.org/x/sync/errgroup" 14 | ) 15 | 16 | // RepoInfo contains information about a repository 17 | type RepoInfo struct { 18 | Author string 19 | Name string 20 | URL string 21 | } 22 | 23 | // CloneOptions contains options for cloning repositories 24 | type CloneOptions struct { 25 | TargetDir string 26 | MaxWorkers int 27 | } 28 | 29 | // ParseRepoURL parses a GitHub repository URL and returns RepoInfo 30 | func ParseRepoURL(url string) (*RepoInfo, error) { 31 | parts := strings.Split(url, "/") 32 | if len(parts) < 2 { 33 | return nil, fmt.Errorf("invalid repository URL: %s", url) 34 | } 35 | 36 | name := parts[len(parts)-1] 37 | author := parts[len(parts)-2] 38 | 39 | // Remove .git suffix if present 40 | name = strings.TrimSuffix(name, ".git") 41 | 42 | return &RepoInfo{ 43 | Author: author, 44 | Name: name, 45 | URL: url, 46 | }, nil 47 | } 48 | 49 | // CloneRepository clones a single repository 50 | func CloneRepository(ctx context.Context, info *RepoInfo, targetDir string) error { 51 | folderName := fmt.Sprintf("%s%%%s", info.Author, info.Name) 52 | targetPath := filepath.Join(targetDir, folderName) 53 | 54 | // Check if repository already exists 55 | if _, err := os.Stat(targetPath); !os.IsNotExist(err) { 56 | logger.Info("Repository already exists, skipping", 57 | zap.String("repo", folderName)) 58 | return nil 59 | } 60 | 61 | // Prepare git clone command 62 | cmd := exec.CommandContext(ctx, "git", "clone", 63 | "--depth", "1", 64 | "--single-branch", 65 | "--no-tags", 66 | info.URL, 67 | targetPath, 68 | ) 69 | 70 | // Execute command 71 | if output, err := cmd.CombinedOutput(); err != nil { 72 | return fmt.Errorf("failed to clone repository %s: %v\nOutput: %s", 73 | info.URL, err, string(output)) 74 | } 75 | 76 | logger.Info("Successfully cloned repository", 77 | zap.String("repo", folderName)) 78 | return nil 79 | } 80 | 81 | // CloneRepositories clones multiple repositories in parallel 82 | func CloneRepositories(ctx context.Context, urls []string, opts CloneOptions) error { 83 | // Create target directory if it doesn't exist 84 | if err := os.MkdirAll(opts.TargetDir, 0755); err != nil { 85 | return fmt.Errorf("failed to create target directory: %v", err) 86 | } 87 | 88 | // Create error group with context 89 | g, ctx := errgroup.WithContext(ctx) 90 | g.SetLimit(opts.MaxWorkers) 91 | 92 | // Process each repository URL 93 | for _, url := range urls { 94 | url := url // Create new variable for goroutine 95 | g.Go(func() error { 96 | info, err := ParseRepoURL(url) 97 | if err != nil { 98 | logger.Error("Failed to parse repository URL", 99 | zap.String("url", url), 100 | zap.Error(err)) 101 | return err 102 | } 103 | 104 | return CloneRepository(ctx, info, opts.TargetDir) 105 | }) 106 | } 107 | 108 | // Wait for all goroutines to complete 109 | if err := g.Wait(); err != nil { 110 | return fmt.Errorf("error while cloning repositories: %v", err) 111 | } 112 | 113 | return nil 114 | } --------------------------------------------------------------------------------
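ParseRepoURL only inspects the last two path segments, so the URLs in osscollector/sample parse directly, and each repository is cloned into a folder named author%name under the target directory. A sketch of driving the cloner (illustrative values, not part of the repository):

    package main

    import (
        "context"
        "fmt"

        "github.com/re-centris/re-centris-go/internal/collector/clone"
        "github.com/re-centris/re-centris-go/internal/common/logger"
        "go.uber.org/zap"
    )

    func main() {
        logger.Init(false) // the cloner logs through the shared zap logger

        info, _ := clone.ParseRepoURL("https://github.com/redis/redis.git")
        fmt.Printf("%s%%%s\n", info.Author, info.Name) // redis%redis

        err := clone.CloneRepositories(context.Background(),
            []string{"https://github.com/redis/redis.git"},
            clone.CloneOptions{TargetDir: "./repos", MaxWorkers: 5})
        if err != nil {
            logger.Fatal("clone failed", zap.Error(err))
        }
    }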
/re-centris-go/internal/common/cache/cache.go: -------------------------------------------------------------------------------- 1 | package cache 2 | 3 | import ( 4 | "container/list" 5 | "sync" 6 | ) 7 | 8 | // Cache is a thread-safe LRU cache 9 | type Cache struct { 10 | capacity int 11 | items map[string]*list.Element 12 | queue *list.List 13 | mutex sync.RWMutex 14 | } 15 | 16 | // item represents a cache item 17 | type item struct { 18 | key string 19 | value interface{} 20 | } 21 | 22 | // New creates a new cache with the given capacity 23 | func New(capacity int) *Cache { 24 | return &Cache{ 25 | capacity: capacity, 26 | items: make(map[string]*list.Element), 27 | queue: list.New(), 28 | } 29 | } 30 | 31 | // Get retrieves a value from the cache 32 | func (c *Cache) Get(key string) (interface{}, bool) { 33 | // Take the write lock for the whole lookup: MoveToFront mutates the 34 | // queue, and re-locking after an RUnlock would let a concurrent 35 | // Delete or eviction slip in between the lookup and the move. 36 | c.mutex.Lock() 37 | defer c.mutex.Unlock() 38 | if element, exists := c.items[key]; exists { 39 | c.queue.MoveToFront(element) 40 | return element.Value.(*item).value, true 41 | } 42 | return nil, false 43 | } 44 | 45 | // Set adds or updates a value in the cache 46 | func (c *Cache) Set(key string, value interface{}) { 47 | c.mutex.Lock() 48 | defer c.mutex.Unlock() 49 | 50 | // If key exists, update its value and move to front 51 | if element, exists := c.items[key]; exists { 52 | c.queue.MoveToFront(element) 53 | element.Value.(*item).value = value 54 | return 55 | } 56 | 57 | // Add new item 58 | element := c.queue.PushFront(&item{key: key, value: value}) 59 | c.items[key] = element 60 | 61 | // Remove oldest item if cache is full 62 | if c.queue.Len() > c.capacity { 63 | oldest := c.queue.Back() 64 | if oldest != nil { 65 | c.queue.Remove(oldest) 66 | delete(c.items, oldest.Value.(*item).key) 67 | } 68 | } 69 | } 70 | 71 | // Delete removes a value from the cache 72 | func (c *Cache) Delete(key string) { 73 | c.mutex.Lock() 74 | defer c.mutex.Unlock() 75 | 76 | if element, exists := c.items[key]; exists { 77 | c.queue.Remove(element) 78 | delete(c.items, key) 79 | } 80 | } 81 | 82 | // Clear removes all items from the cache 83 | func (c *Cache) Clear() { 84 | c.mutex.Lock() 85 | defer c.mutex.Unlock() 86 | 87 | c.items = make(map[string]*list.Element) 88 | c.queue = list.New() 89 | } 90 | 91 | // Len returns the number of items in the cache 92 | func (c *Cache) Len() int { 93 | c.mutex.RLock() 94 | defer c.mutex.RUnlock() 95 | return len(c.items) 96 | } 97 | 98 | // Keys returns all keys in the cache 99 | func (c *Cache) Keys() []string { 100 | c.mutex.RLock() 101 | defer c.mutex.RUnlock() 102 | 103 | keys := make([]string, 0, len(c.items)) 104 | for key := range c.items { 105 | keys = append(keys, key) 106 | } 107 | return keys 108 | } --------------------------------------------------------------------------------
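Eviction happens at the back of the queue once capacity is exceeded, and every hit is moved to the front, so the least recently used entry is always the victim. A small demonstration (illustrative, not part of the repository):

    package main

    import (
        "fmt"

        "github.com/re-centris/re-centris-go/internal/common/cache"
    )

    func main() {
        c := cache.New(2)
        c.Set("a", 1)
        c.Set("b", 2)
        c.Get("a")    // "a" becomes most recently used
        c.Set("c", 3) // over capacity: evicts "b", the least recently used key

        _, okA := c.Get("a")
        _, okB := c.Get("b")
        fmt.Println(okA, okB, c.Len()) // true false 2
    }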
/re-centris-go/internal/common/logger/logger.go: -------------------------------------------------------------------------------- 1 | package logger 2 | 3 | import ( 4 | "os" 5 | 6 | "go.uber.org/zap" 7 | "go.uber.org/zap/zapcore" 8 | ) 9 | 10 | var log *zap.Logger 11 | 12 | // Init initializes the logger 13 | func Init(debug bool) { 14 | config := zap.NewProductionConfig() 15 | if debug { 16 | config.Level = zap.NewAtomicLevelAt(zap.DebugLevel) 17 | } 18 | 19 | config.OutputPaths = []string{"stdout", "re-centris.log"} 20 | config.EncoderConfig.EncodeTime = zapcore.ISO8601TimeEncoder 21 | 22 | var err error 23 | log, err = config.Build() 24 | if err != nil { 25 | os.Exit(1) 26 | } 27 | } 28 | 29 | // Debug logs a debug message 30 | func Debug(msg string, fields ...zap.Field) { 31 | log.Debug(msg, fields...) 32 | } 33 | 34 | // Info logs an info message 35 | func Info(msg string, fields ...zap.Field) { 36 | log.Info(msg, fields...) 37 | } 38 | 39 | // Warn logs a warning message 40 | func Warn(msg string, fields ...zap.Field) { 41 | log.Warn(msg, fields...) 42 | } 43 | 44 | // Error logs an error message 45 | func Error(msg string, fields ...zap.Field) { 46 | log.Error(msg, fields...) 47 | } 48 | 49 | // Fatal logs a fatal message and exits 50 | func Fatal(msg string, fields ...zap.Field) { 51 | log.Fatal(msg, fields...) 52 | } 53 | 54 | // Sync flushes any buffered log entries 55 | func Sync() error { 56 | return log.Sync() 57 | } -------------------------------------------------------------------------------- /re-centris-go/internal/common/monitor/monitor.go: -------------------------------------------------------------------------------- 1 | package monitor 2 | 3 | import ( 4 | "runtime" 5 | "sync" 6 | "time" 7 | 8 | "github.com/re-centris/re-centris-go/internal/common/logger" 9 | "go.uber.org/zap" 10 | ) 11 | 12 | // Stats represents performance statistics 13 | type Stats struct { 14 | Goroutines int 15 | Memory uint64 16 | CPU float64 17 | StartTime time.Time 18 | Operations uint64 19 | } 20 | 21 | // Monitor handles performance monitoring 22 | type Monitor struct { 23 | stats *Stats 24 | mu sync.RWMutex // guards stats; kept out of Stats so GetStats can return a plain copy 25 | interval time.Duration 26 | done chan struct{} 27 | } 28 | 29 | // New creates a new performance monitor 30 | func New(interval time.Duration) *Monitor { 31 | return &Monitor{ 32 | stats: &Stats{ 33 | StartTime: time.Now(), 34 | }, 35 | interval: interval, 36 | done: make(chan struct{}), 37 | } 38 | } 39 | 40 | // Start starts the monitoring 41 | func (m *Monitor) Start() { 42 | go m.monitor() 43 | } 44 | 45 | // Stop stops the monitoring 46 | func (m *Monitor) Stop() { 47 | close(m.done) 48 | } 49 | 50 | // GetStats returns current statistics 51 | func (m *Monitor) GetStats() Stats { 52 | m.mu.RLock() 53 | defer m.mu.RUnlock() 54 | return *m.stats 55 | } 56 | 57 | // IncrementOperations increments the operation counter 58 | func (m *Monitor) IncrementOperations() { 59 | m.mu.Lock() 60 | m.stats.Operations++ 61 | m.mu.Unlock() 62 | } 63 | 64 | // monitor periodically collects performance metrics 65 | func (m *Monitor) monitor() { 66 | ticker := time.NewTicker(m.interval) 67 | defer ticker.Stop() 68 | 69 | for { 70 | select { 71 | case <-ticker.C: 72 | m.collectMetrics() 73 | case <-m.done: 74 | return 75 | } 76 | } 77 | } 78 | 79 | // collectMetrics collects current performance metrics 80 | func (m *Monitor) collectMetrics() { 81 | m.mu.Lock() 82 | defer m.mu.Unlock() 83 | 84 | // Get number of goroutines 85 | m.stats.Goroutines = runtime.NumGoroutine() 86 | 87 | // Get 
memory statistics 88 | var memStats runtime.MemStats 89 | runtime.ReadMemStats(&memStats) 90 | m.stats.Memory = memStats.Alloc 91 | 92 | // Log current metrics 93 | logger.Info("Performance metrics", 94 | zap.Int("goroutines", m.stats.Goroutines), 95 | zap.Uint64("memory_bytes", m.stats.Memory), 96 | zap.Uint64("operations", m.stats.Operations), 97 | zap.Duration("uptime", time.Since(m.stats.StartTime)), 98 | ) 99 | } 100 | 101 | // CheckMemoryLimit checks if memory usage is within limit 102 | func (m *Monitor) CheckMemoryLimit(limit float64) bool { 103 | var memStats runtime.MemStats 104 | runtime.ReadMemStats(&memStats) 105 | 106 | totalMemory := float64(memStats.Sys) 107 | usedMemory := float64(memStats.Alloc) 108 | memoryUsage := usedMemory / totalMemory 109 | 110 | if memoryUsage > limit { 111 | logger.Warn("Memory usage exceeds limit", 112 | zap.Float64("usage", memoryUsage), 113 | zap.Float64("limit", limit)) 114 | return false 115 | } 116 | 117 | return true 118 | } -------------------------------------------------------------------------------- /re-centris-go/internal/config/config.go: -------------------------------------------------------------------------------- 1 | package config 2 | 3 | // Config represents the main configuration structure 4 | type Config struct { 5 | Paths PathConfig `yaml:"paths"` 6 | Performance PerformanceConfig `yaml:"performance"` 7 | Languages LanguagesConfig `yaml:"languages"` 8 | } 9 | 10 | // PathConfig contains all path-related configurations 11 | type PathConfig struct { 12 | RepoPath string `yaml:"repo_path"` 13 | TagDatePath string `yaml:"tag_date_path"` 14 | ResultPath string `yaml:"result_path"` 15 | } 16 | 17 | // PerformanceConfig contains performance-related settings 18 | type PerformanceConfig struct { 19 | MaxWorkers int `yaml:"max_workers"` 20 | CacheSize int `yaml:"cache_size"` 21 | MemoryLimit float64 `yaml:"memory_limit"` 22 | } 23 | 24 | // LanguagesConfig contains settings for supported languages 25 | type LanguagesConfig struct { 26 | CPP LanguageSettings `yaml:"cpp"` 27 | Java LanguageSettings `yaml:"java"` 28 | Python LanguageSettings `yaml:"python"` 29 | } 30 | 31 | // LanguageSettings contains settings for a specific language 32 | type LanguageSettings struct { 33 | Enabled bool `yaml:"enabled"` 34 | Extensions []string `yaml:"extensions"` 35 | } 36 | 37 | // DefaultConfig returns a default configuration 38 | func DefaultConfig() *Config { 39 | return &Config{ 40 | Paths: PathConfig{ 41 | RepoPath: "./repos", 42 | TagDatePath: "./data/repo_date", 43 | ResultPath: "./data/repo_functions", 44 | }, 45 | Performance: PerformanceConfig{ 46 | MaxWorkers: 0, // 0 means use number of CPU cores 47 | CacheSize: 1000, 48 | MemoryLimit: 0.8, 49 | }, 50 | Languages: LanguagesConfig{ 51 | CPP: LanguageSettings{ 52 | Enabled: true, 53 | Extensions: []string{".c", ".cc", ".cpp", ".cxx", ".h", ".hpp"}, 54 | }, 55 | Java: LanguageSettings{ 56 | Enabled: false, 57 | Extensions: []string{".java"}, 58 | }, 59 | Python: LanguageSettings{ 60 | Enabled: false, 61 | Extensions: []string{".py"}, 62 | }, 63 | }, 64 | } 65 | } -------------------------------------------------------------------------------- /re-centris-go/internal/detector/detector.go: -------------------------------------------------------------------------------- 1 | package detector 2 | 3 | import ( 4 | "context" 5 | "encoding/json" 6 | "fmt" 7 | "os" 8 | "path/filepath" 9 | "sort" 10 | "sync" 11 | 12 | "github.com/re-centris/re-centris-go/internal/analyzer" 13 | 
"github.com/re-centris/re-centris-go/internal/common/logger" 14 | "golang.org/x/sync/errgroup" 15 | ) 16 | 17 | // DetectionResult represents the result of a code similarity detection 18 | type DetectionResult struct { 19 | TargetFile string `json:"target_file"` 20 | Matches []Match `json:"matches"` 21 | TotalFiles int `json:"total_files"` 22 | MatchCount int `json:"match_count"` 23 | } 24 | 25 | // Match represents a single match in the detection result 26 | type Match struct { 27 | File string `json:"file"` 28 | Similarity float64 `json:"similarity"` 29 | Distance int `json:"distance"` 30 | } 31 | 32 | // DetectorOptions contains options for the detector 33 | type DetectorOptions struct { 34 | MaxWorkers int 35 | SimilarityThreshold float64 36 | Languages map[string][]string 37 | KnownFilesDir string 38 | } 39 | 40 | // Detector handles code similarity detection 41 | type Detector struct { 42 | opts DetectorOptions 43 | analyzer *analyzer.Analyzer 44 | } 45 | 46 | // New creates a new Detector 47 | func New(opts DetectorOptions) *Detector { 48 | return &Detector{ 49 | opts: opts, 50 | analyzer: analyzer.New(analyzer.AnalyzerOptions{ 51 | MaxWorkers: opts.MaxWorkers, 52 | Languages: opts.Languages, 53 | }), 54 | } 55 | } 56 | 57 | // DetectSimilarity detects code similarity between target files and known files 58 | func (d *Detector) DetectSimilarity(ctx context.Context, targetFiles []string) ([]*DetectionResult, error) { 59 | // Load known files 60 | knownFiles, err := d.loadKnownFiles(ctx) 61 | if err != nil { 62 | return nil, fmt.Errorf("failed to load known files: %v", err) 63 | } 64 | 65 | // Process target files in parallel 66 | var ( 67 | results []*DetectionResult 68 | resultsMux sync.Mutex 69 | ) 70 | 71 | g, ctx := errgroup.WithContext(ctx) 72 | g.SetLimit(d.opts.MaxWorkers) 73 | 74 | for _, targetFile := range targetFiles { 75 | targetFile := targetFile // Create new variable for goroutine 76 | g.Go(func() error { 77 | // Analyze target file 78 | fileInfo, err := d.analyzer.AnalyzeFile(ctx, targetFile) 79 | if err != nil { 80 | logger.Error("Failed to analyze target file", 81 | zap.String("file", targetFile), 82 | zap.Error(err)) 83 | return err 84 | } 85 | 86 | // Find similar files 87 | similar := d.analyzer.FindSimilarFiles(fileInfo, knownFiles, 88 | int(100 * (1 - d.opts.SimilarityThreshold))) 89 | 90 | // Create matches 91 | matches := make([]Match, len(similar)) 92 | for i, s := range similar { 93 | distance := fileInfo.Hash.Distance(s.Hash) 94 | similarity := 1.0 - float64(distance)/100.0 95 | matches[i] = Match{ 96 | File: s.Path, 97 | Similarity: similarity, 98 | Distance: distance, 99 | } 100 | } 101 | 102 | // Sort matches by similarity (descending) 103 | sort.Slice(matches, func(i, j int) bool { 104 | return matches[i].Similarity > matches[j].Similarity 105 | }) 106 | 107 | // Create result 108 | result := &DetectionResult{ 109 | TargetFile: targetFile, 110 | Matches: matches, 111 | TotalFiles: len(knownFiles), 112 | MatchCount: len(matches), 113 | } 114 | 115 | // Add to results 116 | resultsMux.Lock() 117 | results = append(results, result) 118 | resultsMux.Unlock() 119 | 120 | return nil 121 | }) 122 | } 123 | 124 | if err := g.Wait(); err != nil { 125 | return nil, fmt.Errorf("error while detecting similarities: %v", err) 126 | } 127 | 128 | return results, nil 129 | } 130 | 131 | // loadKnownFiles loads all known files from the specified directory 132 | func (d *Detector) loadKnownFiles(ctx context.Context) ([]*analyzer.FileInfo, error) { 133 | return 
135 | 136 | // SaveResults saves detection results to a JSON file 137 | func (d *Detector) SaveResults(results []*DetectionResult, outputPath string) error { 138 | // Create parent directories if they don't exist 139 | if err := os.MkdirAll(filepath.Dir(outputPath), 0755); err != nil { 140 | return fmt.Errorf("failed to create directories: %v", err) 141 | } 142 | 143 | // Marshal results to JSON 144 | data, err := json.MarshalIndent(results, "", " ") 145 | if err != nil { 146 | return fmt.Errorf("failed to marshal results: %v", err) 147 | } 148 | 149 | // Write to file 150 | if err := os.WriteFile(outputPath, data, 0644); err != nil { 151 | return fmt.Errorf("failed to write results: %v", err) 152 | } 153 | 154 | return nil 155 | } -------------------------------------------------------------------------------- /re-centris-go/internal/preprocessor/preprocessor.go: -------------------------------------------------------------------------------- 1 | package preprocessor 2 | 3 | import ( 4 | "context" 5 | "encoding/json" 6 | "fmt" 7 | "os" 8 | "path/filepath" 9 | 10 | "github.com/re-centris/re-centris-go/internal/analyzer" 11 | "github.com/re-centris/re-centris-go/internal/common/logger" 12 | "go.uber.org/zap" 13 | "golang.org/x/sync/errgroup" 14 | ) 15 | 16 | // FileMetadata contains metadata about a processed file 17 | type FileMetadata struct { 18 | Path string `json:"path"` 19 | Language string `json:"language"` 20 | Hash string `json:"hash"` 21 | Size int64 `json:"size"` 22 | Functions []FunctionInfo `json:"functions,omitempty"` 23 | } 24 | 25 | // FunctionInfo contains information about a function 26 | type FunctionInfo struct { 27 | Name string `json:"name"` 28 | StartLine int `json:"start_line"` 29 | EndLine int `json:"end_line"` 30 | Hash string `json:"hash"` 31 | } 32 | 33 | // PreprocessorOptions contains options for the preprocessor 34 | type PreprocessorOptions struct { 35 | MaxWorkers int 36 | OutputDir string 37 | Languages map[string][]string 38 | MinFileSize int64 39 | MaxFileSize int64 40 | } 41 | 42 | // Preprocessor handles file preprocessing 43 | type Preprocessor struct { 44 | opts PreprocessorOptions 45 | analyzer *analyzer.Analyzer 46 | } 47 | 48 | // New creates a new Preprocessor 49 | func New(opts PreprocessorOptions) *Preprocessor { 50 | return &Preprocessor{ 51 | opts: opts, 52 | analyzer: analyzer.New(analyzer.AnalyzerOptions{ 53 | MaxWorkers: opts.MaxWorkers, 54 | Languages: opts.Languages, 55 | }), 56 | } 57 | } 58 | 59 | // ProcessDirectory processes all files in a directory 60 | func (p *Preprocessor) ProcessDirectory(ctx context.Context, dir string) error { 61 | // Create output directory if it doesn't exist 62 | if err := os.MkdirAll(p.opts.OutputDir, 0755); err != nil { 63 | return fmt.Errorf("failed to create output directory: %v", err) 64 | } 65 | 66 | // Analyze all files in directory 67 | files, err := p.analyzer.AnalyzeDirectory(ctx, dir) 68 | if err != nil { 69 | return fmt.Errorf("failed to analyze directory: %v", err) 70 | } 71 | 72 | // Process files in parallel 73 | g, ctx := errgroup.WithContext(ctx) 74 | g.SetLimit(p.opts.MaxWorkers) 75 | 76 | for _, file := range files { 77 | file := file // Create new variable for goroutine 78 | g.Go(func() error { 79 | // Skip files that are too small or too large 80 | if file.Size < p.opts.MinFileSize || 81 | (p.opts.MaxFileSize > 0 && file.Size > p.opts.MaxFileSize) { 82 | return nil 83 | } 84 | 85 | metadata := &FileMetadata{ 86 | Path: file.Path, 87 | 
Language: file.Language, 88 | Hash: file.Hash.String(), 89 | Size: file.Size, 90 | } 91 | 92 | // Extract functions if supported 93 | if funcs, err := p.extractFunctions(file); err == nil { 94 | metadata.Functions = funcs 95 | } 96 | 97 | // Save metadata 98 | if err := p.saveMetadata(metadata); err != nil { 99 | logger.Error("Failed to save metadata", 100 | zap.String("path", file.Path), 101 | zap.Error(err)) 102 | return err 103 | } 104 | 105 | return nil 106 | }) 107 | } 108 | 109 | return g.Wait() 110 | } 111 | 112 | // extractFunctions extracts function information from a file 113 | func (p *Preprocessor) extractFunctions(file *analyzer.FileInfo) ([]FunctionInfo, error) { 114 | // TODO: Implement function extraction using language-specific parsers 115 | // This is a placeholder that should be replaced with actual implementation 116 | return nil, nil 117 | } 118 | 119 | // saveMetadata saves file metadata to JSON file 120 | func (p *Preprocessor) saveMetadata(metadata *FileMetadata) error { 121 | // Create output filename based on file path 122 | relPath, err := filepath.Rel("/", metadata.Path) 123 | if err != nil { 124 | relPath = metadata.Path 125 | } 126 | outPath := filepath.Join(p.opts.OutputDir, 127 | fmt.Sprintf("%s.json", filepath.ToSlash(relPath))) 128 | 129 | // Create parent directories if they don't exist 130 | if err := os.MkdirAll(filepath.Dir(outPath), 0755); err != nil { 131 | return fmt.Errorf("failed to create directories: %v", err) 132 | } 133 | 134 | // Marshal metadata to JSON 135 | data, err := json.MarshalIndent(metadata, "", " ") 136 | if err != nil { 137 | return fmt.Errorf("failed to marshal metadata: %v", err) 138 | } 139 | 140 | // Write to file 141 | if err := os.WriteFile(outPath, data, 0644); err != nil { 142 | return fmt.Errorf("failed to write metadata: %v", err) 143 | } 144 | 145 | return nil 146 | } -------------------------------------------------------------------------------- /re-centris-go/tests/integration/clone_analyze_test.go: -------------------------------------------------------------------------------- 1 | package integration 2 | 3 | import ( 4 | "os" 5 | "path/filepath" 6 | "testing" 7 | 8 | "github.com/re-centris/re-centris-go/internal/analyzer" 9 | "github.com/re-centris/re-centris-go/internal/collector/clone" 10 | "github.com/re-centris/re-centris-go/internal/common/config" 11 | ) 12 | 13 | func TestCloneAndAnalyze(t *testing.T) { 14 | // Skip if running in CI environment 15 | if os.Getenv("CI") != "" { 16 | t.Skip("Skipping integration test in CI environment") 17 | } 18 | 19 | // Create temporary directories 20 | tmpDir, err := os.MkdirTemp("", "re-centris-test-*") 21 | if err != nil { 22 | t.Fatalf("Failed to create temp dir: %v", err) 23 | } 24 | defer os.RemoveAll(tmpDir) 25 | 26 | repoDir := filepath.Join(tmpDir, "repos") 27 | analysisDir := filepath.Join(tmpDir, "analysis") 28 | 29 | // Create test configuration 30 | cfg := &config.Config{ 31 | Clone: config.CloneConfig{ 32 | OutputPath: repoDir, 33 | Workers: 2, 34 | }, 35 | Analysis: config.AnalysisConfig{ 36 | OutputPath: analysisDir, 37 | Workers: 2, 38 | }, 39 | Languages: config.LanguagesConfig{ 40 | CPP: config.LanguageConfig{ 41 | Enabled: true, 42 | Extensions: []string{".cpp", ".h"}, 43 | }, 44 | }, 45 | } 46 | 47 | // Test repository to clone (use a small, public repo) 48 | testRepo := "https://github.com/google/googletest.git" 49 | 50 | // Initialize cloner 51 | cloner := clone.New(cfg) 52 | 53 | // Clone repository 54 | err = cloner.Clone([]string{testRepo}) 55 | if 
/re-centris-go/tests/integration/clone_analyze_test.go:
--------------------------------------------------------------------------------
1 | package integration
2 | 
3 | import (
4 | 	"os"
5 | 	"path/filepath"
6 | 	"testing"
7 | 
8 | 	"github.com/re-centris/re-centris-go/internal/analyzer"
9 | 	"github.com/re-centris/re-centris-go/internal/collector/clone"
10 | 	"github.com/re-centris/re-centris-go/internal/config"
11 | )
12 | 
13 | func TestCloneAndAnalyze(t *testing.T) {
14 | 	// Skip if running in CI environment
15 | 	if os.Getenv("CI") != "" {
16 | 		t.Skip("Skipping integration test in CI environment")
17 | 	}
18 | 
19 | 	// Create temporary directories
20 | 	tmpDir, err := os.MkdirTemp("", "re-centris-test-*")
21 | 	if err != nil {
22 | 		t.Fatalf("Failed to create temp dir: %v", err)
23 | 	}
24 | 	defer os.RemoveAll(tmpDir)
25 | 
26 | 	repoDir := filepath.Join(tmpDir, "repos")
27 | 	analysisDir := filepath.Join(tmpDir, "analysis")
28 | 
29 | 	// Create test configuration
30 | 	cfg := &config.Config{
31 | 		Clone: config.CloneConfig{
32 | 			OutputPath: repoDir,
33 | 			Workers:    2,
34 | 		},
35 | 		Analysis: config.AnalysisConfig{
36 | 			OutputPath: analysisDir,
37 | 			Workers:    2,
38 | 		},
39 | 		Languages: config.LanguagesConfig{
40 | 			CPP: config.LanguageConfig{
41 | 				Enabled:    true,
42 | 				Extensions: []string{".cpp", ".h"},
43 | 			},
44 | 		},
45 | 	}
46 | 
47 | 	// Test repository to clone (use a small, public repo)
48 | 	testRepo := "https://github.com/google/googletest.git"
49 | 
50 | 	// Initialize cloner
51 | 	cloner := clone.New(cfg)
52 | 
53 | 	// Clone repository
54 | 	err = cloner.Clone([]string{testRepo})
55 | 	if err != nil {
56 | 		t.Fatalf("Failed to clone repository: %v", err)
57 | 	}
58 | 
59 | 	// Verify repository was cloned
60 | 	if _, err := os.Stat(repoDir); os.IsNotExist(err) {
61 | 		t.Errorf("Repository directory was not created")
62 | 	}
63 | 
64 | 	// Initialize analyzer
65 | 	analyzer := analyzer.New(cfg)
66 | 
67 | 	// Analyze cloned repository
68 | 	err = analyzer.Analyze(repoDir)
69 | 	if err != nil {
70 | 		t.Fatalf("Failed to analyze repository: %v", err)
71 | 	}
72 | 
73 | 	// Verify analysis output
74 | 	if _, err := os.Stat(analysisDir); os.IsNotExist(err) {
75 | 		t.Errorf("Analysis directory was not created")
76 | 	}
77 | 
78 | 	// Check for analysis results
79 | 	files, err := filepath.Glob(filepath.Join(analysisDir, "*.json"))
80 | 	if err != nil {
81 | 		t.Fatalf("Failed to list analysis files: %v", err)
82 | 	}
83 | 	if len(files) == 0 {
84 | 		t.Error("No analysis results were generated")
85 | 	}
86 | }
87 | 
88 | func TestAnalyzeWithInvalidInput(t *testing.T) {
89 | 	tmpDir, err := os.MkdirTemp("", "re-centris-test-*")
90 | 	if err != nil {
91 | 		t.Fatalf("Failed to create temp dir: %v", err)
92 | 	}
93 | 	defer os.RemoveAll(tmpDir)
94 | 
95 | 	cfg := &config.Config{
96 | 		Analysis: config.AnalysisConfig{
97 | 			OutputPath: tmpDir,
98 | 			Workers:    1,
99 | 		},
100 | 	}
101 | 
102 | 	analyzer := analyzer.New(cfg)
103 | 
104 | 	// Test with non-existent directory
105 | 	err = analyzer.Analyze("/nonexistent/path")
106 | 	if err == nil {
107 | 		t.Error("Expected error when analyzing non-existent directory")
108 | 	}
109 | 
110 | 	// Test with empty directory
111 | 	emptyDir := filepath.Join(tmpDir, "empty")
112 | 	if err := os.MkdirAll(emptyDir, 0755); err != nil {
113 | 		t.Fatalf("Failed to create empty directory: %v", err)
114 | 	}
115 | 
116 | 	err = analyzer.Analyze(emptyDir)
117 | 	if err != nil {
118 | 		t.Errorf("Unexpected error analyzing empty directory: %v", err)
119 | 	}
120 | }
--------------------------------------------------------------------------------
/re-centris-go/tests/security/security_test.go:
--------------------------------------------------------------------------------
1 | package security
2 | 
3 | import (
4 | 	"fmt"
5 | 	"os"
6 | 	"path/filepath"
7 | 	"strings"
8 | 	"sync"
9 | 	"testing"
10 | 	"time"
11 | 
12 | 	"github.com/re-centris/re-centris-go/internal/analyzer"
13 | 	"github.com/re-centris/re-centris-go/internal/config"
14 | 	"github.com/re-centris/re-centris-go/internal/common/monitor"
15 | )
16 | 
17 | func TestPathTraversal(t *testing.T) {
18 | 	tmpDir, err := os.MkdirTemp("", "re-centris-security-*")
19 | 	if err != nil {
20 | 		t.Fatalf("Failed to create temp dir: %v", err)
21 | 	}
22 | 	defer os.RemoveAll(tmpDir)
23 | 
24 | 	maliciousPaths := []string{
25 | 		"../../../etc/passwd",
26 | 		"..\\..\\..\\Windows\\System32",
27 | 		"/etc/shadow",
28 | 		"C:\\Windows\\System32\\config",
29 | 		filepath.Join(tmpDir, ".."),
30 | 	}
31 | 
32 | 	cfg := &config.Config{
33 | 		Analysis: config.AnalysisConfig{
34 | 			OutputPath: tmpDir,
35 | 		},
36 | 	}
37 | 
38 | 	analyzer := analyzer.New(cfg)
39 | 
40 | 	for _, path := range maliciousPaths {
41 | 		err := analyzer.Analyze(path)
42 | 		if err == nil {
43 | 			t.Errorf("Expected error for malicious path: %s", path)
44 | 		}
45 | 	}
46 | }
47 | 
48 | func TestMemoryLimit(t *testing.T) {
49 | 	mon := monitor.New(100 * time.Millisecond)
50 | 	mon.Start()
51 | 	defer mon.Stop()
52 | 
53 | 	// Allocate memory gradually
54 | 	var slices [][]byte
55 | 	defer func() {
56 | 		slices = nil
57 | 	}()
58 | 
59 | 	// Try to allocate memory until we hit the limit
60 | 	for i := 0; i < 100; i++ {
61 | 		if !mon.CheckMemoryLimit(0.8) { // 
80% memory limit
62 | 			// Memory limit reached, test passed
63 | 			return
64 | 		}
65 | 		// Allocate 1MB
66 | 		slices = append(slices, make([]byte, 1024*1024))
67 | 	}
68 | 
69 | 	t.Error("Memory limit was not enforced")
70 | }
71 | 
72 | func TestConcurrentAccess(t *testing.T) {
73 | 	tmpDir, err := os.MkdirTemp("", "re-centris-security-*")
74 | 	if err != nil {
75 | 		t.Fatalf("Failed to create temp dir: %v", err)
76 | 	}
77 | 	defer os.RemoveAll(tmpDir)
78 | 
79 | 	cfg := &config.Config{
80 | 		Analysis: config.AnalysisConfig{
81 | 			OutputPath: tmpDir,
82 | 			Workers:    4,
83 | 		},
84 | 	}
85 | 
86 | 	analyzer := analyzer.New(cfg)
87 | 
88 | 	// Create test files
89 | 	testFiles := make([]string, 10)
90 | 	for i := range testFiles {
91 | 		file := filepath.Join(tmpDir, fmt.Sprintf("test%d.cpp", i))
92 | 		if err := os.WriteFile(file, []byte("int main() { return 0; }"), 0644); err != nil {
93 | 			t.Fatalf("Failed to create test file: %v", err)
94 | 		}
95 | 		testFiles[i] = file
96 | 	}
97 | 
98 | 	// Test concurrent access
99 | 	var wg sync.WaitGroup
100 | 	errors := make(chan error, len(testFiles))
101 | 
102 | 	for _, file := range testFiles {
103 | 		wg.Add(1)
104 | 		go func(f string) {
105 | 			defer wg.Done()
106 | 			if err := analyzer.Analyze(f); err != nil {
107 | 				errors <- err
108 | 			}
109 | 		}(file)
110 | 	}
111 | 
112 | 	// Wait for all goroutines to finish
113 | 	wg.Wait()
114 | 	close(errors)
115 | 
116 | 	// Check for errors
117 | 	for err := range errors {
118 | 		t.Errorf("Concurrent analysis error: %v", err)
119 | 	}
120 | }
121 | 
122 | func TestResourceExhaustion(t *testing.T) {
123 | 	tmpDir, err := os.MkdirTemp("", "re-centris-security-*")
124 | 	if err != nil {
125 | 		t.Fatalf("Failed to create temp dir: %v", err)
126 | 	}
127 | 	defer os.RemoveAll(tmpDir)
128 | 
129 | 	cfg := &config.Config{
130 | 		Analysis: config.AnalysisConfig{
131 | 			OutputPath: tmpDir,
132 | 			Workers:    1000, // Excessive number of workers
133 | 		},
134 | 	}
135 | 
136 | 	analyzer := analyzer.New(cfg)
137 | 
138 | 	// Create a large file
139 | 	largeFile := filepath.Join(tmpDir, "large.cpp")
140 | 	f, err := os.Create(largeFile)
141 | 	if err != nil {
142 | 		t.Fatalf("Failed to create large file: %v", err)
143 | 	}
144 | 
145 | 	// Write 100MB of data
146 | 	data := make([]byte, 1024)
147 | 	for i := 0; i < 1024*100; i++ {
148 | 		if _, err := f.Write(data); err != nil {
149 | 			f.Close()
150 | 			t.Fatalf("Failed to write to large file: %v", err)
151 | 		}
152 | 	}
153 | 	f.Close()
154 | 
155 | 	// Set timeout for the test
156 | 	done := make(chan bool)
157 | 	go func() {
158 | 		err := analyzer.Analyze(largeFile)
159 | 		if err != nil {
160 | 			t.Logf("Analysis error (expected): %v", err)
161 | 		}
162 | 		done <- true
163 | 	}()
164 | 
165 | 	select {
166 | 	case <-done:
167 | 		// Test completed within timeout
168 | 	case <-time.After(30 * time.Second):
169 | 		t.Error("Analysis took too long, possible resource exhaustion")
170 | 	}
171 | }
172 | 
173 | func TestInputValidation(t *testing.T) {
174 | 	tmpDir, err := os.MkdirTemp("", "re-centris-security-*")
175 | 	if err != nil {
176 | 		t.Fatalf("Failed to create temp dir: %v", err)
177 | 	}
178 | 	defer os.RemoveAll(tmpDir)
179 | 
180 | 	cfg := &config.Config{
181 | 		Analysis: config.AnalysisConfig{
182 | 			OutputPath: tmpDir,
183 | 		},
184 | 	}
185 | 
186 | 	analyzer := analyzer.New(cfg)
187 | 
188 | 	invalidInputs := []struct {
189 | 		name string
190 | 		path string
191 | 	}{
192 | 		{"empty path", ""},
193 | 		{"space only", " "},
194 | 		{"invalid chars", string([]byte{0x00, 0x01, 0x02})},
195 | 		{"very long path", strings.Repeat("a", 4096)},
196 | 	}
197 | 
198 | 	for _, tc := range invalidInputs
 {
199 | 		t.Run(tc.name, func(t *testing.T) {
200 | 			err := analyzer.Analyze(tc.path)
201 | 			if err == nil {
202 | 				t.Errorf("Expected error for invalid input: %s", tc.name)
203 | 			}
204 | 		})
205 | 	}
206 | }
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | # Core dependencies
2 | tlsh==4.8.2
3 | PyYAML>=6.0.1
4 | psutil==5.9.5
5 | chardet>=4.0.0
6 | 
7 | # Data processing
8 | numpy>=1.20.0
9 | pandas>=1.3.0
10 | 
11 | # Web API
12 | flask>=2.0.0
13 | flask-cors>=3.0.10
14 | gunicorn>=20.1.0
15 | 
16 | # Database
17 | sqlalchemy>=1.4.0
18 | 
19 | # Testing tools
20 | pytest>=7.4.0
21 | pytest-cov>=4.1.0
22 | pytest-xdist>=3.3.0
23 | pytest-benchmark>=4.0.0
24 | pytest-mock>=3.11.0
25 | pytest-timeout>=2.1.0
26 | pytest-randomly>=3.13.0
27 | coverage>=7.3.0
28 | codecov>=2.1.0
29 | html-testRunner==1.2.1
30 | 
31 | # Code quality
32 | flake8>=6.1.0
33 | black>=23.7.0
34 | isort>=5.12.0
35 | mypy>=1.5.0
36 | pylint>=2.17.0
37 | bandit>=1.7.0
38 | safety>=2.3.0
39 | 
40 | # Documentation
41 | sphinx>=7.1.0
42 | sphinx-rtd-theme>=1.3.0
43 | sphinx-autodoc-typehints>=1.24.0
44 | sphinx-copybutton>=0.5.0
45 | 
46 | # Development and debugging
47 | ipython>=8.14.0
48 | ipdb>=0.13.0
49 | debugpy>=1.6.0
50 | build>=1.0.0
51 | twine>=4.0.0
52 | wheel>=0.41.0
53 | 
54 | # Performance profiling
55 | memory_profiler>=0.61.0
56 | line_profiler>=4.1.0
57 | py-spy>=0.3.0
58 | 
59 | # Type stubs
60 | types-PyYAML>=6.0.12.12
61 | types-psutil==5.9.5.17
62 | types-requests>=2.31.0
63 | types-setuptools>=68.0.0
64 | 
65 | # Code parsing
66 | javalang==0.13.0
67 | libclang==16.0.6
68 | 
69 | # Code clone detection
70 | scikit-learn>=1.0.0
71 | gensim>=4.0.0
72 | nltk>=3.6.0
73 | 
74 | # Version prediction
75 | scipy>=1.7.0
76 | statsmodels>=0.13.0
77 | xgboost>=1.5.0
78 | lightgbm>=3.3.0
79 | catboost>=1.0.0
--------------------------------------------------------------------------------
/scripts/deploy.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # Exit on the first error
4 | set -e
5 | 
6 | # Echo commands as they run
7 | set -x
8 | 
9 | # Check required environment variables
10 | if [ -z "$DOCKER_USERNAME" ] || [ -z "$DOCKER_PASSWORD" ]; then
11 |     echo "Error: DOCKER_USERNAME or DOCKER_PASSWORD not set"
12 |     exit 1
13 | fi
14 | 
15 | # Log in to Docker Hub
16 | echo "$DOCKER_PASSWORD" | docker login -u "$DOCKER_USERNAME" --password-stdin
17 | 
18 | # Build images
19 | docker-compose build
20 | 
21 | # Run tests and record the outcome explicitly ("set -e" would otherwise abort the script on failure before the check below ever ran)
22 | docker-compose run web pytest && tests_passed=true || tests_passed=false
23 | 
24 | # Push images only if the tests passed
25 | if [ "$tests_passed" = true ]; then
26 |     docker-compose push
27 | 
28 |     # Deploy to production
29 |     if [ "$DEPLOY_ENV" = "production" ]; then
30 |         # Back up the database
31 |         docker-compose exec postgres pg_dump -U re_centris re_centris > backup.sql
32 | 
33 |         # Stop the old containers
34 |         docker-compose down
35 | 
36 |         # Start the new containers
37 |         docker-compose up -d
38 | 
39 |         # Wait for services to start
40 |         sleep 30
41 | 
42 |         # Check service health
43 |         docker-compose ps | grep "Up" || {
44 |             echo "Error: Service failed to start"
45 |             docker-compose logs
46 |             exit 1
47 |         }
48 | 
49 |         # Run database migrations
50 |         docker-compose exec web python manage.py db upgrade
51 | 
52 |         echo "Deployment successful!"
53 |     else
54 |         echo "Skipping production deployment"
55 |     fi
56 | else
57 |     echo "Tests failed, aborting deployment"
58 |     exit 1
59 | fi
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
1 | """Re-Centris test package
2 | 
3 | This package contains all test cases for the Re-Centris project, including:
4 | 1. Unit tests - exercise each module's functionality in isolation
5 | 2. Integration tests - exercise interactions between modules
6 | 3. Performance tests - exercise system performance and resource usage
7 | 
8 | Author: byRen2002
9 | Last modified: March 2025
10 | License: MIT License
11 | """
--------------------------------------------------------------------------------
/tests/core/test_cache.py:
--------------------------------------------------------------------------------
1 | """Cache system test module
2 | 
3 | This module contains unit tests for the Cache class.
4 | 
5 | Author: byRen2002
6 | Last modified: March 2025
7 | License: MIT License
8 | """
9 | 
10 | import unittest
11 | import time
12 | from unittest.mock import patch, MagicMock
13 | import tempfile
14 | import os
15 | 
16 | from core.cache import Cache
17 | 
18 | class TestCache(unittest.TestCase):
19 |     """Test cases for the Cache class"""
20 | 
21 |     def setUp(self):
22 |         """Set up test fixtures"""
23 |         self.cache_size = 5
24 |         self.expire_time = 1  # expire after 1 second
25 |         self.cache = Cache(self.cache_size, self.expire_time)
26 | 
27 |     def test_basic_operations(self):
28 |         """Test basic cache operations"""
29 |         # Set and get
30 |         self.cache.set("key1", "value1")
31 |         self.assertEqual(self.cache.get("key1"), "value1")
32 | 
33 |         # Missing key
34 |         self.assertIsNone(self.cache.get("nonexistent"))
35 | 
36 |     def test_cache_size_limit(self):
37 |         """Test the cache size limit"""
38 |         # Add more items than the limit allows
39 |         for i in range(self.cache_size + 2):
40 |             self.cache.set(f"key{i}", f"value{i}")
41 | 
42 |         # The cache must not exceed its size limit
43 |         self.assertLessEqual(len(self.cache.cache), self.cache_size)
44 | 
45 |         # The oldest entry should have been evicted
46 |         self.assertIsNone(self.cache.get("key0"))
47 |         self.assertIsNotNone(self.cache.get(f"key{self.cache_size+1}"))
48 | 
49 |     def test_expiration(self):
50 |         """Test entry expiration"""
51 |         self.cache.set("expire_key", "expire_value")
52 | 
53 |         # Wait for the entry to expire
54 |         time.sleep(self.expire_time + 0.1)
55 | 
56 |         # The entry should be gone
57 |         self.assertIsNone(self.cache.get("expire_key"))
58 | 
59 |     def test_clear(self):
60 |         """Test clearing the cache"""
61 |         # Add some entries
62 |         self.cache.set("key1", "value1")
63 |         self.cache.set("key2", "value2")
64 | 
65 |         # Clear the cache
66 |         self.cache.clear()
67 | 
68 |         # The cache should be empty
69 |         self.assertEqual(len(self.cache.cache), 0)
70 |         self.assertEqual(len(self.cache.access_times), 0)
71 | 
72 |     def test_update_access_time(self):
73 |         """Test access-time updates"""
74 |         self.cache.set("key", "value")
75 |         first_access = self.cache.access_times["key"]
76 | 
77 |         # Wait briefly
78 |         time.sleep(0.1)
79 | 
80 |         # Access again
81 |         self.cache.get("key")
82 |         second_access = self.cache.access_times["key"]
83 | 
84 |         # The access time should have been updated
85 |         self.assertGreater(second_access, first_access)
86 | 
87 |     def test_persistence(self):
88 |         """Test cache persistence"""
89 |         # Create a temporary directory
90 |         temp_dir = tempfile.mkdtemp()
91 |         cache_file = os.path.join(temp_dir, "cache.db")
92 | 
93 |         try:
94 |             # Create a persistent cache
95 |             persistent_cache = Cache(
96 |                 self.cache_size,
97 |                 self.expire_time,
98 |                 persistent=True,
99 |                 cache_file=cache_file
100 |             )
101 | 
102 |             # Add data
103 |             persistent_cache.set("persist_key", "persist_value")
104 | 
105 |             # Close the cache
106 |             persistent_cache.close()
107 | 
108 |             # Recreate the cache and verify the data survived
109 |             new_cache = Cache(
110 |                 self.cache_size,
111 |                 self.expire_time,
112 |                 persistent=True,
113 |                 cache_file=cache_file
114 |             )
115 | 
116 |             self.assertEqual(new_cache.get("persist_key"), "persist_value")
117 | 
118 |         finally:
119 |             # Clean up
120 |             if os.path.exists(cache_file):
121 |                 os.remove(cache_file)
122 |             os.rmdir(temp_dir)
123 | 
124 |     def test_thread_safety(self):
125 |         """Test thread safety"""
126 |         import threading
127 | 
128 |         def worker():
129 |             for i in range(100):
130 |                 self.cache.set(f"thread_key_{i}", f"thread_value_{i}")
131 |                 self.cache.get(f"thread_key_{i}")
132 | 
133 |         # Several threads operate on the cache concurrently
134 |         threads = [threading.Thread(target=worker) for _ in range(4)]
135 | 
136 |         # Start all threads
137 |         for thread in threads:
138 |             thread.start()
139 | 
140 |         # Wait for all threads to finish
141 |         for thread in threads:
142 |             thread.join()
143 | 
144 |         # The cache should still be in a consistent state
145 |         self.assertLessEqual(len(self.cache.cache), self.cache_size)
146 | 
147 |     def test_invalid_inputs(self):
148 |         """Test handling of invalid constructor arguments"""
149 |         # Invalid cache size
150 |         with self.assertRaises(ValueError):
151 |             Cache(-1, self.expire_time)
152 | 
153 |         # Invalid expiry time
154 |         with self.assertRaises(ValueError):
155 |             Cache(self.cache_size, -1)
156 | 
157 |     def test_memory_management(self):
158 |         """Test memory management"""
159 |         large_data = "x" * 1024 * 1024  # 1MB of data
160 | 
161 |         # Add a large amount of data
162 |         for i in range(10):
163 |             self.cache.set(f"large_key_{i}", large_data)
164 | 
165 |         # The size limit should still hold
166 |         self.assertLessEqual(len(self.cache.cache), self.cache_size)
167 | 
168 | if __name__ == '__main__':
169 |     unittest.main()
--------------------------------------------------------------------------------
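The tests above pin down the behaviour expected of core/cache.Cache: bounded size with eviction of the least-recently-used entry, per-entry expiry, thread safety, and optional persistence. A minimal sketch of the in-memory core consistent with those tests; this is inferred from the test suite, not copied from the repository's implementation (which also handles persistence and richer validation):

import threading
import time

class LRUCacheSketch:
    """Bounded, expiring, thread-safe cache (illustrative only)."""

    def __init__(self, max_size: int, expire_time: float):
        if max_size <= 0 or expire_time <= 0:
            raise ValueError("max_size and expire_time must be positive")
        self.max_size = max_size
        self.expire_time = expire_time
        self.cache = {}          # key -> value
        self.access_times = {}   # key -> last access timestamp
        self._lock = threading.Lock()

    def set(self, key, value):
        with self._lock:
            if key not in self.cache and len(self.cache) >= self.max_size:
                # Evict the least recently used entry
                oldest = min(self.access_times, key=self.access_times.get)
                self.cache.pop(oldest)
                self.access_times.pop(oldest)
            self.cache[key] = value
            self.access_times[key] = time.time()

    def get(self, key):
        with self._lock:
            if key not in self.cache:
                return None
            if time.time() - self.access_times[key] > self.expire_time:
                # Expired: drop the entry and report a miss
                self.cache.pop(key)
                self.access_times.pop(key)
                return None
            self.access_times[key] = time.time()
            return self.cache[key]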
/tests/core/test_config_manager.py:
--------------------------------------------------------------------------------
1 | """Configuration manager test module
2 | 
3 | This module contains unit tests for the ConfigManager class.
4 | 
5 | Author: byRen2002
6 | Last modified: March 2025
7 | License: MIT License
8 | """
9 | 
10 | import unittest
11 | import os
12 | import tempfile
13 | import yaml
14 | from unittest.mock import patch, MagicMock
15 | 
16 | from core.config_manager import ConfigManager
17 | 
18 | class TestConfigManager(unittest.TestCase):
19 |     """Test cases for the ConfigManager class"""
20 | 
21 |     def setUp(self):
22 |         """Set up test fixtures"""
23 |         # Create a temporary config file
24 |         self.temp_dir = tempfile.mkdtemp()
25 |         self.config_file = os.path.join(self.temp_dir, "config.yaml")
26 | 
27 |         # Test configuration data
28 |         self.test_config = {
29 |             "paths": {
30 |                 "repo": "/path/to/repo",
31 |                 "results": "/path/to/results",
32 |                 "logs": "/path/to/logs"
33 |             },
34 |             "performance": {
35 |                 "max_workers": 4,
36 |                 "cache_size": 1000,
37 |                 "memory_limit": 1024,
38 |                 "timeout": 300
39 |             },
40 |             "logging": {
41 |                 "level": "INFO",
42 |                 "max_size": 10,
43 |                 "backup_count": 5
44 |             }
45 |         }
46 | 
47 |         # Write the test config
48 |         with open(self.config_file, 'w') as f:
49 |             yaml.dump(self.test_config, f)
50 | 
51 |         # Instantiate ConfigManager
52 |         self.config_manager = ConfigManager(self.config_file)
53 | 
54 |     def tearDown(self):
55 |         """Clean up after tests"""
56 |         # Remove temp files and directory
57 |         if os.path.exists(self.config_file):
58 |             os.remove(self.config_file)
59 |         os.rmdir(self.temp_dir)
60 | 
61 |     def test_load_config(self):
62 |         """Test config loading"""
63 |         # Values should be loaded correctly
64 |         self.assertEqual(
65 |             self.config_manager.get("paths.repo"),
66 |             "/path/to/repo"
67 |         )
68 |         self.assertEqual(
69 |             self.config_manager.get("performance.max_workers"),
70 |             4
71 |         )
72 | 
73 |     def test_get_nested_value(self):
74 |         """Test fetching nested config values"""
75 |         # Multi-level nesting
76 |         self.assertEqual(
77 |             self.config_manager.get("paths.repo"),
78 |             "/path/to/repo"
79 |         )
80 | 
81 |         # Missing path
82 |         self.assertIsNone(
83 |             self.config_manager.get("nonexistent.path")
84 |         )
85 | 
86 |         # Default value
87 |         self.assertEqual(
88 |             self.config_manager.get("nonexistent.path", "default"),
89 |             "default"
90 |         )
91 | 
92 |     def test_set_value(self):
93 |         """Test setting config values"""
94 |         # Set a new value
95 |         self.config_manager.set("paths.new_path", "/new/path")
96 | 
97 |         # It should be readable
98 |         self.assertEqual(
99 |             self.config_manager.get("paths.new_path"),
100 |             "/new/path"
101 |         )
102 | 
103 |         # Update an existing value
104 |         self.config_manager.set("paths.repo", "/updated/path")
105 |         self.assertEqual(
106 |             self.config_manager.get("paths.repo"),
107 |             "/updated/path"
108 |         )
109 | 
110 |     def test_save_config(self):
111 |         """Test saving the config"""
112 |         # Modify the config
113 |         self.config_manager.set("paths.new_path", "/new/path")
114 | 
115 |         # Save it
116 |         self.config_manager.save()
117 | 
118 |         # Reload and verify
119 |         new_config = ConfigManager(self.config_file)
120 |         self.assertEqual(
121 |             new_config.get("paths.new_path"),
122 |             "/new/path"
123 |         )
124 | 
125 |     def test_environment_override(self):
126 |         """Test environment-variable overrides"""
127 |         with patch.dict('os.environ', {
128 |             'RE_CENTRIS_PATHS_REPO': '/env/path',
129 |             'RE_CENTRIS_PERFORMANCE_MAX_WORKERS': '8'
130 |         }):
131 |             # Reload the config
132 |             config = ConfigManager(self.config_file)
133 | 
134 |             # Environment values should win
135 |             self.assertEqual(config.get("paths.repo"), "/env/path")
136 |             self.assertEqual(config.get("performance.max_workers"), 8)
137 | 
138 |     def test_validation(self):
139 |         """Test config validation"""
140 |         # Required fields
141 |         invalid_config = {"paths": {}}
142 |         with tempfile.NamedTemporaryFile(mode='w', delete=False) as f:
143 |             yaml.dump(invalid_config, f)
144 | 
145 |         with self.assertRaises(ValueError):
146 |             ConfigManager(f.name)
147 | 
148 |         os.unlink(f.name)
149 | 
150 |     def test_type_conversion(self):
151 |         """Test type conversion"""
152 |         # Numeric conversion
153 |         self.assertIsInstance(
154 |             self.config_manager.get("performance.max_workers"),
155 |             int
156 |         )
157 | 
158 |         # Boolean conversion
159 |         self.config_manager.set("feature.enabled", "true")
160 |         self.assertIsInstance(
161 |             self.config_manager.get("feature.enabled"),
162 |             bool
163 |         )
164 | 
165 |     def test_merge_configs(self):
166 |         """Test merging configs"""
167 |         # A second config file
168 |         other_config = {
169 |             "paths": {
170 |                 "temp": "/path/to/temp"
171 |             },
172 |             "new_section": {
173 |                 "key": "value"
174 |             }
175 |         }
176 | 
177 |         other_file = os.path.join(self.temp_dir, "other.yaml")
178 |         with open(other_file, 'w') as f:
179 |             yaml.dump(other_config, f)
180 | 
181 |         # Merge
182 |         self.config_manager.merge(other_file)
183 | 
184 |         # Verify the merge
185 |         self.assertEqual(
186 |             self.config_manager.get("paths.temp"),
187 |             "/path/to/temp"
188 |         )
189 |         self.assertEqual(
190 |             self.config_manager.get("new_section.key"),
191 |             "value"
192 |         )
193 | 
194 |         # Clean up
195 |         os.remove(other_file)
196 | 
197 | if __name__ == '__main__':
198 |     unittest.main()
--------------------------------------------------------------------------------
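test_environment_override above relies on a naming convention: RE_CENTRIS_<SECTION>_<KEY> overrides section.key, with values coerced to the target type. A sketch of that lookup, assuming the prefix and dot-path convention shown in the test (the helper name is illustrative, not from the repository):

import os

def env_override(dot_path: str, default=None, prefix: str = "RE_CENTRIS"):
    """Return the environment override for a config path like 'paths.repo'."""
    env_key = prefix + "_" + dot_path.replace(".", "_").upper()
    raw = os.environ.get(env_key)
    if raw is None:
        return default
    # Coerce common scalar types; other strings pass through unchanged.
    if raw.lower() in ("true", "false"):
        return raw.lower() == "true"
    try:
        return int(raw)
    except ValueError:
        return raw

# e.g. with RE_CENTRIS_PERFORMANCE_MAX_WORKERS=8 set,
# env_override("performance.max_workers") returns the integer 8.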
/tests/core/test_memory_optimizer.py:
--------------------------------------------------------------------------------
1 | """Memory optimizer test module
2 | 
3 | This module contains unit tests for the MemoryOptimizer class.
4 | 
5 | Author: byRen2002
6 | Last modified: March 2025
7 | License: MIT License
8 | """
9 | 
10 | import unittest
11 | import os
12 | import tempfile
13 | import psutil, yaml
14 | from unittest.mock import patch, MagicMock
15 | 
16 | from core.memory_optimizer import MemoryOptimizer
17 | from core.config_manager import ConfigManager
18 | 
19 | class TestMemoryOptimizer(unittest.TestCase):
20 |     """Test cases for the MemoryOptimizer class"""
21 | 
22 |     def setUp(self):
23 |         """Set up test fixtures"""
24 |         # Create a temporary config file
25 |         self.temp_dir = tempfile.mkdtemp()
26 |         self.config_file = os.path.join(self.temp_dir, "config.yaml")
27 | 
28 |         # Test configuration data
29 |         self.test_config = {
30 |             "memory": {
31 |                 "limit": 1024 * 1024 * 1024,      # 1GB
32 |                 "threshold": 0.8,                  # 80%
33 |                 "cleanup_threshold": 0.9,          # 90%
34 |                 "min_free": 512 * 1024 * 1024      # 512MB
35 |             }
36 |         }
37 | 
38 |         # Write the test config
39 |         with open(self.config_file, 'w') as f:
40 |             yaml.dump(self.test_config, f)
41 | 
42 |         # Instantiate ConfigManager
43 |         self.config_manager = ConfigManager(self.config_file)
44 | 
45 |         # Instantiate MemoryOptimizer
46 |         self.memory_optimizer = MemoryOptimizer(self.config_manager)
47 | 
48 |     def tearDown(self):
49 |         """Clean up after tests"""
50 |         # Remove temp files and directory
51 |         if os.path.exists(self.config_file):
52 |             os.remove(self.config_file)
53 |         os.rmdir(self.temp_dir)
54 | 
55 | 
56 |     def test_memory_check(self):
"""测试内存检查""" 57 | # 模拟内存使用情况 58 | with patch('psutil.virtual_memory') as mock_memory: 59 | # 模拟内存充足的情况 60 | mock_memory.return_value = MagicMock( 61 | total=8 * 1024 * 1024 * 1024, # 8GB总内存 62 | available=4 * 1024 * 1024 * 1024 # 4GB可用内存 63 | ) 64 | 65 | # 验证内存检查通过 66 | self.assertTrue( 67 | self.memory_optimizer.check_memory_available( 68 | 1024 * 1024 * 1024 # 需要1GB内存 69 | ) 70 | ) 71 | 72 | # 模拟内存不足的情况 73 | mock_memory.return_value = MagicMock( 74 | total=8 * 1024 * 1024 * 1024, # 8GB总内存 75 | available=256 * 1024 * 1024 # 256MB可用内存 76 | ) 77 | 78 | # 验证内存检查失败 79 | self.assertFalse( 80 | self.memory_optimizer.check_memory_available( 81 | 1024 * 1024 * 1024 # 需要1GB内存 82 | ) 83 | ) 84 | 85 | def test_memory_cleanup(self): 86 | """测试内存清理""" 87 | # 创建一些大对象来占用内存 88 | large_objects = [] 89 | for _ in range(5): 90 | large_objects.append(bytearray(100 * 1024 * 1024)) # 每个100MB 91 | 92 | # 记录清理前的内存使用 93 | before_cleanup = psutil.Process().memory_info().rss 94 | 95 | # 执行内存清理 96 | self.memory_optimizer.cleanup() 97 | 98 | # 记录清理后的内存使用 99 | after_cleanup = psutil.Process().memory_info().rss 100 | 101 | # 验证内存使用减少 102 | self.assertLess(after_cleanup, before_cleanup) 103 | 104 | def test_memory_monitoring(self): 105 | """测试内存监控""" 106 | # 启动监控 107 | self.memory_optimizer.start_monitoring() 108 | 109 | # 验证监控线程已启动 110 | self.assertTrue(self.memory_optimizer.is_monitoring()) 111 | 112 | # 停止监控 113 | self.memory_optimizer.stop_monitoring() 114 | 115 | # 验证监控线程已停止 116 | self.assertFalse(self.memory_optimizer.is_monitoring()) 117 | 118 | def test_memory_limit_enforcement(self): 119 | """测试内存限制执行""" 120 | # 测试超出内存限制 121 | with self.assertRaises(MemoryError): 122 | # 尝试分配超过限制的内存 123 | self.memory_optimizer.allocate_memory( 124 | self.test_config['memory']['limit'] * 2 125 | ) 126 | 127 | def test_memory_stats(self): 128 | """测试内存统计""" 129 | # 获取内存统计信息 130 | stats = self.memory_optimizer.get_memory_stats() 131 | 132 | # 验证统计信息的完整性 133 | self.assertIn('total', stats) 134 | self.assertIn('available', stats) 135 | self.assertIn('used', stats) 136 | self.assertIn('free', stats) 137 | self.assertIn('percent', stats) 138 | 139 | def test_optimization_strategies(self): 140 | """测试优化策略""" 141 | # 测试不同的优化级别 142 | strategies = [ 143 | 'minimal', # 最小优化 144 | 'moderate', # 中等优化 145 | 'aggressive' # 激进优化 146 | ] 147 | 148 | for strategy in strategies: 149 | # 设置优化策略 150 | self.memory_optimizer.set_optimization_strategy(strategy) 151 | 152 | # 验证策略设置成功 153 | self.assertEqual( 154 | self.memory_optimizer.get_current_strategy(), 155 | strategy 156 | ) 157 | 158 | def test_memory_pressure_handling(self): 159 | """测试内存压力处理""" 160 | # 模拟高内存压力情况 161 | with patch('psutil.virtual_memory') as mock_memory: 162 | mock_memory.return_value = MagicMock( 163 | percent=95.0 # 95%内存使用率 164 | ) 165 | 166 | # 触发内存压力处理 167 | self.memory_optimizer.handle_memory_pressure() 168 | 169 | # 验证是否触发了清理操作 170 | self.assertTrue(self.memory_optimizer.cleanup_triggered) 171 | 172 | def test_concurrent_memory_operations(self): 173 | """测试并发内存操作""" 174 | import threading 175 | 176 | def memory_worker(): 177 | # 执行一些内存操作 178 | for _ in range(10): 179 | # 分配和释放内存 180 | data = bytearray(10 * 1024 * 1024) # 10MB 181 | self.memory_optimizer.track_allocation(len(data)) 182 | del data 183 | self.memory_optimizer.track_deallocation(10 * 1024 * 1024) 184 | 185 | # 创建多个线程 186 | threads = [threading.Thread(target=memory_worker) for _ in range(4)] 187 | 188 | # 启动所有线程 189 | for thread in threads: 190 | thread.start() 191 | 192 | # 等待所有线程完成 193 | for thread in 
/tests/core/test_parallel_manager.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import time
3 | from typing import List
4 | from core.parallel_manager import ParallelManager
5 | 
6 | class TestParallelManager(unittest.TestCase):
7 |     """ParallelManager unit tests"""
8 | 
9 |     def setUp(self):
10 |         """Set up test fixtures"""
11 |         self.manager = ParallelManager(max_workers=2)
12 | 
13 |     def tearDown(self):
14 |         """Clean up after tests"""
15 |         self.manager.close_all()
16 | 
17 |     def test_process_items_empty(self):
18 |         """Process an empty list"""
19 |         result = self.manager.process_items([], lambda x: x)
20 |         self.assertEqual(result, [])
21 | 
22 |     def test_process_items_single_chunk(self):
23 |         """Process a single chunk"""
24 |         def square_numbers(nums: List[int]) -> List[int]:
25 |             return [x * x for x in nums]
26 | 
27 |         items = [1, 2, 3, 4, 5]
28 |         result = self.manager.process_items(
29 |             items=items,
30 |             process_func=square_numbers,
31 |             chunk_size=5
32 |         )
33 |         self.assertEqual(result, [1, 4, 9, 16, 25])
34 | 
35 |     def test_process_items_multiple_chunks(self):
36 |         """Process multiple chunks"""
37 |         def sum_numbers(nums: List[int]) -> int:
38 |             return sum(nums)
39 | 
40 |         items = list(range(100))
41 |         result = self.manager.process_items(
42 |             items=items,
43 |             process_func=sum_numbers,
44 |             chunk_size=10
45 |         )
46 |         self.assertEqual(sum(result), sum(range(100)))
47 | 
48 |     def test_process_items_with_threads(self):
49 |         """Process using a thread pool"""
50 |         def slow_increment(nums: List[int]) -> List[int]:
51 |             time.sleep(0.1)  # Simulate a slow operation
52 |             return [x + 1 for x in nums]
53 | 
54 |         items = list(range(10))
55 |         result = self.manager.process_items(
56 |             items=items,
57 |             process_func=slow_increment,
58 |             use_threads=True,
59 |             chunk_size=2
60 |         )
61 |         self.assertEqual(result, [x + 1 for x in range(10)])
62 | 
63 |     def test_process_items_with_progress(self):
64 |         """Process with a progress callback"""
65 |         progress_updates = []
66 | 
67 |         def track_progress(current: int, total: int):
68 |             progress_updates.append((current, total))
69 | 
70 |         def double_numbers(nums: List[int]) -> List[int]:
71 |             return [x * 2 for x in nums]
72 | 
73 |         items = list(range(5))
74 |         result = self.manager.process_items_with_progress(
75 |             items=items,
76 |             process_func=double_numbers,
77 |             progress_callback=track_progress,
78 |             chunk_size=1
79 |         )
80 | 
81 |         self.assertEqual(result, [x * 2 for x in range(5)])
82 |         self.assertEqual(len(progress_updates), 5)
83 |         self.assertEqual(progress_updates[-1], (5, 5))
84 | 
85 |     def test_error_handling(self):
86 |         """Test error handling"""
87 |         def failing_func(nums: List[int]) -> List[int]:
88 |             raise ValueError("test error")
89 | 
90 |         items = list(range(5))
91 |         result = self.manager.process_items(
92 |             items=items,
93 |             process_func=failing_func
94 |         )
95 |         self.assertEqual(result, [])
96 | 
97 |     def test_pool_management(self):
98 |         """Test pool management"""
99 |         # Process pool creation and shutdown
100 |         self.manager.process_items(
101 |             items=[1, 2, 3],
102 |             process_func=lambda x: x,
103 |             pool_name="test_pool"
104 |         )
105 |         self.assertIn("test_pool", self.manager._process_pools)
106 | 
107 |         # Closing a specific pool
108 |         self.manager.close_pool("test_pool")
109 |         self.assertNotIn("test_pool", self.manager._process_pools)
110 | 
111 |         # Closing all pools
112 |         self.manager.process_items(
113 |             items=[1, 2, 3],
            process_func=lambda x: x,
115 |             pool_name="another_pool"
116 |         )
117 |         self.manager.close_all()
118 |         self.assertEqual(len(self.manager._process_pools), 0)
119 |         self.assertEqual(len(self.manager._thread_pools), 0)
120 | 
121 |     def test_large_data_processing(self):
122 |         """Test large-data processing"""
123 |         items = list(range(10000))
124 | 
125 |         def process_chunk(nums: List[int]) -> List[int]:
126 |             return [x * x for x in nums]
127 | 
128 |         result = self.manager.process_items(
129 |             items=items,
130 |             process_func=process_chunk,
131 |             chunk_size=100
132 |         )
133 | 
134 |         self.assertEqual(len(result), 10000)
135 |         self.assertEqual(result[0], 0)
136 |         self.assertEqual(result[-1], 9999 * 9999)
137 | 
138 |     def test_concurrent_processing(self):
139 |         """Test concurrent processing"""
140 |         start_time = time.time()
141 | 
142 |         def slow_process(nums: List[int]) -> List[int]:
143 |             time.sleep(0.1)  # Simulate a slow operation
144 |             return nums
145 | 
146 |         items = list(range(20))
147 |         self.manager.process_items(
148 |             items=items,
149 |             process_func=slow_process,
150 |             chunk_size=2
151 |         )
152 | 
153 |         duration = time.time() - start_time
154 |         # 10 chunks x 0.1s is about 1s serially; with 2 workers this should take roughly half that
155 |         self.assertLess(duration, 1.0)  # serial execution would need about 1 second
156 | 
157 | if __name__ == '__main__':
158 |     unittest.main()
--------------------------------------------------------------------------------
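The behaviour pinned down above — split items into chunks, run process_func per chunk in a process or thread pool, flatten list results, and turn worker errors into an empty result — can be sketched as follows. This is a simplified illustration of the pattern, not the repository's ParallelManager (which also manages named pools and progress callbacks); note that with a process pool, process_func must be picklable (a top-level function, not a lambda):

from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor

def process_items_sketch(items, process_func, max_workers=2,
                         chunk_size=10, use_threads=False):
    """Apply process_func to chunks of items in parallel and merge results."""
    if not items:
        return []
    chunks = [items[i:i + chunk_size] for i in range(0, len(items), chunk_size)]
    pool_cls = ThreadPoolExecutor if use_threads else ProcessPoolExecutor
    results = []
    try:
        with pool_cls(max_workers=max_workers) as pool:
            for chunk_result in pool.map(process_func, chunks):
                # Chunk functions may return a list (extended into the output)
                # or a scalar (appended), matching both test styles above.
                if isinstance(chunk_result, list):
                    results.extend(chunk_result)
                else:
                    results.append(chunk_result)
    except Exception:
        # Mirrors test_error_handling: a failing worker yields an empty result
        return []
    return results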
/tests/core/test_performance_monitor.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/tests/core/test_resource_manager.py:
--------------------------------------------------------------------------------
1 | """Resource manager test module
2 | 
3 | This module contains unit tests for the ResourceManager class.
4 | 
5 | Author: byRen2002
6 | Last modified: March 2025
7 | License: MIT License
8 | """
9 | 
10 | import unittest
11 | import os
12 | import tempfile
13 | import shutil, time, yaml
14 | from unittest.mock import patch, MagicMock
15 | 
16 | from core.resource_manager import ResourceManager
17 | from core.config_manager import ConfigManager
18 | 
19 | class TestResourceManager(unittest.TestCase):
20 |     """Test cases for the ResourceManager class"""
21 | 
22 |     def setUp(self):
23 |         """Set up test fixtures"""
24 |         # Create temporary directories
25 |         self.temp_dir = tempfile.mkdtemp()
26 |         self.test_repo_path = os.path.join(self.temp_dir, "repos")
27 |         self.test_cache_path = os.path.join(self.temp_dir, "cache")
28 | 
29 |         # Test configuration
30 |         self.config = {
31 |             "paths": {
32 |                 "repo": self.test_repo_path,
33 |                 "cache": self.test_cache_path
34 |             },
35 |             "limits": {
36 |                 "max_repo_size": 1024 * 1024 * 100,   # 100MB
37 |                 "max_cache_size": 1024 * 1024 * 500   # 500MB
38 |             }
39 |         }
40 | 
41 |         # Write the config file
42 |         self.config_file = os.path.join(self.temp_dir, "config.yaml")
43 |         with open(self.config_file, 'w') as f:
44 |             yaml.dump(self.config, f)
45 | 
46 |         # Instantiate ConfigManager
47 |         self.config_manager = ConfigManager(self.config_file)
48 | 
49 |         # Instantiate ResourceManager
50 |         self.resource_manager = ResourceManager(self.config_manager)
51 | 
52 |     def tearDown(self):
53 |         """Clean up after tests"""
54 |         # Remove the temp dir and its contents
55 |         shutil.rmtree(self.temp_dir)
56 | 
57 |     def test_init_directories(self):
58 |         """Test directory initialization"""
59 |         # Both directories should have been created
60 |         self.assertTrue(os.path.exists(self.test_repo_path))
61 |         self.assertTrue(os.path.exists(self.test_cache_path))
62 | 
63 |     def test_check_disk_space(self):
64 |         """Test the disk space check"""
65 |         # Simulate insufficient disk space
66 |         with patch('psutil.disk_usage') as mock_disk_usage:
67 |             mock_disk_usage.return_value = MagicMock(
68 |                 free=1024 * 1024  # 1MB free
69 |             )
70 | 
71 |             with self.assertRaises(RuntimeError):
72 |                 self.resource_manager.check_disk_space(
73 |                     self.test_repo_path,
74 |                     required_space=1024 * 1024 * 10  # require 10MB
75 |                 )
76 | 
77 |     def test_cleanup_old_files(self):
78 |         """Test cleanup of old files"""
79 |         # Create test files
80 |         test_files = []
81 |         for i in range(5):
82 |             file_path = os.path.join(self.test_cache_path, f"test_{i}.txt")
83 |             with open(file_path, 'w') as f:
84 |                 f.write("test data")
85 |             test_files.append(file_path)
86 | 
87 |         # Age the files' access times
88 |         for i, file_path in enumerate(test_files):
89 |             access_time = time.time() - (i + 1) * 86400  # i+1 days ago
90 |             os.utime(file_path, (access_time, access_time))
91 | 
92 |         # Remove files older than 3 days
93 |         self.resource_manager.cleanup_old_files(
94 |             self.test_cache_path,
95 |             days=3
96 |         )
97 | 
98 |         # Verify the result
99 |         remaining_files = os.listdir(self.test_cache_path)
100 |         self.assertEqual(len(remaining_files), 3)  # 3 files should remain
101 | 
102 |     def test_monitor_resource_usage(self):
103 |         """Test resource usage monitoring"""
104 |         # Create some files to consume space
105 |         for i in range(10):
106 |             file_path = os.path.join(self.test_cache_path, f"large_{i}.txt")
107 |             with open(file_path, 'wb') as f:
108 |                 f.write(b'0' * 1024 * 1024)  # write 1MB
109 | 
110 |         # Query resource usage
111 |         usage = self.resource_manager.get_resource_usage()
112 | 
113 |         # Check the returned structure
114 |         self.assertIn('disk_usage', usage)
115 |         self.assertIn('memory_usage', usage)
116 |         self.assertIn('cpu_usage', usage)
117 | 
118 |     def test_resource_limits(self):
119 |         """Test resource limits"""
120 |         # Exceeding the repository size limit should raise
121 |         large_data = b'0' * (self.config['limits']['max_repo_size'] + 1024)
122 | 
123 |         with self.assertRaises(ValueError):
124 |             self.resource_manager.check_size_limit(
125 |                 len(large_data),
126 |                 'repo'
127 |             )
128 | 
129 |     def test_file_operations(self):
130 |         """Test file operations"""
131 |         # File write
132 |         test_data = b"test content"
133 |         test_file = os.path.join(self.test_cache_path, "test.txt")
134 | 
135 |         self.resource_manager.write_file(test_file, test_data)
136 |         self.assertTrue(os.path.exists(test_file))
137 | 
138 |         # File read
139 |         read_data = self.resource_manager.read_file(test_file)
140 |         self.assertEqual(read_data, test_data)
141 | 
142 |         # File delete
143 |         self.resource_manager.delete_file(test_file)
144 |         self.assertFalse(os.path.exists(test_file))
145 | 
146 |     def test_path_validation(self):
147 |         """Test path validation"""
148 |         # Paths that must be rejected
149 |         invalid_paths = [
150 |             "../outside.txt",
151 |             "/absolute/path/file.txt",
152 |             "../../etc/passwd"
153 |         ]
154 | 
155 |         for path in invalid_paths:
156 |             with self.assertRaises(ValueError):
157 |                 self.resource_manager.validate_path(path)
158 | 
159 |     def test_concurrent_access(self):
160 |         """Test concurrent access"""
161 |         import threading
162 | 
163 |         def worker():
164 |             # Perform some file operations
165 |             for i in range(10):
166 |                 file_path = os.path.join(
167 |                     self.test_cache_path,
168 |                     f"thread_{threading.get_ident()}_{i}.txt"
169 |                 )
170 |                 self.resource_manager.write_file(file_path, b"test")
171 |                 self.resource_manager.read_file(file_path)
172 |                 self.resource_manager.delete_file(file_path)
173 | 
174 |         # Create several threads
175 |         threads = [threading.Thread(target=worker) for _ in range(4)]
176 | 
177 |         # Start all threads
178 |         for thread in threads:
179 |             thread.start()
180 | 
181 |         # Wait for all threads to finish
182 |         for thread in threads:
183 |             thread.join()
184 | 
185 |         # No files should be left behind
186 |         remaining_files = os.listdir(self.test_cache_path)
187 |         self.assertEqual(len(remaining_files), 0)
188 | 
189 | if __name__ == '__main__':
190 |     unittest.main()
--------------------------------------------------------------------------------
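test_path_validation above expects validate_path to raise ValueError for parent-directory escapes and absolute paths — the same class of inputs the Go security suite probes in TestPathTraversal. A sketch of such a guard (an assumed shape; the repository's implementation may differ):

import os

def validate_path(path: str, base_dir: str = ".") -> str:
    """Reject absolute paths and any path escaping base_dir; return the resolved path."""
    if not path or os.path.isabs(path):
        raise ValueError(f"absolute or empty path not allowed: {path!r}")
    base = os.path.realpath(base_dir)
    resolved = os.path.realpath(os.path.join(base, path))
    # realpath collapses ".." segments, so an escape is visible as a prefix mismatch
    if resolved != base and not resolved.startswith(base + os.sep):
        raise ValueError(f"path escapes base directory: {path!r}")
    return resolved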
/tests/detector/test_detector.py:
--------------------------------------------------------------------------------
1 | """Detector test module
2 | 
3 | This module contains unit tests for the Detector class.
4 | 
5 | Author: byRen2002
6 | Last modified: March 2025
7 | License: MIT License
8 | """
9 | 
10 | import unittest
11 | import os
12 | import tempfile
13 | import json
14 | import tlsh, yaml, shutil, time
15 | from unittest.mock import patch, MagicMock
16 | 
17 | from detector.Detector import Detector
18 | from core.config_manager import ConfigManager
19 | 
20 | class TestDetector(unittest.TestCase):
21 |     """Test cases for the Detector class"""
22 | 
23 |     def setUp(self):
24 |         """Set up test fixtures"""
25 |         # Create temporary test directories
26 |         self.temp_dir = tempfile.mkdtemp()
27 |         self.test_data_dir = os.path.join(self.temp_dir, "test_data")
28 |         os.makedirs(self.test_data_dir)
29 | 
30 |         # Test configuration
31 |         self.config_file = os.path.join(self.temp_dir, "config.yaml")
32 |         self.test_config = {
33 |             "paths": {
34 |                 "repo": os.path.join(self.test_data_dir, "repos"),
35 |                 "results": os.path.join(self.test_data_dir, "results"),
36 |                 "components": os.path.join(self.test_data_dir, "components"),
37 |                 "logs": os.path.join(self.test_data_dir, "logs")
38 |             },
39 |             "detection": {
40 |                 "tlsh_threshold": 30,
41 |                 "similarity_threshold": 0.8,
42 |                 "min_component_size": 100,
43 |                 "max_workers": 4
44 |             },
45 |             "logging": {
46 |                 "level": "INFO",
47 |                 "file": "detector.log"
48 |             }
49 |         }
50 | 
51 |         with open(self.config_file, 'w') as f:
52 |             yaml.dump(self.test_config, f)
53 | 
54 |         # Create required directories
55 |         for path in self.test_config["paths"].values():
56 |             os.makedirs(path, exist_ok=True)
57 | 
58 |         # Generate test data
59 |         self._create_test_data()
60 | 
61 |         # Instantiate the Detector
62 |         self.config_manager = ConfigManager(self.config_file)
63 |         self.detector = Detector(self.config_manager)
64 | 
65 |     def tearDown(self):
66 |         """Clean up after tests"""
67 |         # Remove the temp dir and its contents
68 |         shutil.rmtree(self.temp_dir)
69 | 
70 |     def _create_test_data(self):
71 |         """Create test data"""
72 |         # Component database
73 |         component_db = os.path.join(
74 |             self.test_config["paths"]["components"],
75 |             "test_component.json"
76 |         )
77 | 
78 |         test_functions = {
79 |             tlsh.hash(b"function1"): {
80 |                 "name": "test_func1",
81 |                 "file": "test1.py",
82 |                 "component": "component1"
83 |             },
84 |             tlsh.hash(b"function2"): {
85 |                 "name": "test_func2",
86 |                 "file": "test2.py",
87 |                 "component": "component2"
88 |             }
89 |         }
90 | 
91 |         with open(component_db, 'w') as f:
92 |             json.dump(test_functions, f)
93 | 
94 |         # Test source file
95 |         test_code = os.path.join(
96 |             self.test_config["paths"]["repo"],
97 |             "test_code.py"
98 |         )
99 | 
100 |         with open(test_code, 'w') as f:
101 |             f.write("def test_func1():\n    return 'test1'\n\n")
102 |             f.write("def test_func2():\n    return 'test2'\n")
103 | 
104 |     def test_initialization(self):
105 |         """Test initialization"""
106 |         # Config should be loaded
107 |         self.assertIsNotNone(self.detector.config)
108 | 
109 |         # Logging should be set up
110 |         self.assertTrue(os.path.exists(
111 |             os.path.join(self.test_config["paths"]["logs"], "detector.log")
112 |         ))
113 | 
114 |     def test_tlsh_computation(self):
115 |         """Test TLSH computation"""
116 |         # Hash a small test function
117 |         code = "def test_function():\n    return 'test'\n"
118 |         hash_value = self.detector.compute_tlsh(code)
119 | 
120 |         # The digest should be a non-empty string
121 |         self.assertIsInstance(hash_value, str)
122 |         self.assertGreater(len(hash_value), 0)
123 | 
124 |     def test_component_detection(self):
125 |         """Test component detection"""
126 |         # Run detection
127 |         results = self.detector.detect(
128 |             os.path.join(self.test_config["paths"]["repo"], "test_code.py")
129 |         )
130 | 
131 |         # Verify the detection results
132 |         self.assertIsInstance(results, dict)
133 |         self.assertIn("matches", results)
134 |         self.assertIn("statistics", results)
135 | 
136 |     def test_similarity_calculation(self):
137 |         """Test similarity calculation"""
138 |         # TLSH distance between two similar functions
139 |         code1 = "def test_function():\n    return 'test1'\n"
return 'test2'\n" 141 | 142 | hash1 = self.detector.compute_tlsh(code1) 143 | hash2 = self.detector.compute_tlsh(code2) 144 | 145 | diff = self.detector.compute_tlsh_diff(hash1, hash2) 146 | 147 | # 验证差异值在合理范围内 148 | self.assertIsInstance(diff, int) 149 | self.assertGreaterEqual(diff, 0) 150 | self.assertLessEqual(diff, 1000) 151 | 152 | def test_parallel_processing(self): 153 | """测试并行处理""" 154 | # 创建多个测试文件 155 | for i in range(10): 156 | test_file = os.path.join( 157 | self.test_config["paths"]["repo"], 158 | f"test_code_{i}.py" 159 | ) 160 | with open(test_file, 'w') as f: 161 | f.write(f"def test_func_{i}():\n return 'test{i}'\n") 162 | 163 | # 执行并行检测 164 | results = self.detector.detect_batch( 165 | self.test_config["paths"]["repo"] 166 | ) 167 | 168 | # 验证结果 169 | self.assertEqual(len(results), 10) 170 | 171 | def test_cache_mechanism(self): 172 | """测试缓存机制""" 173 | # 第一次检测 174 | file_path = os.path.join( 175 | self.test_config["paths"]["repo"], 176 | "test_code.py" 177 | ) 178 | 179 | start_time = time.time() 180 | first_result = self.detector.detect(file_path) 181 | first_time = time.time() - start_time 182 | 183 | # 第二次检测(应该使用缓存) 184 | start_time = time.time() 185 | second_result = self.detector.detect(file_path) 186 | second_time = time.time() - start_time 187 | 188 | # 验证结果一致性和性能提升 189 | self.assertEqual(first_result, second_result) 190 | self.assertLess(second_time, first_time) 191 | 192 | def test_error_handling(self): 193 | """测试错误处理""" 194 | # 测试不存在的文件 195 | with self.assertRaises(FileNotFoundError): 196 | self.detector.detect("nonexistent_file.py") 197 | 198 | # 测试无效的组件数据库 199 | with open(os.path.join( 200 | self.test_config["paths"]["components"], 201 | "invalid.json" 202 | ), 'w') as f: 203 | f.write("invalid json") 204 | 205 | with self.assertRaises(json.JSONDecodeError): 206 | self.detector.load_component_db("invalid.json") 207 | 208 | def test_memory_management(self): 209 | """测试内存管理""" 210 | import psutil 211 | process = psutil.Process() 212 | 213 | # 记录初始内存使用 214 | initial_memory = process.memory_info().rss 215 | 216 | # 处理大量数据 217 | for i in range(100): 218 | test_file = os.path.join( 219 | self.test_config["paths"]["repo"], 220 | f"large_test_{i}.py" 221 | ) 222 | with open(test_file, 'w') as f: 223 | for j in range(1000): 224 | f.write(f"def test_func_{i}_{j}():\n return 'test'\n") 225 | 226 | self.detector.detect_batch(self.test_config["paths"]["repo"]) 227 | 228 | # 记录最终内存使用 229 | final_memory = process.memory_info().rss 230 | 231 | # 验证内存增长在合理范围内 232 | memory_growth = (final_memory - initial_memory) / (1024 * 1024) # MB 233 | self.assertLess(memory_growth, 1000) # 内存增长应小于1GB 234 | 235 | def test_performance_monitoring(self): 236 | """测试性能监控""" 237 | # 启用性能监控 238 | self.detector.enable_performance_monitoring() 239 | 240 | # 执行一些操作 241 | self.detector.detect( 242 | os.path.join(self.test_config["paths"]["repo"], "test_code.py") 243 | ) 244 | 245 | # 获取性能统计 246 | stats = self.detector.get_performance_stats() 247 | 248 | # 验证统计信息 249 | self.assertIn("processing_time", stats) 250 | self.assertIn("memory_usage", stats) 251 | self.assertIn("cpu_usage", stats) 252 | 253 | def test_result_export(self): 254 | """测试结果导出""" 255 | # 执行检测 256 | results = self.detector.detect( 257 | os.path.join(self.test_config["paths"]["repo"], "test_code.py") 258 | ) 259 | 260 | # 导出结果 261 | export_file = os.path.join( 262 | self.test_config["paths"]["results"], 263 | "test_results.json" 264 | ) 265 | self.detector.export_results(results, export_file) 266 | 267 | # 验证导出文件 268 | 
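The TLSH workflow these tests exercise is small enough to show end to end. A short example using the py-tlsh API that the test file itself imports — tlsh.hash over bytes, tlsh.diff between two digests. Note that TLSH needs a reasonable amount of input (roughly 50+ bytes with enough variation) to produce a digest; too-short or too-uniform input yields the placeholder value "TNULL":

import tlsh

code1 = b"def add(a, b):\n    total = a + b\n    print('sum', total)\n    return total\n"
code2 = b"def add(x, y):\n    total = x + y\n    print('sum', total)\n    return total\n"

h1 = tlsh.hash(code1)
h2 = tlsh.hash(code2)

# Smaller distance means more similar; detection applies a threshold to this
# value (e.g. the tlsh_threshold of 30 in the test config above).
distance = tlsh.diff(h1, h2)
print(h1, h2, distance)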
/tests/detector/test_version_predictor.py:
--------------------------------------------------------------------------------
1 | """Version predictor test module
2 | 
3 | This module implements test cases for the version predictor.
4 | 
5 | Author: byRen2002
6 | Last modified: March 2025
7 | License: MIT License
8 | """
9 | 
10 | import unittest
11 | from datetime import datetime, timedelta
12 | import numpy as np
13 | from detector.version_predictor import VersionPredictor
14 | 
15 | class TestVersionPredictor(unittest.TestCase):
16 |     """Version predictor tests"""
17 | 
18 |     def setUp(self):
19 |         """Set up test fixtures"""
20 |         self.predictor = VersionPredictor()
21 | 
22 |         # Generate test data
23 |         self.training_data = self._generate_test_data()
24 |         self.version_dates = self._generate_version_dates()
25 | 
26 |     def _generate_test_data(self):
27 |         """Generate test samples"""
28 |         return [
29 |             {
30 |                 'lines_added': 100,
31 |                 'lines_deleted': 50,
32 |                 'files_changed': 5,
33 |                 'commit_frequency': 10,
34 |                 'author_experience': 100,
35 |                 'commit_time': datetime.now() - timedelta(days=30),
36 |                 'content': 'def test_function():\n    pass'
37 |             },
38 |             {
39 |                 'lines_added': 200,
40 |                 'lines_deleted': 100,
41 |                 'files_changed': 10,
42 |                 'commit_frequency': 15,
43 |                 'author_experience': 150,
44 |                 'commit_time': datetime.now() - timedelta(days=20),
45 |                 'content': 'class TestClass:\n    def method(self):\n        pass'
46 |             }
47 |         ]
48 | 
49 |     def _generate_version_dates(self):
50 |         """Generate version dates"""
51 |         base_date = datetime.now() - timedelta(days=60)
52 |         return [
53 |             base_date + timedelta(days=i*15)
54 |             for i in range(5)
55 |         ]
56 | 
57 |     def test_initialization(self):
58 |         """Test initialization"""
59 |         self.assertIsNotNone(self.predictor)
60 |         self.assertIsNotNone(self.predictor.models)
61 |         self.assertIsNotNone(self.predictor.scaler)
62 | 
63 |     def test_feature_extraction(self):
64 |         """Test feature extraction"""
65 |         features = self.predictor._extract_features(self.training_data)
66 | 
67 |         self.assertIsInstance(features, np.ndarray)
68 |         self.assertEqual(len(features), len(self.training_data))
69 | 
70 |     def test_time_feature_extraction(self):
71 |         """Test temporal feature extraction"""
72 |         time_features = self.predictor._extract_time_features(
73 |             self.training_data[0]
74 |         )
75 | 
76 |         self.assertIsInstance(time_features, list)
77 |         self.assertEqual(len(time_features), 4)  # four temporal features
78 | 
79 |     def test_time_interval_computation(self):
80 |         """Test time-interval computation"""
81 |         intervals = self.predictor._compute_time_intervals(
82 |             self.version_dates
83 |         )
84 | 
85 |         self.assertIsInstance(intervals, np.ndarray)
86 |         self.assertEqual(len(intervals), len(self.version_dates) - 1)
87 | 
88 |     def test_model_training(self):
89 |         """Test model training"""
90 |         self.predictor.train(
91 |             self.training_data,
92 |             self.version_dates
93 |         )
94 | 
95 |         # Every model should now be fitted
96 |         for model in self.predictor.models.values():
97 |             self.assertTrue(hasattr(model, 'predict'))
98 | 
99 |     def test_version_prediction(self):
100 |         """Test version prediction"""
101 |         # Train first
102 |         self.predictor.train(
103 |             self.training_data,
104 |             self.version_dates
105 |         )
106 | 
107 |         # Predict
108 |         prediction = self.predictor.predict(self.training_data)
109 | 
110 |         self.assertIsInstance(prediction, dict)
111 |         self.assertIn('predicted_interval', prediction)
112 |         self.assertIn('confidence_interval', prediction)
113 |         self.assertIn('model_contributions', prediction)
114 | 
115 |     def 
test_model_update(self):
116 |         """Test model updating"""
117 |         # Train first
118 |         self.predictor.train(
119 |             self.training_data,
120 |             self.version_dates
121 |         )
122 | 
123 |         # Prepare new data
124 |         new_data = [{
125 |             'lines_added': 150,
126 |             'lines_deleted': 75,
127 |             'files_changed': 7,
128 |             'commit_frequency': 12,
129 |             'author_experience': 120,
130 |             'commit_time': datetime.now() - timedelta(days=10),
131 |             'content': 'def new_function():\n    return True'
132 |         }]
133 | 
134 |         new_date = datetime.now()
135 | 
136 |         # Update the model
137 |         self.predictor.update(new_data, new_date)
138 | 
139 |         # Prediction should still work after the update
140 |         prediction = self.predictor.predict(new_data)
141 |         self.assertIsInstance(prediction, dict)
142 | 
143 |     def test_model_evaluation(self):
144 |         """Test model evaluation"""
145 |         # Train first
146 |         self.predictor.train(
147 |             self.training_data,
148 |             self.version_dates
149 |         )
150 | 
151 |         # Prepare evaluation data
152 |         test_data = self._generate_test_data()
153 |         test_dates = [
154 |             datetime.now() + timedelta(days=i*15)
155 |             for i in range(3)
156 |         ]
157 | 
158 |         # Evaluate
159 |         metrics = self.predictor.evaluate(test_data, test_dates)
160 | 
161 |         self.assertIsInstance(metrics, dict)
162 |         for model_metrics in metrics.values():
163 |             self.assertIn('mse', model_metrics)
164 |             self.assertIn('rmse', model_metrics)
165 |             self.assertIn('r2', model_metrics)
166 | 
167 |     def test_confidence_interval(self):
168 |         """Test confidence-interval computation"""
169 |         X = np.array([[1, 2, 3], [4, 5, 6]])
170 |         predictions = [10, 12, 15]
171 | 
172 |         interval = self.predictor._compute_confidence_interval(
173 |             X,
174 |             predictions
175 |         )
176 | 
177 |         self.assertIsInstance(interval, tuple)
178 |         self.assertEqual(len(interval), 2)
179 |         self.assertLess(interval[0], interval[1])
180 | 
181 |     def test_error_handling(self):
182 |         """Test error handling"""
183 |         # Empty input
184 |         empty_prediction = self.predictor.predict([])
185 |         self.assertEqual(empty_prediction, {})
186 | 
187 |         # Invalid input
188 |         invalid_data = [{'invalid_key': 'value'}]
189 |         features = self.predictor._extract_features(invalid_data)
190 |         self.assertEqual(len(features), 1)
191 | 
192 |         # Invalid dates
193 |         invalid_dates = []
194 |         intervals = self.predictor._compute_time_intervals(invalid_dates)
195 |         self.assertEqual(len(intervals), 0)
196 | 
197 | if __name__ == '__main__':
198 |     unittest.main()
--------------------------------------------------------------------------------
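test_confidence_interval above only pins down the shape of _compute_confidence_interval: a (low, high) tuple with low < high. A common construction consistent with that, sketched with a normal approximation over the per-model predictions (the predictor's actual formula may differ):

import numpy as np

def compute_confidence_interval(predictions, z: float = 1.96):
    """Normal-approximation interval around the mean of model predictions."""
    preds = np.asarray(predictions, dtype=float)
    mean = preds.mean()
    # z = 1.96 corresponds to a two-sided 95% interval
    margin = z * preds.std(ddof=1) / np.sqrt(len(preds))
    return (mean - margin, mean + margin)

# e.g. compute_confidence_interval([10, 12, 15]) -> roughly (9.5, 15.2)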
/tests/integration/test_clone_detection.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import os
3 | import tempfile
4 | import shutil
5 | import json
6 | from preprocessor.preprocessor import Preprocessor
7 | from detector.Detector import Detector
8 | 
9 | class TestCloneDetection(unittest.TestCase):
10 |     """Clone detection integration tests"""
11 | 
12 |     @classmethod
13 |     def setUpClass(cls):
14 |         """Initialize the test class"""
15 |         # Create a temporary working directory
16 |         cls.work_dir = tempfile.mkdtemp()
17 | 
18 |         # Build the test project layout
19 |         cls._create_test_project()
20 | 
21 |     @classmethod
22 |     def tearDownClass(cls):
23 |         """Clean up the test class"""
24 |         shutil.rmtree(cls.work_dir)
25 | 
26 |     @classmethod
27 |     def _create_test_project(cls):
28 |         """Create the test project"""
29 |         # Directory layout
30 |         dirs = [
31 |             "input/project1/src",
32 |             "input/project2/src",
33 |             "preprocessor/result",
34 |             "preprocessor/initialSigs",
35 |             "preprocessor/componentDB",
36 |             "preprocessor/metaInfos",
37 |             "detector/result"
38 |         ]
39 | 
40 |         for dir_path in dirs:
41 |             os.makedirs(os.path.join(cls.work_dir, dir_path))
42 | 
43 |         # Source files
44 |         cls._create_test_files()
45 | 
46 |     @classmethod
47 |     def _create_test_files(cls):
48 |         """Create test files"""
49 |         # Source file for project 1
50 |         project1_file = os.path.join(cls.work_dir, "input/project1/src/main.cpp")
51 |         with open(project1_file, 'w') as f:
52 |             f.write("""
53 | int add(int a, int b) {
54 |     return a + b;
55 | }
56 | 
57 | int subtract(int a, int b) {
58 |     return a - b;
59 | }
60 | 
61 | int main() {
62 |     int x = 10, y = 5;
63 |     printf("%d\\n", add(x, y));
64 |     printf("%d\\n", subtract(x, y));
65 |     return 0;
66 | }
67 | """)
68 | 
69 |         # Source file for project 2 (contains cloned code)
70 |         project2_file = os.path.join(cls.work_dir, "input/project2/src/calculator.cpp")
71 |         with open(project2_file, 'w') as f:
72 |             f.write("""
73 | // Cloned add function
74 | int add(int a, int b) {
75 |     return a + b;
76 | }
77 | 
78 | // Modified subtract function
79 | int subtract(int x, int y) {
80 |     int result = x - y;
81 |     return result;
82 | }
83 | 
84 | // New multiply function
85 | int multiply(int a, int b) {
86 |     return a * b;
87 | }
88 | 
89 | int main() {
90 |     int a = 20, b = 10;
91 |     printf("%d\\n", add(a, b));
92 |     printf("%d\\n", subtract(a, b));
93 |     printf("%d\\n", multiply(a, b));
94 |     return 0;
95 | }
96 | """)
97 | 
98 |     def setUp(self):
99 |         """Prepare each test"""
100 |         # Initialize preprocessor and detector
101 |         self.preprocessor = Preprocessor()
102 |         self.detector = Detector()
103 | 
104 |         # Point both at the working directory
105 |         self.preprocessor.config.set_base_path(self.work_dir)
106 |         self.detector.base_path = self.work_dir
107 | 
108 |     def test_end_to_end_clone_detection(self):
109 |         """End-to-end clone detection"""
110 |         try:
111 |             # 1. Run preprocessing
112 |             self.preprocessor.run()
113 | 
114 |             # Verify the preprocessing output
115 |             self._verify_preprocessing()
116 | 
117 |             # 2. Run clone detection
118 |             self.detector.detect(
119 |                 os.path.join(self.work_dir, "input/project2"),
120 |                 "project2"
121 |             )
122 | 
123 |             # Verify the detection output
124 |             self._verify_detection()
125 | 
126 |         except Exception as e:
127 |             self.fail(f"End-to-end test failed: {str(e)}")
128 | 
129 |     def _verify_preprocessing(self):
130 |         """Verify the preprocessing output"""
131 |         # Initial signatures
132 |         initial_sigs_file = os.path.join(
133 |             self.work_dir,
134 |             "preprocessor/initialSigs/initialSigs.json"
135 |         )
136 |         self.assertTrue(os.path.exists(initial_sigs_file))
137 | 
138 |         with open(initial_sigs_file, 'r') as f:
139 |             sigs = json.load(f)
140 |             self.assertGreater(len(sigs), 0)
141 | 
142 |         # Component database
143 |         comp_db_dir = os.path.join(self.work_dir, "preprocessor/componentDB")
144 |         self.assertTrue(os.path.exists(comp_db_dir))
145 |         self.assertGreater(len(os.listdir(comp_db_dir)), 0)
146 | 
147 |     def _verify_detection(self):
148 |         """Verify the detection output"""
149 |         # Result file
150 |         result_file = os.path.join(
151 |             self.work_dir,
152 |             "detector/result/result_project2"
153 |         )
154 |         self.assertTrue(os.path.exists(result_file))
155 | 
156 |         with open(result_file, 'r') as f:
157 |             results = f.readlines()
158 |             self.assertGreater(len(results), 0)
159 | 
160 |             # Parse the results
161 |             for result in results:
162 |                 parts = result.strip().split('\t')
163 |                 self.assertEqual(len(parts), 7)  # seven tab-separated fields
164 | 
165 |                 # Field checks
166 |                 project, repo, version, used, unused, modified, str_change = parts
167 |                 self.assertEqual(project, "project2")
168 |                 self.assertGreater(int(used), 0)  # at least one used function expected
169 | 
170 |     def test_incremental_detection(self):
171 |         """Incremental detection"""
172 |         # First pass
173 |         self.preprocessor.run()
174 |         self.detector.detect(
175 |             os.path.join(self.work_dir, "input/project2"),
176 |             "project2"
177 |         )
178 | 
179 |         # Modify the source
180 |         project2_file = os.path.join(self.work_dir, "input/project2/src/calculator.cpp")
181 |         with open(project2_file, 'a') as f:
182 |             f.write("""
183 | // Newly added divide function
184 | float divide(int a, int b) {
185 |     return a / (float)b;
186 | }
187 | """)
188 | 
189 |         # Second pass
190 |         self.preprocessor.run()
        self.detector.detect(
192 |             os.path.join(self.work_dir, "input/project2"),
193 |             "project2"
194 |         )
195 | 
196 |         # Verify that the results changed
197 |         result_file = os.path.join(
198 |             self.work_dir,
199 |             "detector/result/result_project2"
200 |         )
201 |         with open(result_file, 'r') as f:
202 |             results = f.readlines()
203 |             last_result = results[-1].strip().split('\t')
204 |             self.assertGreater(int(last_result[3]), 0)  # used
205 |             self.assertGreater(int(last_result[4]), 0)  # unused
206 | 
207 |     def test_error_conditions(self):
208 |         """Error conditions"""
209 |         # Empty project
210 |         empty_dir = os.path.join(self.work_dir, "input/empty_project")
211 |         os.makedirs(empty_dir)
212 | 
213 |         try:
214 |             self.detector.detect(empty_dir, "empty_project")
215 |         except Exception as e:
216 |             self.fail(f"Empty project handling failed: {str(e)}")
217 | 
218 |         # Invalid file
219 |         invalid_dir = os.path.join(self.work_dir, "input/invalid_project")
220 |         os.makedirs(invalid_dir)
221 |         with open(os.path.join(invalid_dir, "invalid.cpp"), 'w') as f:
222 |             f.write("This is not valid C++ code")
223 | 
224 |         try:
225 |             self.detector.detect(invalid_dir, "invalid_project")
226 |         except Exception as e:
227 |             self.fail(f"Invalid file handling failed: {str(e)}")
228 | 
229 |     def test_performance(self):
230 |         """Performance test"""
231 |         import time
232 | 
233 |         # Build a large test project
234 |         large_project_dir = os.path.join(self.work_dir, "input/large_project/src")
235 |         os.makedirs(large_project_dir)
236 | 
237 |         # Generate many source files
238 |         for i in range(100):
239 |             with open(os.path.join(large_project_dir, f"file{i}.cpp"), 'w') as f:
240 |                 f.write(f"""
241 | int func{i}(int x) {{
242 |     return x * {i};
243 | }}
244 | """)
245 | 
246 |         # Measure processing time
247 |         start_time = time.time()
248 | 
249 |         self.preprocessor.run()
250 |         self.detector.detect(
251 |             os.path.join(self.work_dir, "input/large_project"),
252 |             "large_project"
253 |         )
254 | 
255 |         duration = time.time() - start_time
256 | 
257 |         # Should finish within 60 seconds
258 |         self.assertLess(duration, 60)
259 | 
260 | if __name__ == '__main__':
261 |     unittest.main()
--------------------------------------------------------------------------------
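_verify_detection above fixes the detector's result format: one tab-separated line per detection with seven fields. A small illustration of consuming that format, with field names taken from the unpacking in the test (the example input values are made up):

def parse_result_line(line: str) -> dict:
    """Parse one detector result line into named fields."""
    project, repo, version, used, unused, modified, str_change = line.strip().split("\t")
    return {
        "project": project,
        "repo": repo,
        "version": version,
        "used": int(used),        # functions reused unchanged
        "unused": int(unused),    # functions from the component not present
        "modified": int(modified),
        "str_change": str_change,
    }

# e.g. parse_result_line("project2\tgoogletest\tv1.0\t3\t1\t1\t0.12")["used"] == 3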
/tests/preprocessor/test_java_processor.py:
--------------------------------------------------------------------------------
1 | """Java processor test module
2 | 
3 | This module contains unit tests for the JavaProcessor class.
4 | 
5 | Author: byRen2002
6 | Last modified: March 2025
7 | License: MIT License
8 | """
9 | 
10 | import unittest
11 | import os
12 | import tempfile
13 | import shutil
14 | from preprocessor.language_processors.java_processor import JavaProcessor
15 | 
16 | class TestJavaProcessor(unittest.TestCase):
17 |     """Test cases for the JavaProcessor class"""
18 | 
19 |     def setUp(self):
20 |         """Set up test fixtures"""
21 |         self.processor = JavaProcessor()
22 |         self.temp_dir = tempfile.mkdtemp()
23 | 
24 |         # Create a test Java file
25 |         self.test_file = os.path.join(self.temp_dir, "TestClass.java")
26 |         self._create_test_file()
27 | 
28 |     def tearDown(self):
29 |         """Clean up after tests"""
30 |         shutil.rmtree(self.temp_dir)
31 | 
32 |     def _create_test_file(self):
33 |         """Create the test Java file"""
34 |         test_code = '''
35 | package com.example.test;
36 | 
37 | import java.util.List;
38 | import java.util.ArrayList;
39 | 
40 | public class TestClass extends BaseClass implements TestInterface {
41 |     private String name;
42 |     private int age;
43 | 
44 |     public TestClass(String name, int age) {
45 |         this.name = name;
46 |         this.age = age;
47 |     }
48 | 
49 |     public String getName() {
50 |         return name;
51 |     }
52 | 
53 |     public void setName(String name) {
54 |         this.name = name;
55 |     }
56 | 
57 |     public int calculateComplexity(int n) {
58 |         int result = 0;
59 |         if (n > 0) {
60 |             for (int i = 0; i < n; i++) {
61 |                 if (i % 2 == 0) {
62 |                     result += i;
63 |                 } else {
64 |                     result -= i;
65 |                 }
66 |                 while (result > 100) {
67 |                     result /= 2;
68 |                 }
69 |             }
70 |         }
71 |         return result;
72 |     }
73 | 
74 |     private List<String> processItems(List<String> items) {
75 |         List<String> results = new ArrayList<>();
76 |         for (String item : items) {
77 |             if (item != null && !item.isEmpty()) {
78 |                 results.add(item.toUpperCase());
79 |             }
80 |         }
81 |         return results;
82 |     }
83 | }
84 | '''
85 |         with open(self.test_file, 'w', encoding='utf-8') as f:
86 |             f.write(test_code)
87 | 
88 |     def test_extract_methods(self):
89 |         """Test method extraction"""
90 |         methods = self.processor.extract_methods(self.test_file)
91 | 
92 |         # Method count
93 |         self.assertEqual(len(methods), 5)  # constructor + 4 methods
94 | 
95 |         # Method names
96 |         method_names = [m['name'] for m in methods]
97 |         expected_names = [
98 |             'TestClass',  # constructor
99 |             'getName',
100 |             'setName',
101 |             'calculateComplexity',
102 |             'processItems'
103 |         ]
104 |         self.assertEqual(sorted(method_names), sorted(expected_names))
105 | 
106 |         # Per-method attributes
107 |         for method in methods:
108 |             self.assertIn('name', method)
109 |             self.assertIn('content', method)
110 |             self.assertIn('start_line', method)
111 |             self.assertIn('modifiers', method)
112 |             self.assertIn('return_type', method)
113 |             self.assertIn('parameters', method)
114 | 
115 |     def test_method_content(self):
116 |         """Test method body extraction"""
117 |         methods = self.processor.extract_methods(self.test_file)
118 | 
119 |         # Locate the calculateComplexity method
120 |         complex_method = next(
121 |             m for m in methods if m['name'] == 'calculateComplexity'
122 |         )
123 | 
124 |         # Verify the body
125 |         self.assertIn('if (n > 0)', complex_method['content'])
126 |         self.assertIn('for (int i = 0', complex_method['content'])
127 |         self.assertIn('while (result > 100)', complex_method['content'])
128 | 
129 |     def test_return_type(self):
130 |         """Test return-type extraction"""
131 |         methods = self.processor.extract_methods(self.test_file)
132 | 
133 |         # Various return types
134 |         return_types = {m['name']: m['return_type'] for m in methods}
135 |         self.assertEqual(return_types['getName'], 'String')
136 |         self.assertEqual(return_types['setName'], 'void')
137 |         self.assertEqual(return_types['calculateComplexity'], 'int')
138 | 
139 |     def test_parameters(self):
140 |         """Test parameter extraction"""
141 |         methods = self.processor.extract_methods(self.test_file)
142 | 
143 |         # Constructor parameters
144 |         constructor = next(m for m in methods if m['name'] == 'TestClass')
145 |         self.assertEqual(len(constructor['parameters']), 2)
146 |         self.assertEqual(constructor['parameters'][0]['type'], 'String')
147 |         self.assertEqual(constructor['parameters'][0]['name'], 'name')
148 |         self.assertEqual(constructor['parameters'][1]['type'], 'int')
149 |         self.assertEqual(constructor['parameters'][1]['name'], 'age')
150 | 
151 |     def test_complexity_analysis(self):
152 |         """Test complexity analysis"""
153 |         methods = self.processor.extract_methods(self.test_file)
154 | 
155 |         # Analyze the complexity of calculateComplexity
156 |         complex_method = next(
157 |             m for m in methods if m['name'] == 'calculateComplexity'
158 |         )
159 |         metrics = self.processor.analyze_complexity(complex_method['content'])
160 | 
161 |         # Verify the metrics
162 |         self.assertGreater(metrics['cyclomatic_complexity'], 1)
163 |         self.assertGreater(metrics['cognitive_complexity'], 0)
164 |         self.assertGreater(metrics['nesting_depth'], 1)
165 | 
166 |     def test_class_info(self):
167 |         """Test class-info extraction"""
168 |         class_info = self.processor.extract_class_info(self.test_file)
169 | 
170 |         # Basic info
171 |         self.assertEqual(class_info['name'], 'TestClass')
172 |         self.assertEqual(class_info['package'], 'com.example.test')
173 | 
174 |         # Inheritance and implemented interfaces

    def test_class_info(self):
        """Test class info extraction"""
        class_info = self.processor.extract_class_info(self.test_file)

        # Verify the basic information
        self.assertEqual(class_info['name'], 'TestClass')
        self.assertEqual(class_info['package'], 'com.example.test')

        # Verify inheritance and implementation
        self.assertEqual(class_info['extends'], 'BaseClass')
        self.assertIn('TestInterface', class_info['implements'])

        # Verify the imports
        self.assertIn('java.util.List', class_info['imports'])
        self.assertIn('java.util.ArrayList', class_info['imports'])

    def test_method_signature(self):
        """Test method signature generation"""
        methods = self.processor.extract_methods(self.test_file)

        # Verify the signatures of the extracted methods
        for method in methods:
            signature = self.processor.get_method_signature(method)
            self.assertIsInstance(signature, str)
            self.assertGreater(len(signature), 0)

            if method['name'] == 'calculateComplexity':
                self.assertIn('public int calculateComplexity(int n)', signature)

    def test_code_normalization(self):
        """Test code normalization"""
        test_code = '''
        public void testMethod() {
            // This is a comment
            String name = "test"; /* Another comment */
            if (name.equals("test")) {
                System.out.println("Hello");
            }
        }
        '''

        normalized = self.processor.normalize_code(test_code)

        # Verify the normalization result
        self.assertNotIn('//', normalized)   # line comments removed
        self.assertNotIn('/*', normalized)   # block comments removed
        self.assertNotIn('  ', normalized)   # redundant whitespace collapsed
        self.assertEqual(normalized.count('"'), 2)  # string literals normalized
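
    # For reference, a normalizer covering the first three assertions above
    # could be a few regex passes. A minimal sketch, assuming string literals
    # are collapsed to an empty placeholder; the real normalize_code evidently
    # goes further, since the suite also pins the total quote count:
    @staticmethod
    def _normalize_sketch(code):
        import re
        code = re.sub(r'/\*.*?\*/', '', code, flags=re.DOTALL)  # block comments
        code = re.sub(r'//[^\n]*', '', code)                    # line comments
        code = re.sub(r'"(?:\\.|[^"\\])*"', '""', code)         # literals -> ""
        return re.sub(r'\s+', ' ', code).strip()                # collapse spaces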

    def test_error_handling(self):
        """Test error handling"""
        # A nonexistent file yields no methods
        methods = self.processor.extract_methods("nonexistent.java")
        self.assertEqual(len(methods), 0)

        # Invalid Java code yields no methods
        invalid_file = os.path.join(self.temp_dir, "Invalid.java")
        with open(invalid_file, 'w') as f:
            f.write("invalid java code")

        methods = self.processor.extract_methods(invalid_file)
        self.assertEqual(len(methods), 0)

    def test_large_file(self):
        """Test handling of a large file"""
        # Create a file containing many methods
        large_file = os.path.join(self.temp_dir, "LargeClass.java")
        with open(large_file, 'w') as f:
            f.write("public class LargeClass {\n")
            for i in range(100):
                f.write(f'''
    public void method{i}() {{
        System.out.println("Method {i}");
    }}
''')
            f.write("}")

        # Verify that the large file can be processed
        methods = self.processor.extract_methods(large_file)
        self.assertEqual(len(methods), 100)

if __name__ == '__main__':
    unittest.main()

--------------------------------------------------------------------------------
/tests/preprocessor/test_preprocessor.py:
--------------------------------------------------------------------------------
"""Preprocessor test module

This module contains unit tests for the preprocessor classes.

Author: byRen2002
Last modified: March 2025
License: MIT License
"""

import unittest
import os
import tempfile
import shutil
import json
from unittest.mock import patch, MagicMock

from preprocessor.preprocessor import (
    PreprocessorConfig,
    SignatureProcessor,
    MetaInfoManager,
    CodeSegmenter
)

class TestPreprocessor(unittest.TestCase):
    """Test cases for the preprocessor classes"""

    def setUp(self):
        """Set up before each test"""
        # Create a temporary test directory
        self.temp_dir = tempfile.mkdtemp()
        self.test_data_dir = os.path.join(self.temp_dir, "test_data")
        os.makedirs(self.test_data_dir)

        # Create the test repository directory structure
        self.repo_dir = os.path.join(self.test_data_dir, "repos")
        self.repo_date_dir = os.path.join(self.test_data_dir, "repo_date")
        self.repo_func_dir = os.path.join(self.test_data_dir, "repo_functions")

        for dir_path in [self.repo_dir, self.repo_date_dir,
                         self.repo_func_dir]:
            os.makedirs(dir_path)

        # Create the test configuration
        self.config = PreprocessorConfig()
        self.config.current_path = self.test_data_dir
        self.config.tag_date_path = self.repo_date_dir
        self.config.result_path = self.repo_func_dir

        # Create test data
        self._create_test_data()

    def tearDown(self):
        """Clean up after each test"""
        # Remove the temporary directory and its contents
        shutil.rmtree(self.temp_dir)

    def _create_test_data(self):
        """Create test data"""
        # Create the version date file
        repo_date_file = os.path.join(self.repo_date_dir, "test_repo")
        with open(repo_date_file, 'w') as f:
            f.write("2024-01-01 tag: v1.0\n")
            f.write("2024-02-01 tag: v1.1\n")
            f.write("2024-03-01 tag: v2.0\n")

        # Create the function signature files
        repo_func_dir = os.path.join(self.repo_func_dir, "test_repo")
        os.makedirs(repo_func_dir)

        versions = ["v1.0", "v1.1", "v2.0"]
        for version in versions:
            func_file = os.path.join(repo_func_dir, f"fuzzy_{version}.hidx")
            with open(func_file, 'w') as f:
                f.write("hash\tfunction\tfile\n")
                f.write("hash1\tfunc1\tfile1.py\n")
                f.write("hash2\tfunc2\tfile2.py\n")
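
    # The fuzzy_*.hidx fixtures above follow a simple tab-separated layout:
    # a header row, then one "hash<TAB>function<TAB>file" record per line.
    # A minimal reader sketch for that layout; illustrative only, the real
    # parser lives in the preprocessor and is not shown here:
    @staticmethod
    def _read_hidx(path):
        records = []
        with open(path) as f:
            next(f)  # skip the header row
            for line in f:
                func_hash, func_name, file_name = line.rstrip('\n').split('\t')
                records.append((func_hash, func_name, file_name))
        return records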

    def test_config_initialization(self):
        """Test configuration initialization"""
        # Verify that the working directories were created
        self.assertTrue(os.path.exists(self.config.ver_idx_path))
        self.assertTrue(os.path.exists(self.config.initial_db_path))
        self.assertTrue(os.path.exists(self.config.final_db_path))
        self.assertTrue(os.path.exists(self.config.meta_path))

    def test_signature_processing(self):
        """Test signature processing"""
        processor = SignatureProcessor(self.config)

        # Process the test repository
        processor.process_single_repo("test_repo")

        # Verify the output files
        self.assertTrue(
            os.path.exists(
                os.path.join(self.config.func_date_path, "test_repo_funcdate")
            )
        )
        self.assertTrue(
            os.path.exists(
                os.path.join(self.config.ver_idx_path, "test_repo_idx")
            )
        )
        self.assertTrue(
            os.path.exists(
                os.path.join(self.config.initial_db_path, "test_repo_sig")
            )
        )

        # Verify the version index content
        with open(os.path.join(self.config.ver_idx_path, "test_repo_idx")) as f:
            ver_idx = json.load(f)
            self.assertEqual(len(ver_idx), 3)  # there should be 3 versions

    def test_meta_info_management(self):
        """Test meta info management"""
        # Process the signatures first
        processor = SignatureProcessor(self.config)
        processor.process_single_repo("test_repo")

        # Process the meta info
        meta_manager = MetaInfoManager(self.config)
        meta_manager.save_meta_infos()

        # Verify the meta info files
        self.assertTrue(
            os.path.exists(os.path.join(self.config.meta_path, "aveFuncs"))
        )
        self.assertTrue(
            os.path.exists(os.path.join(self.config.meta_path, "allFuncs"))
        )
        self.assertTrue(
            os.path.exists(os.path.join(self.config.meta_path, "uniqueFuncs"))
        )

        # Verify the weight file
        self.assertTrue(
            os.path.exists(
                os.path.join(self.config.weight_path, "test_repo_weights")
            )
        )

    def test_code_segmentation(self):
        """Test code segmentation"""
        # Prepare the data
        processor = SignatureProcessor(self.config)
        processor.process_single_repo("test_repo")

        meta_manager = MetaInfoManager(self.config)
        meta_manager.save_meta_infos()

        # Run code segmentation
        segmenter = CodeSegmenter(self.config)
        segmenter.segment_code()

        # Verify the segmentation result
        self.assertTrue(
            os.path.exists(
                os.path.join(self.config.final_db_path, "test_repo_sig")
            )
        )

    def test_version_date_extraction(self):
        """Test version date extraction"""
        processor = SignatureProcessor(self.config)
        ver_dates = processor.extract_ver_date("test_repo")

        # Verify the version dates
        self.assertEqual(ver_dates["v1.0"], "2024-01-01")
        self.assertEqual(ver_dates["v1.1"], "2024-02-01")
        self.assertEqual(ver_dates["v2.0"], "2024-03-01")
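
    # extract_ver_date parses lines of the form "2024-01-01 tag: v1.0" from
    # the repo_date fixture written in _create_test_data. Its behavior, as
    # exercised above, amounts to the following sketch (illustrative only):
    @staticmethod
    def _parse_ver_dates(lines):
        ver_dates = {}
        for line in lines:
            date, _, tag = line.strip().partition(' tag: ')
            if tag:
                ver_dates[tag] = date
        return ver_dates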

    def test_error_handling(self):
        """Test error handling"""
        processor = SignatureProcessor(self.config)

        # Processing a nonexistent repository should not raise
        processor.process_single_repo("nonexistent_repo")

        # Verify that no output files were created for it
        self.assertFalse(
            os.path.exists(
                os.path.join(self.config.func_date_path, "nonexistent_repo_funcdate")
            )
        )

    def test_concurrent_processing(self):
        """Test concurrent processing"""
        import threading

        def worker():
            processor = SignatureProcessor(self.config)
            processor.process_single_repo("test_repo")

        # Create several threads
        threads = [threading.Thread(target=worker) for _ in range(4)]

        # Start all threads
        for thread in threads:
            thread.start()

        # Wait for all threads to finish
        for thread in threads:
            thread.join()

        # Verify that the results stayed consistent
        with open(os.path.join(self.config.ver_idx_path, "test_repo_idx")) as f:
            ver_idx = json.load(f)
            self.assertEqual(len(ver_idx), 3)

    def test_memory_efficiency(self):
        """Test memory efficiency"""
        import psutil
        process = psutil.Process()

        # Record the initial memory usage
        initial_memory = process.memory_info().rss

        # Process a large amount of data
        processor = SignatureProcessor(self.config)
        for i in range(10):
            # Create more test data
            repo_name = f"test_repo_{i}"
            repo_dir = os.path.join(self.repo_func_dir, repo_name)
            os.makedirs(repo_dir)

            for j in range(100):
                with open(os.path.join(repo_dir, f"fuzzy_v{j}.hidx"), 'w') as f:
                    f.write("hash\tfunction\tfile\n")
                    for k in range(1000):
                        f.write(f"hash{k}\tfunc{k}\tfile{k}.py\n")

            processor.process_single_repo(repo_name)

        # Record the final memory usage
        final_memory = process.memory_info().rss

        # Verify that memory growth stays within bounds
        memory_growth = (final_memory - initial_memory) / (1024 * 1024)  # MB
        self.assertLess(memory_growth, 1000)  # growth should stay under 1 GB

if __name__ == '__main__':
    unittest.main()

--------------------------------------------------------------------------------
/tests/run_tests.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
"""Test runner

Runs all unit, integration, and security tests.
Supports parallel test execution and test report generation.

Author: byRen2002
Last modified: March 2025
License: MIT
"""

import os
import sys
import unittest
import argparse
import coverage
import xmlrunner
import concurrent.futures
from typing import List, Tuple
from datetime import datetime

def discover_tests(start_dir: str) -> List[unittest.TestSuite]:
    """Discover test cases

    Args:
        start_dir: directory to start from

    Returns:
        A list of test suites
    """
    loader = unittest.TestLoader()
    suites = []

    for root, _, files in os.walk(start_dir):
        if any(f.startswith('test_') and f.endswith('.py') for f in files):
            suite = loader.discover(root, pattern='test_*.py')
            suites.append(suite)

    return suites

def run_test_suite(suite: unittest.TestSuite) -> Tuple[int, int, List[str]]:
    """Run a test suite

    Args:
        suite: the test suite to run

    Returns:
        (passed count, failed count, list of error messages)
    """
    result = unittest.TestResult()
    suite.run(result)

    errors = []
    for test, error in result.errors:
        errors.append(f"ERROR ({test}): {error}")
    for test, failure in result.failures:
        errors.append(f"FAIL ({test}): {failure}")

    passed = result.testsRun - len(result.failures) - len(result.errors)
    failed = len(result.failures) + len(result.errors)
    return passed, failed, errors
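
# main() below submits TestSuite objects to a ProcessPoolExecutor. Suites are
# not reliably picklable, so a more robust variant submits plain directory
# paths and rediscovers the tests inside each worker process. A minimal sketch
# of that alternative (the function name is illustrative):
def run_tests_in_dir(test_dir: str) -> Tuple[int, int, List[str]]:
    """Discover and run the tests under `test_dir` in this process."""
    loader = unittest.TestLoader()
    suite = loader.discover(test_dir, pattern='test_*.py')
    return run_test_suite(suite)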

def main():
    """Main entry point"""
    parser = argparse.ArgumentParser(description='Re-Centris test runner')
    parser.add_argument('--parallel', action='store_true', help='run tests in parallel')
    parser.add_argument('--coverage', action='store_true', help='generate a coverage report')
    parser.add_argument('--xml', action='store_true', help='generate an XML test report')
    parser.add_argument('--html', action='store_true', help='generate an HTML test report')
    args = parser.parse_args()

    # Set up coverage collection
    if args.coverage:
        cov = coverage.Coverage()
        cov.start()

    # Discover tests
    suites = discover_tests('tests')
    if not suites:
        print("No test cases found")
        sys.exit(1)

    total_tests = 0
    passed_tests = 0
    failed_tests = 0
    all_errors = []

    # Run tests
    if args.parallel:
        print("Running tests in parallel...")
        with concurrent.futures.ProcessPoolExecutor() as executor:
            futures = [executor.submit(run_test_suite, suite) for suite in suites]
            for future in concurrent.futures.as_completed(futures):
                passed, failed, errors = future.result()
                passed_tests += passed
                failed_tests += failed
                all_errors.extend(errors)
                total_tests += passed + failed
    else:
        print("Running tests serially...")
        for suite in suites:
            passed, failed, errors = run_test_suite(suite)
            passed_tests += passed
            failed_tests += failed
            all_errors.extend(errors)
            total_tests += passed + failed

    # Generate reports
    if args.xml:
        print("Generating XML report...")
        xml_dir = 'test-reports/xml'
        os.makedirs(xml_dir, exist_ok=True)
        for suite in suites:
            xmlrunner.XMLTestRunner(output=xml_dir).run(suite)

    if args.html:
        print("Generating HTML report...")
        html_dir = 'test-reports/html'
        os.makedirs(html_dir, exist_ok=True)
        with open(os.path.join(html_dir, 'index.html'), 'w') as f:
            f.write(f"""<html>
<head><title>Test Report</title></head>
<body>
<h1>Test Report ({datetime.now().strftime('%Y-%m-%d %H:%M:%S')})</h1>
<p>Total tests: {total_tests}</p>
<p>Passed: {passed_tests}</p>
<p>Failed: {failed_tests}</p>
<h2>Error details:</h2>
{'<br>'.join(all_errors)}
</body>
</html>
""")

    if args.coverage:
        print("Generating coverage report...")
        cov.stop()
        cov.save()

        # Write the reports
        cov_dir = 'test-reports/coverage'
        os.makedirs(cov_dir, exist_ok=True)

        # HTML report
        cov.html_report(directory=os.path.join(cov_dir, 'html'))

        # XML report
        cov.xml_report(outfile=os.path.join(cov_dir, 'coverage.xml'))

    # Print a summary
    print("\nTest result summary:")
    print(f"Total tests: {total_tests}")
    print(f"Passed: {passed_tests}")
    print(f"Failed: {failed_tests}")

    if all_errors:
        print("\nError details:")
        for error in all_errors:
            print(error)

    # Return the exit code
    return 1 if failed_tests > 0 else 0
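
# The HTML report above interpolates raw error strings; tracebacks often
# contain '<' and '>', which can break the markup. A stdlib-only sketch of a
# safer rendering step (illustrative; not currently wired into main()):
import html


def render_error_html(errors: List[str]) -> str:
    """Escape error text and join it with <br> for the HTML report."""
    return '<br>'.join(html.escape(e) for e in errors)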

if __name__ == '__main__':
    sys.exit(main())

--------------------------------------------------------------------------------
/tests/security/test_security.py:
--------------------------------------------------------------------------------
import unittest
import os
import time
import tempfile
import shutil
import json
import subprocess
from unittest.mock import patch
from preprocessor.preprocessor import Preprocessor
from detector.Detector import Detector

class TestSecurity(unittest.TestCase):
    """Security tests"""

    @classmethod
    def setUpClass(cls):
        """Class-level setup"""
        cls.work_dir = tempfile.mkdtemp()
        cls._create_test_environment()

    @classmethod
    def tearDownClass(cls):
        """Class-level teardown"""
        shutil.rmtree(cls.work_dir)

    @classmethod
    def _create_test_environment(cls):
        """Create the test environment"""
        # Create the directory structure
        dirs = [
            "input",
            "preprocessor/result",
            "preprocessor/initialSigs",
            "preprocessor/componentDB",
            "preprocessor/metaInfos",
            "detector/result"
        ]

        for dir_path in dirs:
            os.makedirs(os.path.join(cls.work_dir, dir_path))

    def setUp(self):
        """Per-test setup"""
        self.preprocessor = Preprocessor()
        self.detector = Detector()

        self.preprocessor.config.set_base_path(self.work_dir)
        self.detector.base_path = self.work_dir

    def test_path_traversal(self):
        """Test protection against path traversal attacks"""
        # Relative path traversal attempts
        malicious_paths = [
            "../../../etc/passwd",
            "..\\..\\..\\Windows\\System32\\config\\SAM",
            "%2e%2e%2f%2e%2e%2f%2e%2e%2f",  # URL-encoded ../../../
            "input/project/../../etc/passwd"
        ]

        for path in malicious_paths:
            full_path = os.path.join(self.work_dir, path)
            result = self.detector.process_file(full_path, self.work_dir)
            self.assertEqual(result, ({}, 0, 0, 0))
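
    # A typical defense that makes the traversal cases above return empty
    # results is to resolve the candidate path and require that it stay
    # inside the sandbox root. A minimal sketch of such a check; illustrative,
    # since the detector's actual guard is not shown here:
    @staticmethod
    def _is_within(base_dir, candidate):
        base = os.path.realpath(base_dir)
        target = os.path.realpath(candidate)
        return os.path.commonpath([base, target]) == base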

    def test_file_content_injection(self):
        """Test protection against file content injection"""
        # Create a file with malicious content
        malicious_file = os.path.join(self.work_dir, "input/malicious.cpp")
        with open(malicious_file, 'w') as f:
            f.write("""
#include <cstdlib>

int main() {
    system("rm -rf /"); // dangerous system call
    return 0;
}

__attribute__((constructor))
void init() {
    system("echo 'Malicious code executed'");
}
""")

        # Ensure that processing never executes the code
        with patch('subprocess.run') as mock_run:
            self.detector.process_file(malicious_file, self.work_dir)
            mock_run.assert_not_called()

    def test_memory_limits(self):
        """Test memory limits"""
        # Create a large file
        large_file = os.path.join(self.work_dir, "input/large.cpp")
        with open(large_file, 'w') as f:
            f.write("a" * (100 * 1024 * 1024))  # 100 MB

        try:
            self.detector.process_file(large_file, self.work_dir)
        except MemoryError:
            self.fail("Memory limit handling failed")

    def test_cpu_limits(self):
        """Test CPU limits"""
        # Create a CPU-intensive file
        cpu_intensive_file = os.path.join(self.work_dir, "input/cpu_intensive.cpp")
        with open(cpu_intensive_file, 'w') as f:
            f.write("int main() { while(1); return 0; }")

        start_time = time.time()
        self.detector.process_file(cpu_intensive_file, self.work_dir)
        duration = time.time() - start_time

        self.assertLess(duration, 10)  # should time out within 10 seconds

    def test_file_type_validation(self):
        """Test file type validation"""
        # Create a disguised executable
        fake_cpp = os.path.join(self.work_dir, "input/fake.cpp")
        with open(fake_cpp, 'wb') as f:
            f.write(b"MZ\x90\x00\x03")  # PE file header

        result = self.detector.process_file(fake_cpp, self.work_dir)
        self.assertEqual(result, ({}, 0, 0, 0))

    def test_input_sanitization(self):
        """Test input sanitization"""
        # SQL injection attempt
        malicious_input = "'; DROP TABLE users; --"
        safe_path = os.path.join(self.work_dir, malicious_input)
        result = self.detector.process_file(safe_path, self.work_dir)
        self.assertEqual(result, ({}, 0, 0, 0))

        # Command injection attempt
        malicious_input = "; rm -rf /"
        safe_path = os.path.join(self.work_dir, malicious_input)
        result = self.detector.process_file(safe_path, self.work_dir)
        self.assertEqual(result, ({}, 0, 0, 0))

    def test_file_permissions(self):
        """Test file permissions"""
        # Create a read-only file
        readonly_file = os.path.join(self.work_dir, "input/readonly.cpp")
        with open(readonly_file, 'w') as f:
            f.write("int main() { return 0; }")

        # Set read-only permissions
        os.chmod(readonly_file, 0o444)

        try:
            self.detector.process_file(readonly_file, self.work_dir)
        except PermissionError:
            self.fail("File permission handling failed")

    def test_concurrent_access(self):
        """Test concurrent access safety"""
        import threading

        # Create a test file
        test_file = os.path.join(self.work_dir, "input/concurrent.cpp")
        with open(test_file, 'w') as f:
            f.write("int main() { return 0; }")

        # Access it concurrently
        def process_file():
            self.detector.process_file(test_file, self.work_dir)

        threads = []
        for _ in range(10):
            thread = threading.Thread(target=process_file)
            threads.append(thread)
            thread.start()

        for thread in threads:
            thread.join()

    def test_resource_cleanup(self):
        """Test resource cleanup"""
        import psutil

        # Record the initial number of file descriptors
        process = psutil.Process()
        initial_fds = process.num_fds()

        # Perform the operation repeatedly
        for _ in range(10):
            test_file = os.path.join(self.work_dir, "input/test.cpp")
            with open(test_file, 'w') as f:
                f.write("int main() { return 0; }")

            self.detector.process_file(test_file, self.work_dir)

        # Verify that no file descriptors leaked
        final_fds = process.num_fds()
        self.assertLessEqual(final_fds - initial_fds, 5)

    def test_data_validation(self):
        """Test data validation"""
        # Invalid TLSH digests
        invalid_hashes = [
            "not_a_hash",
            "T1" + "0" * 69,  # too short
            "T1" + "0" * 71,  # too long
            "T1" + "XYZ" + "0" * 67  # invalid characters
        ]

        for hash_val in invalid_hashes:
            result = self.detector._compute_tlsh(hash_val)
            self.assertIsNone(result)
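
    # The invalid digests above pin down the expected TLSH shape: a "T1"
    # version prefix followed by 70 hex characters. A format pre-check along
    # those lines; a sketch only, as the detector's real validation may do
    # more than a shape test:
    @staticmethod
    def _looks_like_tlsh(digest):
        import re
        return bool(re.fullmatch(r'T1[0-9A-Fa-f]{70}', digest))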
("invalid_comp", {}, "test_repo", {}) 220 | ) 221 | self.assertIsNone(result) 222 | 223 | # 测试无效配置 224 | with self.assertRaises(Exception): 225 | detector = Detector("invalid_config.yaml") 226 | 227 | if __name__ == '__main__': 228 | unittest.main() --------------------------------------------------------------------------------