├── .github
│   └── workflows
│       └── ci.yml
├── Dockerfile
├── LICENSE
├── README.md
├── clone
│   └── Clone_Repo.py
├── config.yaml
├── core
│   ├── __init__.py
│   ├── cache.py
│   ├── config_manager.py
│   ├── logger.py
│   ├── memory_optimizer.py
│   ├── parallel_manager.py
│   ├── performance_monitor.py
│   └── resource_manager.py
├── detector
│   ├── Detector.py
│   ├── __init__.py
│   ├── ast_analyzer.py
│   ├── clone_detector.py
│   ├── metrics.py
│   ├── semantic_analyzer.py
│   └── version_predictor.py
├── docker-compose.yml
├── osscollector
│   ├── collector.py
│   └── sample
├── preprocessor
│   ├── Preprocessor_full.py
│   ├── Preprocessor_lite.py
│   ├── __init__.py
│   ├── language_processors
│   │   ├── cpp_processor.py
│   │   └── java_processor.py
│   └── preprocessor.py
├── prometheus.yml
├── re-centris-go
│   ├── cmd
│   │   └── re-centris
│   │       └── main.go
│   ├── config.yaml
│   ├── go.mod
│   ├── internal
│   │   ├── analyzer
│   │   │   ├── analyzer.go
│   │   │   ├── parser
│   │   │   │   ├── cpp
│   │   │   │   │   ├── parser.go
│   │   │   │   │   └── parser_test.go
│   │   │   │   └── parser.go
│   │   │   └── tlsh
│   │   │       ├── errors.go
│   │   │       ├── tlsh.go
│   │   │       └── tlsh_test.go
│   │   ├── cmd
│   │   │   ├── analyze.go
│   │   │   ├── clone.go
│   │   │   ├── detect.go
│   │   │   └── root.go
│   │   ├── collector
│   │   │   └── clone
│   │   │       └── clone.go
│   │   ├── common
│   │   │   ├── cache
│   │   │   │   └── cache.go
│   │   │   ├── logger
│   │   │   │   └── logger.go
│   │   │   └── monitor
│   │   │       └── monitor.go
│   │   ├── config
│   │   │   └── config.go
│   │   ├── detector
│   │   │   └── detector.go
│   │   └── preprocessor
│   │       └── preprocessor.go
│   └── tests
│       ├── integration
│       │   └── clone_analyze_test.go
│       └── security
│           └── security_test.go
├── requirements.txt
├── scripts
│   └── deploy.sh
└── tests
    ├── __init__.py
    ├── core
    │   ├── test_cache.py
    │   ├── test_config_manager.py
    │   ├── test_memory_optimizer.py
    │   ├── test_parallel_manager.py
    │   ├── test_performance_monitor.py
    │   └── test_resource_manager.py
    ├── detector
    │   ├── test_detector.py
    │   └── test_version_predictor.py
    ├── integration
    │   ├── test_clone_detection.py
    │   └── test_integration.py
    ├── preprocessor
    │   ├── test_cpp_processor.py
    │   ├── test_java_processor.py
    │   └── test_preprocessor.py
    ├── run_tests.py
    └── security
        └── test_security.py
--------------------------------------------------------------------------------
/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
name: Re-Centris CI/CD

on:
  push:
    branches: [ main, develop ]
  pull_request:
    branches: [ main, develop ]

jobs:
  test:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: [3.8, 3.9, "3.10"]

    steps:
      - uses: actions/checkout@v3

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python-version }}

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt
          pip install -r requirements-dev.txt

      - name: Run linting
        run: |
          flake8 .
          black . --check
          isort . --check-only
          mypy .

      - name: Run tests
        run: |
          pytest --cov=. --cov-report=xml
      - name: Upload coverage
        uses: codecov/codecov-action@v3
        with:
          file: ./coverage.xml

  build:
    needs: test
    runs-on: ubuntu-latest
    if: github.event_name == 'push' && github.ref == 'refs/heads/main'

    steps:
      - uses: actions/checkout@v3

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: "3.10"

      - name: Build package
        run: |
          pip install build
          python -m build

      - name: Upload artifact
        uses: actions/upload-artifact@v3
        with:
          name: dist
          path: dist/

  deploy:
    needs: build
    runs-on: ubuntu-latest
    if: github.event_name == 'push' && github.ref == 'refs/heads/main'

    steps:
      - uses: actions/download-artifact@v3
        with:
          name: dist
          path: dist/

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: "3.10"

      - name: Install twine
        run: pip install twine

      - name: Publish to PyPI
        env:
          TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
          TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
        run: twine upload dist/*
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
# Use the official Python image as the base image
FROM python:3.10-slim

# Set the working directory
WORKDIR /app

# Set environment variables
ENV PYTHONUNBUFFERED=1 \
    PYTHONDONTWRITEBYTECODE=1 \
    PIP_NO_CACHE_DIR=1 \
    PIP_DISABLE_PIP_VERSION_CHECK=1

# Install system dependencies (curl is required by the HEALTHCHECK below)
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    libclang-dev \
    git \
    curl \
    && rm -rf /var/lib/apt/lists/*

# Copy project files
COPY . .

# Install Python dependencies
RUN pip install --no-cache-dir -r requirements.txt

# Expose the service port
EXPOSE 8000

# Configure the health check
HEALTHCHECK --interval=30s --timeout=30s --start-period=5s --retries=3 \
    CMD curl -f http://localhost:8000/health || exit 1

# Set the startup command
CMD ["gunicorn", "--bind", "0.0.0.0:8000", "app:app"]
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2024 byRen2002

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Re-Centris

Re-Centris is a high-performance code similarity analysis tool built on the TLSH (Trend Micro Locality Sensitive Hash) algorithm. It focuses on code clone detection, open-source component identification, and dependency analysis, and supports multiple programming languages.

## Key Features

- **High-precision code similarity analysis**
  - Fuzzy hash matching based on the TLSH algorithm
  - Detects refactored code and variants
  - Fine-grained analysis at the function level

- **Multi-language support**
  - Python version supports: C/C++, Java, Python
  - Go version currently supports: C/C++ (support for more languages is being added)

- **Performance-oriented design**
  - Multi-process / goroutine parallel processing
  - Memory-mapped I/O
  - Intelligent caching
  - Optimized resource usage

- **Rich analysis capabilities**
  - Open-source component identification
  - Code clone detection
  - Dependency analysis
  - Version information extraction

## Choosing a Version

### Python version
- Suitable when:
  - You need to analyze multiple programming languages
  - You need more flexible extensibility
  - Ease of use is a priority

### Go version
- Suitable when:
  - Analyzing large-scale codebases
  - Performance is critical
  - Primarily analyzing C/C++ code

## Quick Start

### Python version installation

```bash
# 1. Clone the repository
git clone https://github.com/xxx/xxx.git
cd re-centris

# 2. Create and activate a virtual environment
python -m venv venv
source venv/bin/activate  # Linux/Mac
venv\Scripts\activate     # Windows

# 3. Install dependencies
pip install -r requirements.txt
```

### Go version installation

```bash
# 1. Clone the repository
git clone https://github.com/yourusername/re-centris-go.git
cd re-centris-go

# 2. Build the project
go build -o re-centris ./cmd/re-centris

# 3. (Optional) system-wide install
go install ./cmd/re-centris
```

## Usage Examples

### Python version

```bash
# 1. Collect open-source code information
python -m osscollector.collector -c config.yaml

# 2. Preprocess the code
python -m preprocessor.preprocessor -c config.yaml

# 3. Run similarity detection
python -m detector.detector -c config.yaml -i path/to/input/code
```

### Go version

```bash
# 1. Clone and collect code
re-centris clone repo-list.txt -o ./repos

# 2. Analyze the code
re-centris analyze ./source-code -o ./analysis

# 3. Run similarity detection
re-centris detect target-file.cpp -k ./known-files -o results.json
```

## Configuration

The configuration file uses YAML format and supports the following main options:

```yaml
paths:
  repo_path: "./repos"
  tag_date_path: "./data/repo_date"
  result_path: "./data/repo_functions"

performance:
  max_workers: 0    # Automatically use the number of available CPU cores
  cache_size: 1000
  memory_limit: 0.8 # Maximum memory usage ratio

languages:
  cpp:
    enabled: true
    extensions: [".c", ".cc", ".cpp", ".cxx", ".h", ".hpp"]
  java:
    enabled: false
    extensions: [".java"]
  python:
    enabled: false
    extensions: [".py"]
```

## Project Layout

### Python version
```
re-centris/
├── core/           # Core functionality modules
├── osscollector/   # Open-source code collection
├── preprocessor/   # Code preprocessing
├── detector/       # Similarity detection
├── config.yaml
└── requirements.txt
```

### Go version
```
re-centris-go/
├── cmd/            # CLI entry point
├── internal/       # Core implementation
│   ├── analyzer/      # Code analysis
│   ├── collector/     # Code collection
│   ├── detector/      # Similarity detection
│   └── preprocessor/  # Preprocessing
└── config.yaml
```

## Output

Analysis results are emitted as JSON and include:
- Similarity scores
- Function-level match information
- A dependency graph
- Version tracking information
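The exact schema depends on the detector configuration; a single match entry might look like this (field names and values are purely illustrative, not a fixed format):

```json
{
  "component": "openssl",
  "predicted_version": "1.1.1k",
  "similarity": 0.93,
  "matched_functions": 128,
  "total_functions": 154
}
```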
## Contributing

Pull requests are welcome! Please make sure that:

1. The code passes all tests
2. Necessary test cases are added
3. Related documentation is updated
4. The project coding conventions are followed

## License

MIT License - see the LICENSE file for details.

## About

Developed and maintained by byRen2002. Please report issues via GitHub Issues.
--------------------------------------------------------------------------------
/clone/Clone_Repo.py:
--------------------------------------------------------------------------------
import os
import subprocess
import re
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import logging
from typing import List, Tuple

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('clone.log'),
        logging.StreamHandler()
    ]
)

def parse_repo_url(repo_url: str) -> Tuple[str, str, str]:
    """Parse a GitHub repository URL into its author and repository name.

    Args:
        repo_url: GitHub repository URL

    Returns:
        Tuple[str, str, str]: author name, repository name, and the original URL
    """
    match = re.search(r'github\.com/([^/]+)/([^/]+)', repo_url)
    if not match:
        raise ValueError(f"Unable to parse repository URL: {repo_url}")

    author, repo_name = match.groups()
    repo_name = repo_name[:-4] if repo_name.endswith('.git') else repo_name

    return author, repo_name, repo_url

def clone_single_repo(repo_info: Tuple[str, str, str], clone_path: str) -> bool:
    """Clone a single repository.

    Args:
        repo_info: tuple of (author name, repository name, URL)
        clone_path: target path for the clone

    Returns:
        bool: whether the clone succeeded
    """
    author, repo_name, repo_url = repo_info
    folder_name = f"{author}%{repo_name}"
    target_path = os.path.join(clone_path, folder_name)

    if os.path.exists(target_path):
        logging.info(f"Repository {folder_name} already exists, skipping clone")
        return True

    try:
        # Optimized git clone command
        cmd = [
            'git', 'clone',
            '--depth', '1',      # Clone only the latest revision
            '--single-branch',   # Clone only the default branch
            '--no-tags',         # Do not clone tags
            repo_url,
            target_path
        ]

        subprocess.run(
            cmd,
            check=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE
        )

        logging.info(f"Successfully cloned repository {folder_name}")
        return True

    except subprocess.CalledProcessError as e:
        logging.error(f"Failed to clone repository {repo_url}: {e.stderr.decode()}")
        return False
    except Exception as e:
        logging.error(f"Error while processing repository {repo_url}: {str(e)}")
        return False

def clone_repositories(repo_list_file: str, clone_path: str, max_workers: int = 5):
    """Clone multiple GitHub repositories in parallel.

    Args:
        repo_list_file: path to a file containing GitHub repository URLs
        clone_path: target path for the cloned repositories
        max_workers: maximum number of parallel worker threads
    """
    # Make sure the target directory exists
    os.makedirs(clone_path, exist_ok=True)

    # Read the list of repository URLs
    try:
        with open(repo_list_file, 'r', buffering=8192) as f:
            repo_urls = [url.strip() for url in f if url.strip()]
    except Exception as e:
        logging.error(f"Failed to read the repository list file: {str(e)}")
        return

    if not repo_urls:
        logging.warning("The repository list is empty")
        return

    # Parse all repository URLs
    repo_infos = []
    for url in repo_urls:
        try:
            repo_infos.append(parse_repo_url(url))
        except ValueError as e:
            logging.error(str(e))
            continue

    # Clone in parallel using a thread pool
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Submit all clone tasks
        future_to_repo = {
            executor.submit(clone_single_repo, repo_info, clone_path): repo_info
            for repo_info in repo_infos
        }

        # Show progress with tqdm
        with tqdm(total=len(repo_infos), desc="Cloning progress") as pbar:
            for future in as_completed(future_to_repo):
                repo_info = future_to_repo[future]
                try:
                    success = future.result()
                    if success:
                        pbar.set_description(f"Cloned {repo_info[1]}")
                    else:
                        pbar.set_description(f"Failed to clone {repo_info[1]}")
                except Exception as e:
                    logging.error(f"Error while processing repository {repo_info[1]}: {str(e)}")
                finally:
                    pbar.update(1)

    logging.info("All repositories cloned")

if __name__ == "__main__":
    clone_repositories(
        '/home/rby/Project/project-file/dependency_analysis/sample',
        '/home/rby/Project/project-file/dependency_analysis/repo_src'
    )
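A minimal usage sketch for the module above (the file names and paths here are illustrative assumptions, not part of the repository):

```python
# Clone a small list of repositories; repo_list.txt holds one GitHub URL per
# line, e.g. https://github.com/redis/redis.git
from clone.Clone_Repo import clone_repositories

clone_repositories(
    repo_list_file="repo_list.txt",
    clone_path="./repo_src",
    max_workers=5,
)
```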
--------------------------------------------------------------------------------
/config.yaml:
--------------------------------------------------------------------------------
# Re-Centris configuration file

# Path settings
paths:
  # Repository path
  repo_path: "./repos"
  # Tag date path
  tag_date_path: "./osscollector/repo_date"
  # Result path
  result_path: "./osscollector/repo_functions"
  # Log path
  log_path: "./logs"
  # Version index path
  ver_idx_path: "./preprocessor/verIDX"
  # Initial signature database path
  initial_db_path: "./preprocessor/initialSigs"
  # Final component database path
  final_db_path: "./preprocessor/componentDB"
  # Meta-information path
  meta_path: "./preprocessor/metaInfos"
  # Weights path
  weight_path: "./preprocessor/metaInfos/weights"
  # Function date path
  func_date_path: "./preprocessor/funcDate"
  # Cache path
  cache_path: "./cache"

# Performance settings
performance:
  # Maximum number of worker processes; defaults to the number of CPU cores
  max_workers: null
  # Cache size
  cache_size: 1000
  # Cache expiration time (seconds)
  cache_expire: 3600
  # Memory usage limit (0.0-1.0)
  memory_limit: 0.9
  # Timeout (seconds)
  timeout: 30
  # Batch size
  batch_size: 1000

# Logging settings
logging:
  # Log level: DEBUG, INFO, WARNING, ERROR, CRITICAL
  level: "INFO"
  # Maximum log file size (bytes)
  max_size: 10485760  # 10MB
  # Number of log file backups
  backup_count: 5

# Analysis settings
analysis:
  # Similarity threshold
  theta: 0.1
  # TLSH difference threshold
  tlsh_threshold: 30

# External tool settings
external_tools:
  # ctags path
  ctags_path: "ctags"

# Supported language settings
languages:
  # C/C++
  cpp:
    enabled: true
    extensions: [".c", ".cc", ".cpp", ".cxx", ".h", ".hpp"]
  # Java
  java:
    enabled: false
    extensions: [".java"]
  # Python
  python:
    enabled: false
    extensions: [".py"]
  # JavaScript
  javascript:
    enabled: false
    extensions: [".js", ".jsx", ".ts", ".tsx"]
  # Go
  go:
    enabled: false
    extensions: [".go"]
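A quick sanity check of this file from Python, assuming PyYAML is installed (the project's requirements already depend on YAML support):

```python
# Load config.yaml and print a few effective settings.
import yaml

with open("config.yaml", "r", encoding="utf-8") as f:
    cfg = yaml.safe_load(f)

enabled = [name for name, lang in cfg["languages"].items() if lang["enabled"]]
print("Enabled languages:", enabled)              # e.g. ['cpp']
print("Similarity threshold:", cfg["analysis"]["theta"])
```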
--------------------------------------------------------------------------------
/core/__init__.py:
--------------------------------------------------------------------------------
"""Re-Centris core module

This module provides the core functionality and utility classes of the
Re-Centris project, including:
- Cache management
- Resource management
- Memory optimization
- Performance monitoring
- Configuration management
- Logging

Author: Re-Centris team
Version: 1.0.0
License: MIT
"""

from .cache import Cache
from .resource_manager import ResourceManager
from .memory_optimizer import MemoryOptimizer
from .performance_monitor import PerformanceMonitor
from .config_manager import ConfigManager
from .logger import setup_logger

__all__ = [
    'Cache',
    'ResourceManager',
    'MemoryOptimizer',
    'PerformanceMonitor',
    'ConfigManager',
    'setup_logger'
]
--------------------------------------------------------------------------------
/core/cache.py:
--------------------------------------------------------------------------------
"""Cache module

This module provides unified cache management, supporting both in-memory and
persistent caches, with LRU eviction, expiration times, and size limits.

Author: Re-Centris team
Version: 1.0.0
License: MIT
"""

import os
import time
import pickle
import threading
import logging
from typing import Dict, Any, Optional, Tuple, List, Callable
from functools import wraps

# Module-level logger
logger = logging.getLogger("re-centris.cache")


class Cache:
    """Generic cache with LRU eviction, expiration times, and size limits."""

    def __init__(
        self,
        max_size: int = 1000,
        expire_time: int = 3600,
        persistent: bool = False,
        cache_dir: Optional[str] = None
    ):
        """Initialize the cache.

        Args:
            max_size: maximum number of cache entries
            expire_time: cache expiration time (seconds)
            persistent: whether to persist the cache
            cache_dir: cache directory; only used when persistent=True
        """
        self.max_size = max_size
        self.expire_time = expire_time
        self.persistent = persistent
        self.cache_dir = cache_dir

        if persistent and not cache_dir:
            raise ValueError("A persistent cache requires a cache directory")

        if persistent and not os.path.exists(cache_dir):
            os.makedirs(cache_dir)

        self._cache: Dict[str, Any] = {}
        self._access_times: Dict[str, float] = {}
        self._lock = threading.Lock()

    def get(self, key: str) -> Optional[Any]:
        """Get a cached value.

        Args:
            key: cache key

        Returns:
            The cached value, or None if it does not exist or has expired
        """
        with self._lock:
            # Check the in-memory cache
            if key in self._cache:
                access_time = self._access_times[key]
                if time.time() - access_time <= self.expire_time:
                    # Refresh the access time
                    self._access_times[key] = time.time()
                    return self._cache[key]
                else:
                    # Entry has expired; remove it
                    del self._cache[key]
                    del self._access_times[key]

            # If persistence is enabled, try loading from file
            if self.persistent:
                cache_file = self._get_cache_file(key)
                if os.path.exists(cache_file):
                    try:
                        with open(cache_file, 'rb') as f:
                            data = pickle.load(f)
                            timestamp, value = data

                            if time.time() - timestamp <= self.expire_time:
                                # Load into the in-memory cache
                                self._cache[key] = value
                                self._access_times[key] = time.time()
                                return value
                            else:
                                # Entry has expired; remove the file
                                os.remove(cache_file)
                    except Exception as e:
                        logger.warning(f"Failed to load from persistent cache: {e}")

            return None

    def put(self, key: str, value: Any) -> None:
        """Store a value in the cache.

        Args:
            key: cache key
            value: cache value
        """
        with self._lock:
            # If the key already exists, refresh the access time
            if key in self._cache:
                self._access_times[key] = time.time()
                self._cache[key] = value
            else:
                # If the cache is full, evict the least recently used entry
                if len(self._cache) >= self.max_size:
                    self._evict_lru()

                # Add the new entry
                self._cache[key] = value
                self._access_times[key] = time.time()

            # If persistence is enabled, save to file
            if self.persistent:
                self._save_to_file(key, value)

    def _evict_lru(self) -> None:
        """Evict the least recently used cache entry."""
        if not self._access_times:
            return

        # Find the key with the oldest access time
        oldest_key = min(self._access_times.items(), key=lambda x: x[1])[0]

        # Remove it from the in-memory cache
        del self._cache[oldest_key]
        del self._access_times[oldest_key]

        # If persistence is enabled, remove the file
        if self.persistent:
            cache_file = self._get_cache_file(oldest_key)
            if os.path.exists(cache_file):
                try:
                    os.remove(cache_file)
                except Exception as e:
                    logger.warning(f"Failed to remove cache file: {e}")
    def _get_cache_file(self, key: str) -> str:
        """Get the cache file path for a key.

        Args:
            key: cache key

        Returns:
            Cache file path
        """
        # Use an MD5 hash as the file name to avoid invalid characters
        import hashlib
        key_hash = hashlib.md5(key.encode()).hexdigest()
        return os.path.join(self.cache_dir, f"{key_hash}.cache")

    def _save_to_file(self, key: str, value: Any) -> None:
        """Save a cache entry to file.

        Args:
            key: cache key
            value: cache value
        """
        if not self.persistent:
            return

        cache_file = self._get_cache_file(key)
        try:
            with open(cache_file, 'wb') as f:
                # Store the timestamp together with the value
                data = (time.time(), value)
                pickle.dump(data, f)
        except Exception as e:
            logger.warning(f"Failed to save cache entry to file: {e}")

    def clear(self) -> None:
        """Clear the cache."""
        with self._lock:
            self._cache.clear()
            self._access_times.clear()

            # If persistence is enabled, remove all cache files
            if self.persistent and os.path.exists(self.cache_dir):
                for filename in os.listdir(self.cache_dir):
                    if filename.endswith(".cache"):
                        try:
                            os.remove(os.path.join(self.cache_dir, filename))
                        except Exception as e:
                            logger.warning(f"Failed to remove cache file: {e}")

    def remove(self, key: str) -> bool:
        """Remove a cache entry.

        Args:
            key: cache key

        Returns:
            Whether the entry was removed
        """
        with self._lock:
            if key in self._cache:
                del self._cache[key]
                del self._access_times[key]

                # If persistence is enabled, remove the file
                if self.persistent:
                    cache_file = self._get_cache_file(key)
                    if os.path.exists(cache_file):
                        try:
                            os.remove(cache_file)
                        except Exception as e:
                            logger.warning(f"Failed to remove cache file: {e}")

                return True
            return False

    def keys(self) -> List[str]:
        """Get all cache keys.

        Returns:
            List of cache keys
        """
        with self._lock:
            return list(self._cache.keys())

    def size(self) -> int:
        """Get the cache size.

        Returns:
            Number of cache entries
        """
        with self._lock:
            return len(self._cache)

    def has_key(self, key: str) -> bool:
        """Check whether a cache key exists.

        Args:
            key: cache key

        Returns:
            Whether the cache key exists
        """
        with self._lock:
            return key in self._cache


def cached(cache: Cache, key_func: Optional[Callable] = None):
    """Decorator that caches function results.

    Args:
        cache: cache object
        key_func: cache-key function; if None, the key is built from the
            function name and its arguments

    Returns:
        Decorator function
    """
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            # Build the cache key
            if key_func:
                key = key_func(*args, **kwargs)
            else:
                # By default, build the key from the function name and arguments
                key = f"{func.__module__}.{func.__name__}:{str(args)}:{str(kwargs)}"

            # Try the cache first
            result = cache.get(key)
            if result is not None:
                return result

            # Call the function
            result = func(*args, **kwargs)

            # Store the result; note that None results are effectively never
            # cached, because a cache miss is also reported as None
            cache.put(key, result)

            return result
        return wrapper
    return decorator
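A minimal usage sketch for the module above (the import root is an assumption; it presumes the project root is on `sys.path`):

```python
# Memoize an expensive function with the LRU cache above.
from core.cache import Cache, cached

cache = Cache(max_size=100, expire_time=60)

@cached(cache)
def expensive(x: int) -> int:
    print("computing", x)
    return x * x

expensive(3)  # computes and stores the result
expensive(3)  # served from the cache; no "computing" output
```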
--------------------------------------------------------------------------------
/core/config_manager.py:
--------------------------------------------------------------------------------
"""Configuration management module

This module provides unified configuration management, supporting settings
loaded from configuration files, environment variables, and command-line
arguments. Settings include paths, performance parameters, logging options,
and more.

Author: Re-Centris team
Version: 1.0.0
License: MIT
"""

import os
import json
import yaml
import logging
from pathlib import Path
from typing import Dict, Any, Optional, Union


class ConfigManager:
    """Configuration manager responsible for loading, validating, and serving settings."""

    def __init__(self, config_file: Optional[str] = None):
        """Initialize the configuration manager.

        Args:
            config_file: configuration file path; if None, default locations are tried
        """
        self.config: Dict[str, Any] = {}
        self.config_file = config_file
        self._load_default_config()

        if config_file:
            self.load_config(config_file)
        else:
            # Try loading configuration from default locations
            default_locations = [
                "./config.yaml",
                "./config.json",
                os.path.expanduser("~/.re-centris/config.yaml"),
                os.path.expanduser("~/.re-centris/config.json"),
                "/etc/re-centris/config.yaml",
                "/etc/re-centris/config.json"
            ]

            for location in default_locations:
                if os.path.exists(location):
                    self.load_config(location)
                    break

        # Load configuration from environment variables
        self._load_from_env()

    def _load_default_config(self) -> None:
        """Load the default configuration."""
        # Get the current working directory
        current_dir = os.getcwd()

        self.config = {
            "paths": {
                "current_path": current_dir,
                "repo_path": os.path.join(current_dir, "repos"),
                "tag_date_path": os.path.join(current_dir, "osscollector", "repo_date"),
                "result_path": os.path.join(current_dir, "osscollector", "repo_functions"),
                "log_path": os.path.join(current_dir, "logs"),
                "ver_idx_path": os.path.join(current_dir, "preprocessor", "verIDX"),
                "initial_db_path": os.path.join(current_dir, "preprocessor", "initialSigs"),
                "final_db_path": os.path.join(current_dir, "preprocessor", "componentDB"),
                "meta_path": os.path.join(current_dir, "preprocessor", "metaInfos"),
                "weight_path": os.path.join(current_dir, "preprocessor", "metaInfos", "weights"),
                "func_date_path": os.path.join(current_dir, "preprocessor", "funcDate"),
                "cache_path": os.path.join(current_dir, "cache")
            },
            "performance": {
                "max_workers": os.cpu_count(),
                "cache_size": 1000,
                "cache_expire": 3600,  # 1 hour
                "memory_limit": 0.9,   # 90%
                "timeout": 30,         # 30 seconds
                "batch_size": 1000
            },
            "logging": {
                "level": "INFO",
                "max_size": 10 * 1024 * 1024,  # 10MB
                "backup_count": 5
            },
            "analysis": {
                "theta": 0.1,          # Similarity threshold
                "tlsh_threshold": 30   # TLSH difference threshold
            },
            "external_tools": {
                "ctags_path": "ctags"  # Looked up on PATH by default
            }
        }

    def load_config(self, config_file: str) -> None:
        """Load configuration from a file.

        Args:
            config_file: configuration file path

        Raises:
            FileNotFoundError: the configuration file does not exist
            ValueError: the configuration file format is not supported
        """
        if not os.path.exists(config_file):
            raise FileNotFoundError(f"Configuration file does not exist: {config_file}")

        try:
            ext = os.path.splitext(config_file)[1].lower()

            if ext == '.json':
                with open(config_file, 'r', encoding='utf-8') as f:
                    file_config = json.load(f)
            elif ext in ['.yaml', '.yml']:
                with open(config_file, 'r', encoding='utf-8') as f:
                    file_config = yaml.safe_load(f)
            else:
                raise ValueError(f"Unsupported configuration file format: {ext}")

            # Recursively merge the configuration
            self._update_config(self.config, file_config)

            logging.info(f"Configuration loaded from {config_file}")

        except Exception as e:
            logging.error(f"Failed to load configuration file: {e}")
            raise

    def _update_config(self, target: Dict, source: Dict) -> None:
        """Recursively update a configuration dictionary.

        Args:
            target: target configuration dictionary
            source: source configuration dictionary
        """
        for key, value in source.items():
            if key in target and isinstance(target[key], dict) and isinstance(value, dict):
                self._update_config(target[key], value)
            else:
                target[key] = value

    def _load_from_env(self) -> None:
        """Load configuration from environment variables.

        Environment variable format: RECENTRIS_SECTION_KEY=value
        For example: RECENTRIS_PATHS_REPO_PATH=/path/to/repos
        """
        prefix = "RECENTRIS_"

        for key, value in os.environ.items():
            if key.startswith(prefix):
                parts = key[len(prefix):].lower().split('_')

                if len(parts) >= 2:
                    section = parts[0]
                    subkey = '_'.join(parts[1:])

                    if section in self.config:
                        if subkey in self.config[section]:
                            # Try to convert the value to the existing setting's type
                            orig_value = self.config[section][subkey]
                            if isinstance(orig_value, bool):
                                self.config[section][subkey] = value.lower() in ['true', '1', 'yes']
                            elif isinstance(orig_value, int):
                                self.config[section][subkey] = int(value)
                            elif isinstance(orig_value, float):
                                self.config[section][subkey] = float(value)
                            else:
                                self.config[section][subkey] = value

    def get(self, section: str, key: str, default: Any = None) -> Any:
        """Get a configuration value.

        Args:
            section: configuration section
            key: configuration key
            default: value returned when the setting does not exist

        Returns:
            Configuration value
        """
        if section in self.config and key in self.config[section]:
            return self.config[section][key]
        return default

    def set(self, section: str, key: str, value: Any) -> None:
        """Set a configuration value.

        Args:
            section: configuration section
            key: configuration key
            value: configuration value
        """
        if section not in self.config:
            self.config[section] = {}

        self.config[section][key] = value

    def get_path(self, key: str) -> str:
        """Get a path setting.

        Args:
            key: path key name

        Returns:
            Path string

        Raises:
            KeyError: the path key is not configured
        """
        path = self.get("paths", key)
        if not path:
            raise KeyError(f"Path not configured: {key}")
        # Make sure the directory exists
        os.makedirs(path, exist_ok=True)
        return path

    def save_config(self, config_file: Optional[str] = None) -> None:
        """Save the configuration to a file.

        Args:
            config_file: configuration file path; if None, the path given at
                initialization is used
        """
        if config_file is None:
            config_file = self.config_file

        if not config_file:
            raise ValueError("No configuration file path specified")

        try:
            # Make sure the directory exists
            os.makedirs(os.path.dirname(os.path.abspath(config_file)), exist_ok=True)

            ext = os.path.splitext(config_file)[1].lower()

            if ext == '.json':
                with open(config_file, 'w', encoding='utf-8') as f:
                    json.dump(self.config, f, indent=2, ensure_ascii=False)
            elif ext in ['.yaml', '.yml']:
                with open(config_file, 'w', encoding='utf-8') as f:
                    yaml.dump(self.config, f, default_flow_style=False, allow_unicode=True)
            else:
                raise ValueError(f"Unsupported configuration file format: {ext}")

            logging.info(f"Configuration saved to {config_file}")

        except Exception as e:
            logging.error(f"Failed to save configuration file: {e}")
            raise

    def create_required_directories(self) -> None:
        """Create all required directories."""
        for key, path in self.config["paths"].items():
            if isinstance(path, str) and not os.path.exists(path):
                try:
                    os.makedirs(path)
                    logging.info(f"Created directory: {path}")
                except Exception as e:
                    logging.error(f"Failed to create directory {path}: {e}")
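A minimal usage sketch for the module above (assumes a `config.yaml` in the working directory and the project root on `sys.path`):

```python
# Load settings, read a value with a fallback, and resolve a path.
from core.config_manager import ConfigManager

cfg = ConfigManager("config.yaml")
print(cfg.get("analysis", "theta", default=0.1))

repo_path = cfg.get_path("repo_path")  # also creates the directory if missing
print(repo_path)
```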
--------------------------------------------------------------------------------
/core/logger.py:
--------------------------------------------------------------------------------
"""Logging module

This module provides unified logging configuration and management, supporting
file and console logging, as well as log rotation, level control, and
formatting.

Author: Re-Centris team
Version: 1.0.0
License: MIT
"""

import os
import sys
import logging
import datetime
from logging.handlers import RotatingFileHandler
from typing import Optional, Dict, Any, Union


def setup_logger(
    name: str = "re-centris",
    log_file: Optional[str] = None,
    log_level: Union[int, str] = logging.INFO,
    max_size: int = 10 * 1024 * 1024,  # 10MB
    backup_count: int = 5,
    console: bool = True,
    format_str: Optional[str] = None
) -> logging.Logger:
    """Set up a logger.

    Args:
        name: logger name
        log_file: log file path; if None, no file logging is performed
        log_level: log level, as an integer or a string
        max_size: maximum log file size (bytes)
        backup_count: number of log file backups
        console: whether to also log to the console
        format_str: log format string; if None, a default format is used

    Returns:
        The configured logger
    """
    # Convert the log level
    if isinstance(log_level, str):
        log_level = getattr(logging, log_level.upper(), logging.INFO)

    # Create the logger
    logger = logging.getLogger(name)
    logger.setLevel(log_level)

    # Remove existing handlers
    for handler in logger.handlers[:]:
        logger.removeHandler(handler)

    # Set the log format
    if format_str is None:
        format_str = "%(asctime)s - %(levelname)s - %(name)s - %(message)s"
    formatter = logging.Formatter(format_str)

    # Add a file handler
    if log_file:
        # Make sure the log directory exists
        log_dir = os.path.dirname(log_file)
        if log_dir and not os.path.exists(log_dir):
            os.makedirs(log_dir)

        file_handler = RotatingFileHandler(
            log_file,
            maxBytes=max_size,
            backupCount=backup_count,
            encoding="utf-8"
        )
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)

    # Add a console handler
    if console:
        console_handler = logging.StreamHandler()
        console_handler.setFormatter(formatter)
        logger.addHandler(console_handler)

    return logger


def get_module_logger(module_name: str) -> logging.Logger:
    """Get a module logger.

    Args:
        module_name: module name

    Returns:
        Module logger
    """
    return logging.getLogger(f"re-centris.{module_name}")


class LoggerAdapter(logging.LoggerAdapter):
    """Logger adapter used to attach context information."""

    def __init__(self, logger: logging.Logger, extra: Optional[Dict[str, Any]] = None):
        """Initialize the logger adapter.

        Args:
            logger: logger
            extra: extra context information
        """
        super().__init__(logger, extra or {})

    def process(self, msg: str, kwargs: Dict[str, Any]) -> tuple:
        """Process a log message, attaching context information.

        Args:
            msg: log message
            kwargs: keyword arguments

        Returns:
            The processed message and keyword arguments
        """
        context_str = " ".join(f"[{k}={v}]" for k, v in self.extra.items())
        if context_str:
            msg = f"{context_str} {msg}"
        return msg, kwargs


def create_context_logger(
    logger: logging.Logger,
    context: Dict[str, Any]
) -> LoggerAdapter:
    """Create a logger with attached context.

    Args:
        logger: base logger
        context: context information

    Returns:
        Logger adapter with attached context
    """
    return LoggerAdapter(logger, context)
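A minimal usage sketch for the module above (the log file path is an illustrative assumption):

```python
# Configure the root project logger and derive a context-aware one.
from core.logger import setup_logger, create_context_logger

logger = setup_logger("re-centris", log_file="logs/app.log", log_level="DEBUG")
logger.info("logger ready")

ctx_logger = create_context_logger(logger, {"repo": "redis"})
ctx_logger.info("cloning started")  # -> "[repo=redis] cloning started"
```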
--------------------------------------------------------------------------------
/core/memory_optimizer.py:
--------------------------------------------------------------------------------
"""Memory optimizer module

This module provides memory usage optimization, including memory monitoring,
batched data processing, automatic garbage collection, and memory limits.

Author: Re-Centris team
Version: 1.0.0
License: MIT
"""

import os
import gc
import sys
import time
import logging
import threading
from typing import List, Any, Callable, Generator, TypeVar, Generic, Optional

# Module-level logger
logger = logging.getLogger("re-centris.memory_optimizer")

# Generic type variables
T = TypeVar('T')
R = TypeVar('R')


class MemoryOptimizer:
    """Memory optimizer providing memory usage optimization."""

    def __init__(
        self,
        target_memory_usage: float = 0.8,
        initial_batch_size: int = 1000,
        min_batch_size: int = 100,
        max_batch_size: int = 10000,
        check_interval: int = 10
    ):
        """Initialize the memory optimizer.

        Args:
            target_memory_usage: target memory usage ratio (0.0-1.0)
            initial_batch_size: initial batch size
            min_batch_size: minimum batch size
            max_batch_size: maximum batch size
            check_interval: memory check interval (seconds)
        """
        self.target_memory_usage = target_memory_usage
        self.current_batch_size = initial_batch_size
        self.min_batch_size = min_batch_size
        self.max_batch_size = max_batch_size
        self.check_interval = check_interval
        self._lock = threading.Lock()
        self._last_check_time = 0
        self._last_gc_time = 0

    def get_memory_usage(self) -> float:
        """Get the current memory usage ratio.

        Returns:
            Memory usage ratio (0.0-1.0)
        """
        try:
            import psutil
            process = psutil.Process()
            return process.memory_percent() / 100
        except ImportError:
            # Without psutil, fall back to a rough memory estimate
            if hasattr(sys, 'getsizeof'):
                # Estimate the memory used by the Python interpreter
                memory_used = 0
                for obj in gc.get_objects():
                    try:
                        memory_used += sys.getsizeof(obj)
                    except Exception:
                        pass
                # Estimate the total memory
                try:
                    with open('/proc/meminfo', 'r') as f:
                        for line in f:
                            if 'MemTotal' in line:
                                total_memory = int(line.split()[1]) * 1024
                                return memory_used / total_memory
                except Exception:
                    pass
            return 0.5  # Default to a medium memory usage ratio
    def should_gc(self) -> bool:
        """Decide whether garbage collection should run.

        Returns:
            Whether garbage collection should run
        """
        current_time = time.time()

        # Check at most once per check interval
        if current_time - self._last_check_time < self.check_interval:
            return False

        self._last_check_time = current_time

        # Check the memory usage ratio
        memory_usage = self.get_memory_usage()

        # If usage exceeds the target, run garbage collection
        if memory_usage > self.target_memory_usage:
            # Run garbage collection at most once every 30 seconds
            if current_time - self._last_gc_time >= 30:
                self._last_gc_time = current_time
                return True

        return False

    def optimize(self) -> None:
        """Run memory optimization."""
        if self.should_gc():
            logger.debug("Running garbage collection")
            gc.collect()

    def adjust_batch_size(self) -> int:
        """Adjust the batch size based on current memory usage.

        Returns:
            The adjusted batch size
        """
        with self._lock:
            memory_usage = self.get_memory_usage()

            if memory_usage > self.target_memory_usage:
                # Memory usage is too high; shrink the batch size
                self.current_batch_size = max(
                    self.min_batch_size,
                    int(self.current_batch_size * 0.8)
                )
            elif memory_usage < self.target_memory_usage * 0.7:
                # Memory usage is low; grow the batch size
                self.current_batch_size = min(
                    self.max_batch_size,
                    int(self.current_batch_size * 1.2)
                )

            return self.current_batch_size

    def batch_items(self, items: List[T]) -> Generator[List[T], None, None]:
        """Yield the data in batches.

        Args:
            items: list of data items

        Yields:
            Batches of data
        """
        for i in range(0, len(items), self.current_batch_size):
            batch = items[i:i + self.current_batch_size]
            yield batch

            # Optimize memory
            self.optimize()

            # Adjust the batch size
            self.adjust_batch_size()

    def process_in_batches(
        self,
        items: List[T],
        processor: Callable[[List[T]], List[R]]
    ) -> List[R]:
        """Process the data in batches and merge the results.

        Args:
            items: list of data items
            processor: processing function taking one batch and returning results

        Returns:
            The merged results of all batches
        """
        results = []

        for batch in self.batch_items(items):
            batch_results = processor(batch)
            results.extend(batch_results)

        return results

    def monitor_memory(
        self,
        interval: int = 60,
        callback: Optional[Callable[[float], None]] = None
    ) -> threading.Thread:
        """Start a memory monitoring thread.

        Args:
            interval: monitoring interval (seconds)
            callback: callback receiving the current memory usage ratio

        Returns:
            The monitoring thread
        """
        def _monitor():
            while True:
                try:
                    memory_usage = self.get_memory_usage()

                    if callback:
                        callback(memory_usage)
                    else:
                        logger.info(f"Current memory usage: {memory_usage:.2%}")

                    # If memory usage is too high, run garbage collection
                    if memory_usage > self.target_memory_usage:
                        logger.warning(f"Memory usage too high: {memory_usage:.2%}")
                        gc.collect()

                    time.sleep(interval)
                except Exception as e:
                    logger.error(f"Memory monitoring error: {e}")
                    time.sleep(interval)

        thread = threading.Thread(target=_monitor, daemon=True)
        thread.start()
        return thread
class BatchProcessor(Generic[T, R]):
    """Batch processor for efficiently handling large amounts of data."""

    def __init__(
        self,
        processor: Callable[[T], R],
        batch_size: int = 1000,
        memory_optimizer: Optional[MemoryOptimizer] = None
    ):
        """Initialize the batch processor.

        Args:
            processor: processing function taking one item and returning a result
            batch_size: batch size
            memory_optimizer: memory optimizer; a new one is created if None
        """
        self.processor = processor
        self.batch_size = batch_size
        self.memory_optimizer = memory_optimizer or MemoryOptimizer()

    def process(self, items: List[T]) -> List[R]:
        """Process a list of data items.

        Args:
            items: list of data items

        Returns:
            List of results
        """
        results = []

        for batch in self.memory_optimizer.batch_items(items):
            batch_results = [self.processor(item) for item in batch]
            results.extend(batch_results)

            # Optimize memory
            self.memory_optimizer.optimize()

        return results

    def process_generator(self, items_generator: Generator[T, None, None]) -> Generator[R, None, None]:
        """Process a generator of data items.

        Args:
            items_generator: generator of data items

        Yields:
            Results
        """
        batch = []

        for item in items_generator:
            batch.append(item)

            if len(batch) >= self.batch_size:
                for result in self._process_batch(batch):
                    yield result
                batch = []

                # Optimize memory
                self.memory_optimizer.optimize()

        # Process the remaining items
        if batch:
            for result in self._process_batch(batch):
                yield result

    def _process_batch(self, batch: List[T]) -> List[R]:
        """Process a single batch.

        Args:
            batch: batch of data

        Returns:
            Results
        """
        return [self.processor(item) for item in batch]
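A minimal usage sketch for the module above (psutil is optional; without it the fallback estimate is used):

```python
# Process a large list in adaptive batches, then per-item via BatchProcessor.
from core.memory_optimizer import MemoryOptimizer, BatchProcessor

optimizer = MemoryOptimizer(target_memory_usage=0.8, initial_batch_size=500)

items = list(range(10_000))
doubled = optimizer.process_in_batches(items, lambda batch: [x * 2 for x in batch])

processor = BatchProcessor(lambda x: x + 1, batch_size=1000, memory_optimizer=optimizer)
print(len(processor.process(items)))  # 10000
```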
--------------------------------------------------------------------------------
/core/parallel_manager.py:
--------------------------------------------------------------------------------
import os
import logging
import multiprocessing
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
from typing import List, Callable, Any, Dict, Optional, Union
from functools import partial

logger = logging.getLogger(__name__)

class ParallelManager:
    """Parallel processing manager."""

    def __init__(self, max_workers: Optional[int] = None):
        """Initialize the parallel processing manager.

        Args:
            max_workers: maximum number of workers; defaults to the CPU core count
        """
        self.max_workers = max_workers or multiprocessing.cpu_count()
        self._process_pools: Dict[str, ProcessPoolExecutor] = {}
        self._thread_pools: Dict[str, ThreadPoolExecutor] = {}

    def _get_executor(self, pool_name: str, use_threads: bool):
        """Get or create the executor for a pool name."""
        executor_cls = ThreadPoolExecutor if use_threads else ProcessPoolExecutor
        executor_dict = self._thread_pools if use_threads else self._process_pools

        if pool_name not in executor_dict:
            executor_dict[pool_name] = executor_cls(max_workers=self.max_workers)
        return executor_dict[pool_name]

    def process_items(self,
                     items: List[Any],
                     process_func: Callable,
                     pool_name: str = "default",
                     chunk_size: Optional[int] = None,
                     use_threads: bool = False,
                     **kwargs) -> List[Any]:
        """Process a list of items in parallel.

        Args:
            items: items to process
            process_func: processing function (receives one chunk of items)
            pool_name: pool name
            chunk_size: chunk size
            use_threads: whether to use a thread pool
            **kwargs: extra arguments passed to the processing function

        Returns:
            List of results
        """
        if not items:
            return []

        # Determine the chunk size
        if chunk_size is None:
            chunk_size = max(1, len(items) // (self.max_workers * 4))

        # Prepare the tasks
        chunked_items = [items[i:i + chunk_size] for i in range(0, len(items), chunk_size)]
        partial_func = partial(process_func, **kwargs)

        executor = self._get_executor(pool_name, use_threads)

        results = []
        try:
            # Submit the tasks
            futures = [
                executor.submit(partial_func, chunk)
                for chunk in chunked_items
            ]

            # Collect the results
            for future in as_completed(futures):
                try:
                    result = future.result()
                    if isinstance(result, list):
                        results.extend(result)
                    else:
                        results.append(result)
                except Exception as e:
                    logger.error(f"Task failed: {str(e)}")

        except Exception as e:
            logger.error(f"Parallel processing failed: {str(e)}")

        return results

    def process_items_with_progress(self,
                                  items: List[Any],
                                  process_func: Callable,
                                  progress_callback: Callable[[int, int], None],
                                  pool_name: str = "default",
                                  chunk_size: Optional[int] = None,
                                  use_threads: bool = False,
                                  **kwargs) -> List[Any]:
        """Parallel processing with a progress callback.

        The callback is invoked as each chunk completes, not after the whole
        run, so it can drive a live progress display.

        Args:
            items: items to process
            process_func: processing function (receives one chunk of items)
            progress_callback: progress callback, called as (processed, total)
            pool_name: pool name
            chunk_size: chunk size
            use_threads: whether to use a thread pool
            **kwargs: extra arguments passed to the processing function

        Returns:
            List of results
        """
        if not items:
            return []

        if chunk_size is None:
            chunk_size = max(1, len(items) // (self.max_workers * 4))

        chunked_items = [items[i:i + chunk_size] for i in range(0, len(items), chunk_size)]
        partial_func = partial(process_func, **kwargs)

        executor = self._get_executor(pool_name, use_threads)

        total_items = len(items)
        processed_items = 0
        results = []

        futures = [executor.submit(partial_func, chunk) for chunk in chunked_items]

        for future in as_completed(futures):
            try:
                result = future.result()
                if isinstance(result, list):
                    results.extend(result)
                    processed_items += len(result)
                else:
                    results.append(result)
                    processed_items += 1
                progress_callback(processed_items, total_items)
            except Exception as e:
                logger.error(f"Task failed: {str(e)}")

        return results

    def close_pool(self, pool_name: str, use_threads: bool = False):
        """Close the specified process or thread pool.

        Args:
            pool_name: pool name
            use_threads: whether this is a thread pool
        """
        pool_dict = self._thread_pools if use_threads else self._process_pools
        if pool_name in pool_dict:
            pool_dict[pool_name].shutdown()
            del pool_dict[pool_name]

    def close_all(self):
        """Close all process and thread pools."""
        for pool in list(self._process_pools.values()):
            pool.shutdown()
        self._process_pools.clear()

        for pool in list(self._thread_pools.values()):
            pool.shutdown()
        self._thread_pools.clear()
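A minimal usage sketch for the module above. Note that `process_func` receives a *chunk* (a list of items); threads are used here so the worker can be a local function without pickling concerns:

```python
# Square 1000 numbers in parallel chunks.
from core.parallel_manager import ParallelManager

def square_chunk(chunk):
    return [x * x for x in chunk]

manager = ParallelManager(max_workers=4)
try:
    results = manager.process_items(list(range(1000)), square_chunk, use_threads=True)
    print(len(results))  # 1000
finally:
    manager.close_all()
```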
--------------------------------------------------------------------------------
/core/resource_manager.py:
--------------------------------------------------------------------------------
"""Resource manager module

This module provides unified resource management for file handles, process
pools, thread pools, and other resources, making sure they are released after
use and avoiding resource leaks.

Author: Re-Centris team
Version: 1.0.0
License: MIT
"""

import os
import logging
import threading
import multiprocessing
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
from typing import Dict, Any, Optional, Tuple, Union, Set

# Module-level logger
logger = logging.getLogger("re-centris.resource_manager")


class ResourceManager:
    """Resource manager responsible for managing and releasing resources."""

    def __init__(self):
        """Initialize the resource manager."""
        self._file_handles: Dict[Tuple[str, str], Any] = {}
        self._process_pools: Dict[str, ProcessPoolExecutor] = {}
        self._thread_pools: Dict[str, ThreadPoolExecutor] = {}
        self._resources: Dict[str, Any] = {}
        self._lock = threading.Lock()

    def __enter__(self):
        """Context manager entry."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Release all resources on context manager exit."""
        self.close_all()

    def get_file_handle(self, path: str, mode: str = 'r', encoding: Optional[str] = None) -> Any:
        """Get a file handle.

        Args:
            path: file path
            mode: open mode
            encoding: file encoding

        Returns:
            File handle
        """
        with self._lock:
            key = (path, mode)
            if key not in self._file_handles:
                try:
                    if encoding:
                        self._file_handles[key] = open(path, mode, encoding=encoding)
                    else:
                        self._file_handles[key] = open(path, mode)
                except Exception as e:
                    logger.error(f"Failed to open file {path}: {e}")
                    raise
            return self._file_handles[key]

    def close_file(self, path: str, mode: str = 'r') -> None:
        """Close a file handle.

        Args:
            path: file path
            mode: open mode
        """
        with self._lock:
            key = (path, mode)
            if key in self._file_handles:
                try:
                    self._file_handles[key].close()
                except Exception as e:
                    logger.warning(f"Failed to close file {path}: {e}")
                finally:
                    del self._file_handles[key]

    def get_process_pool(self, name: str = "default", max_workers: Optional[int] = None) -> ProcessPoolExecutor:
        """Get a process pool.

        Args:
            name: process pool name
            max_workers: maximum number of worker processes; defaults to the CPU core count

        Returns:
            Process pool
        """
        with self._lock:
            if name not in self._process_pools:
                if max_workers is None:
                    max_workers = multiprocessing.cpu_count()
                self._process_pools[name] = ProcessPoolExecutor(max_workers=max_workers)
            return self._process_pools[name]

    def get_thread_pool(self, name: str = "default", max_workers: Optional[int] = None) -> ThreadPoolExecutor:
        """Get a thread pool.

        Args:
            name: thread pool name
            max_workers: maximum number of worker threads; defaults to 5x the CPU core count

        Returns:
            Thread pool
        """
        with self._lock:
            if name not in self._thread_pools:
                if max_workers is None:
                    max_workers = multiprocessing.cpu_count() * 5
                self._thread_pools[name] = ThreadPoolExecutor(max_workers=max_workers)
            return self._thread_pools[name]

    def register_resource(self, name: str, resource: Any, close_method: str = "close") -> None:
        """Register a custom resource.

        Args:
            name: resource name
            resource: resource object
            close_method: name of the method that closes the resource
        """
        with self._lock:
            if name in self._resources:
                logger.warning(f"Resource {name} already exists and will be overwritten")

            self._resources[name] = (resource, close_method)

    def get_resource(self, name: str) -> Optional[Any]:
"""获取自定义资源 132 | 133 | Args: 134 | name: 资源名称 135 | 136 | Returns: 137 | 资源对象,如果不存在则返回None 138 | """ 139 | with self._lock: 140 | if name in self._resources: 141 | return self._resources[name][0] 142 | return None 143 | 144 | def close_resource(self, name: str) -> bool: 145 | """关闭自定义资源 146 | 147 | Args: 148 | name: 资源名称 149 | 150 | Returns: 151 | 是否成功关闭 152 | """ 153 | with self._lock: 154 | if name in self._resources: 155 | resource, close_method = self._resources[name] 156 | try: 157 | getattr(resource, close_method)() 158 | del self._resources[name] 159 | return True 160 | except Exception as e: 161 | logger.warning(f"关闭资源 {name} 失败: {e}") 162 | return False 163 | 164 | def close_all(self) -> None: 165 | """关闭所有资源""" 166 | with self._lock: 167 | # 关闭文件句柄 168 | for key, handle in list(self._file_handles.items()): 169 | try: 170 | handle.close() 171 | except Exception as e: 172 | logger.warning(f"关闭文件失败 {key[0]}: {e}") 173 | self._file_handles.clear() 174 | 175 | # 关闭进程池 176 | for name, pool in list(self._process_pools.items()): 177 | try: 178 | pool.shutdown() 179 | except Exception as e: 180 | logger.warning(f"关闭进程池 {name} 失败: {e}") 181 | self._process_pools.clear() 182 | 183 | # 关闭线程池 184 | for name, pool in list(self._thread_pools.items()): 185 | try: 186 | pool.shutdown() 187 | except Exception as e: 188 | logger.warning(f"关闭线程池 {name} 失败: {e}") 189 | self._thread_pools.clear() 190 | 191 | # 关闭自定义资源 192 | for name, (resource, close_method) in list(self._resources.items()): 193 | try: 194 | getattr(resource, close_method)() 195 | except Exception as e: 196 | logger.warning(f"关闭资源 {name} 失败: {e}") 197 | self._resources.clear() 198 | 199 | def __del__(self): 200 | """析构时关闭所有资源""" 201 | self.close_all() 202 | 203 | 204 | class SafeFileHandler: 205 | """安全的文件处理器,自动处理文件打开和关闭""" 206 | 207 | def __init__(self, path: str, mode: str = 'r', encoding: Optional[str] = None): 208 | """初始化安全文件处理器 209 | 210 | Args: 211 | path: 文件路径 212 | mode: 打开模式 213 | encoding: 文件编码 214 | """ 215 | self.path = path 216 | self.mode = mode 217 | self.encoding = encoding 218 | self.file = None 219 | 220 | def __enter__(self): 221 | """上下文管理器入口""" 222 | try: 223 | if self.encoding: 224 | self.file = open(self.path, self.mode, encoding=self.encoding) 225 | else: 226 | self.file = open(self.path, self.mode) 227 | return self.file 228 | except Exception as e: 229 | logger.error(f"打开文件失败 {self.path}: {e}") 230 | raise 231 | 232 | def __exit__(self, exc_type, exc_val, exc_tb): 233 | """上下文管理器退出时关闭文件""" 234 | if self.file: 235 | try: 236 | self.file.close() 237 | except Exception as e: 238 | logger.warning(f"关闭文件失败 {self.path}: {e}") 239 | 240 | 241 | def safe_open(path: str, mode: str = 'r', encoding: Optional[str] = None) -> SafeFileHandler: 242 | """安全打开文件 243 | 244 | Args: 245 | path: 文件路径 246 | mode: 打开模式 247 | encoding: 文件编码 248 | 249 | Returns: 250 | 安全文件处理器 251 | """ 252 | return SafeFileHandler(path, mode, encoding) -------------------------------------------------------------------------------- /detector/__init__.py: -------------------------------------------------------------------------------- 1 | """Re-Centris 检测器包 - 基于TLSH的代码克隆和依赖关系检测工具。 2 | 3 | 主要功能: 4 | 1. 代码克隆检测 - 使用TLSH算法检测代码克隆 5 | 2. 依赖关系分析 - 分析组件间的依赖关系 6 | 3. 
--------------------------------------------------------------------------------
/detector/__init__.py:
--------------------------------------------------------------------------------
"""Re-Centris detector package - TLSH-based code clone and dependency detection tool.

Main features:
1. Code clone detection - detect code clones using the TLSH algorithm
2. Dependency analysis - analyze dependencies between components
3. Version prediction - predict the component versions in use

Author: byRen2002
Modified: March 2025
License: MIT License
"""

from .Detector import Detector

__all__ = ['Detector']
--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
version: '3.8'

services:
  web:
    build: .
    ports:
      - "8000:8000"
    volumes:
      - .:/app
    environment:
      - FLASK_ENV=production
      - FLASK_APP=app.py
    depends_on:
      - redis
      - postgres
    networks:
      - re-centris-network
    deploy:
      replicas: 2
      restart_policy:
        condition: on-failure
        max_attempts: 3
      resources:
        limits:
          cpus: '1'
          memory: 1G
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 40s

  redis:
    image: redis:alpine
    ports:
      - "6379:6379"
    volumes:
      - redis-data:/data
    networks:
      - re-centris-network
    deploy:
      resources:
        limits:
          cpus: '0.5'
          memory: 500M

  postgres:
    image: postgres:13-alpine
    ports:
      - "5432:5432"
    volumes:
      - postgres-data:/var/lib/postgresql/data
    environment:
      - POSTGRES_DB=re_centris
      - POSTGRES_USER=re_centris
      - POSTGRES_PASSWORD=${DB_PASSWORD}
    networks:
      - re-centris-network
    deploy:
      resources:
        limits:
          cpus: '1'
          memory: 1G

  prometheus:
    image: prom/prometheus
    ports:
      - "9090:9090"
    volumes:
      - ./prometheus.yml:/etc/prometheus/prometheus.yml
      - prometheus-data:/prometheus
    networks:
      - re-centris-network
    deploy:
      resources:
        limits:
          cpus: '0.5'
          memory: 500M

  grafana:
    image: grafana/grafana
    ports:
      - "3000:3000"
    volumes:
      - grafana-data:/var/lib/grafana
    depends_on:
      - prometheus
    networks:
      - re-centris-network
    deploy:
      resources:
        limits:
          cpus: '0.5'
          memory: 500M

volumes:
  redis-data:
  postgres-data:
  prometheus-data:
  grafana-data:

networks:
  re-centris-network:
    driver: bridge
--------------------------------------------------------------------------------
/osscollector/sample:
--------------------------------------------------------------------------------
git clone https://github.com/redis/redis.git
git clone https://github.com/torvalds/linux.git
git clone https://github.com/git/git.git
git clone https://github.com/openssl/openssl.git
git clone https://github.com/tensorflow/tensorflow.git
git clone https://github.com/electron/electron.git
git clone https://github.com/microsoft/terminal.git
git clone https://github.com/apple/swift.git
git clone https://github.com/opencv/opencv.git
git clone https://github.com/bitcoin/bitcoin.git
--------------------------------------------------------------------------------
/preprocessor/__init__.py:
--------------------------------------------------------------------------------
"""Re-Centris preprocessor package

This package provides functionality for preprocessing the function information
collected from open-source code repositories.

Author: byRen2002
Modified: March 2025
License: MIT
"""

from .preprocessor import Preprocessor

__all__ = ['Preprocessor']
--------------------------------------------------------------------------------
/preprocessor/language_processors/java_processor.py:
--------------------------------------------------------------------------------
"""Java language processor

This module implements parsing and processing of Java code.

Author: byRen2002
Modified: March 2025
License: MIT License
"""

import os
import re
import javalang
from typing import Dict, List, Tuple, Optional, Any
import logging

class JavaProcessor:
    """Java code processor."""

    def __init__(self):
        """Initialize the Java processor."""
        self.method_pattern = re.compile(
            r'(?:public|private|protected|static|\s) +[\w\<\>\[\]]+\s+(\w+) *\([^\)]*\) *\{?[^\{]*$'
        )

    def extract_methods(self, file_path: str) -> List[Dict[str, str]]:
        """Extract the methods from a Java file.

        Args:
            file_path: Java file path

        Returns:
            List of methods; each entry contains the name, content, start line, etc.
        """
        methods = []
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()

            # Parse the Java code with javalang
            tree = javalang.parse.parse(content)

            for _, node in tree.filter(javalang.tree.MethodDeclaration):
                method = {
                    'name': node.name,
                    'content': self._get_method_content(content, node),
                    'start_line': node.position.line if node.position else 0,
                    'modifiers': [str(mod) for mod in node.modifiers],
                    'return_type': self._get_return_type(node),
                    'parameters': self._get_parameters(node)
                }
                methods.append(method)

        except Exception as e:
            logging.error(f"Error while processing Java file {file_path}: {e}")

        return methods

    def _get_method_content(self, content: str, node: javalang.tree.MethodDeclaration) -> str:
        """Get the full content of a method."""
        try:
            lines = content.splitlines()
            start_line = node.position.line - 1

            # Find the end of the method body
            end_line = start_line
            brace_count = 0
            found_first_brace = False

            for i, line in enumerate(lines[start_line:], start_line):
                if '{' in line:
                    brace_count += line.count('{')
                    found_first_brace = True
                if '}' in line:
                    brace_count -= line.count('}')

                if found_first_brace and brace_count == 0:
                    end_line = i
                    break

            return '\n'.join(lines[start_line:end_line + 1])

        except Exception as e:
            logging.error(f"Error while extracting method content: {e}")
            return ""

    def _get_return_type(self, node: javalang.tree.MethodDeclaration) -> str:
        """Get the return type of a method."""
        try:
            return str(node.return_type.name) if node.return_type else "void"
        except Exception:
            return "void"

    def _get_parameters(self, node: javalang.tree.MethodDeclaration) -> List[Dict[str, str]]:
        """Get the parameter list of a method."""
        params = []
        try:
            for param in node.parameters:
                params.append({
                    'name': param.name,
                    'type': str(param.type.name)
                })
        except Exception:
            pass
        return params
'{' in line: 140 | current_depth += 1 141 | max_depth = max(max_depth, current_depth) 142 | if '}' in line: 143 | current_depth -= 1 144 | 145 | metrics['nesting_depth'] = max_depth 146 | 147 | # 计算认知复杂度 148 | metrics['cognitive_complexity'] = ( 149 | metrics['cyclomatic_complexity'] + 150 | metrics['nesting_depth'] 151 | ) 152 | 153 | except Exception as e: 154 | logging.error(f"分析方法复杂度时出错: {e}") 155 | 156 | return metrics 157 | 158 | def extract_class_info(self, file_path: str) -> Dict[str, any]: 159 | """提取类信息 160 | 161 | 参数: 162 | file_path: Java文件路径 163 | 164 | 返回: 165 | 包含类名、包名、导入等信息的字典 166 | """ 167 | class_info = { 168 | 'name': '', 169 | 'package': '', 170 | 'imports': [], 171 | 'extends': None, 172 | 'implements': [], 173 | 'modifiers': [] 174 | } 175 | 176 | try: 177 | with open(file_path, 'r', encoding='utf-8') as f: 178 | content = f.read() 179 | 180 | tree = javalang.parse.parse(content) 181 | 182 | # 获取包名 183 | if tree.package: 184 | class_info['package'] = str(tree.package.name) 185 | 186 | # 获取导入 187 | class_info['imports'] = [ 188 | str(imp.path) for imp in tree.imports 189 | ] 190 | 191 | # 获取类信息 192 | for path, node in tree.filter(javalang.tree.ClassDeclaration): 193 | class_info['name'] = node.name 194 | class_info['modifiers'] = [str(mod) for mod in node.modifiers] 195 | 196 | if node.extends: 197 | class_info['extends'] = str(node.extends.name) 198 | 199 | if node.implements: 200 | class_info['implements'] = [ 201 | str(impl.name) for impl in node.implements 202 | ] 203 | break # 只处理第一个类 204 | 205 | except Exception as e: 206 | logging.error(f"提取类信息时出错: {e}") 207 | 208 | return class_info 209 | 210 | def get_method_signature(self, method: Dict[str, str]) -> str: 211 | """生成方法签名 212 | 213 | 参数: 214 | method: 方法信息字典 215 | 216 | 返回: 217 | 标准化的方法签名 218 | """ 219 | try: 220 | modifiers = ' '.join(method.get('modifiers', [])) 221 | return_type = method.get('return_type', 'void') 222 | name = method.get('name', '') 223 | 224 | params = [] 225 | for param in method.get('parameters', []): 226 | params.append(f"{param['type']} {param['name']}") 227 | 228 | signature = f"{modifiers} {return_type} {name}({', '.join(params)})" 229 | return signature.strip() 230 | 231 | except Exception as e: 232 | logging.error(f"生成方法签名时出错: {e}") 233 | return "" 234 | 235 | def normalize_code(self, code: str) -> str: 236 | """规范化代码 237 | 238 | 参数: 239 | code: 源代码 240 | 241 | 返回: 242 | 规范化后的代码 243 | """ 244 | try: 245 | # 移除注释 246 | code = re.sub(r'//.*?\n|/\*.*?\*/', '', code, flags=re.DOTALL) 247 | 248 | # 移除空行 249 | code = '\n'.join( 250 | line for line in code.splitlines() 251 | if line.strip() 252 | ) 253 | 254 | # 规范化空白字符 255 | code = re.sub(r'\s+', ' ', code) 256 | 257 | # 规范化字符串字面量 258 | code = re.sub(r'"[^"]*"', '""', code) 259 | 260 | return code.strip() 261 | 262 | except Exception as e: 263 | logging.error(f"规范化代码时出错: {e}") 264 | return code -------------------------------------------------------------------------------- /prometheus.yml: -------------------------------------------------------------------------------- 1 | global: 2 | scrape_interval: 15s 3 | evaluation_interval: 15s 4 | 5 | alerting: 6 | alertmanagers: 7 | - static_configs: 8 | - targets: 9 | # - alertmanager:9093 10 | 11 | rule_files: 12 | # - "first_rules.yml" 13 | # - "second_rules.yml" 14 | 15 | scrape_configs: 16 | - job_name: 're-centris' 17 | static_configs: 18 | - targets: ['web:8000'] 19 | metrics_path: '/metrics' 20 | scrape_interval: 5s 21 | 22 | - job_name: 'prometheus' 23 | static_configs: 24 | - targets: 
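The complexity metrics above are simple keyword-counting heuristics: cyclomatic complexity starts at 1 and grows by one per branch keyword or short-circuit operator, and cognitive complexity is approximated as cyclomatic complexity plus the maximum brace-nesting depth. A minimal Go sketch of the same counting rule (illustrative helper, not part of the repository):

    package main

    import (
        "fmt"
        "strings"
    )

    // cyclomaticEstimate mirrors JavaProcessor.analyze_complexity: base
    // complexity 1, plus one per branch keyword or short-circuit operator.
    func cyclomaticEstimate(method string) int {
        complexity := 1
        for _, kw := range []string{"if ", "while ", "for ", "case ", "catch ", "&&", "||"} {
            complexity += strings.Count(method, kw)
        }
        return complexity
    }

    func main() {
        body := "if (a && b) { for (int i = 0; i < n; i++) { sum += i; } }"
        fmt.Println(cyclomaticEstimate(body)) // 1 + if + && + for = 4
    }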
/prometheus.yml: -------------------------------------------------------------------------------- 1 | global: 2 | scrape_interval: 15s 3 | evaluation_interval: 15s 4 | 5 | alerting: 6 | alertmanagers: 7 | - static_configs: 8 | - targets: 9 | # - alertmanager:9093 10 | 11 | rule_files: 12 | # - "first_rules.yml" 13 | # - "second_rules.yml" 14 | 15 | scrape_configs: 16 | - job_name: 're-centris' 17 | static_configs: 18 | - targets: ['web:8000'] 19 | metrics_path: '/metrics' 20 | scrape_interval: 5s 21 | 22 | - job_name: 'prometheus' 23 | static_configs: 24 | - targets: ['localhost:9090'] 25 | 26 | - job_name: 'redis' 27 | static_configs: 28 | - targets: ['redis:6379'] 29 | 30 | - job_name: 'postgres' 31 | static_configs: 32 | - targets: ['postgres:5432'] -------------------------------------------------------------------------------- /re-centris-go/cmd/re-centris/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "log" 5 | 6 | "github.com/re-centris/re-centris-go/internal/cmd" 7 | ) 8 | 9 | func main() { 10 | if err := cmd.Execute(); err != nil { 11 | log.Fatal(err) 12 | } 13 | } -------------------------------------------------------------------------------- /re-centris-go/config.yaml: -------------------------------------------------------------------------------- 1 | # Re-Centris Configuration 2 | 3 | # Path configurations 4 | paths: 5 | repo_path: "./repos" 6 | tag_date_path: "./data/repo_date" 7 | result_path: "./data/repo_functions" 8 | 9 | # Performance settings 10 | performance: 11 | max_workers: 0 # 0 means use number of CPU cores 12 | cache_size: 1000 13 | memory_limit: 0.8 # Maximum memory usage (80%) 14 | 15 | # Language settings 16 | languages: 17 | cpp: 18 | enabled: true 19 | extensions: 20 | - ".c" 21 | - ".cc" 22 | - ".cpp" 23 | - ".cxx" 24 | - ".h" 25 | - ".hpp" 26 | java: 27 | enabled: false 28 | extensions: 29 | - ".java" 30 | python: 31 | enabled: false 32 | extensions: 33 | - ".py" 34 | 35 | # Clone settings 36 | clone: 37 | output: "./repos" 38 | workers: 5 39 | 40 | # Analysis settings 41 | analyze: 42 | output: "./analysis" 43 | workers: 5 44 | 45 | # Detection settings 46 | detect: 47 | known_files: "./known-files" 48 | output: "detection-results.json" 49 | workers: 5 50 | threshold: 0.8 # Similarity threshold (0.0-1.0) -------------------------------------------------------------------------------- /re-centris-go/go.mod: -------------------------------------------------------------------------------- 1 | module github.com/re-centris/re-centris-go 2 | 3 | go 1.21 4 | 5 | require ( 6 | github.com/spf13/cobra v1.8.0 7 | github.com/spf13/viper v1.18.2 8 | go.uber.org/zap v1.27.0 9 | golang.org/x/sync v0.6.0 10 | gopkg.in/yaml.v3 v3.0.1 11 | ) -------------------------------------------------------------------------------- /re-centris-go/internal/analyzer/analyzer.go: -------------------------------------------------------------------------------- 1 | package analyzer 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "io" 7 | "os" 8 | "path/filepath" 9 | "strings" 10 | "sync" 11 | 12 | "github.com/re-centris/re-centris-go/internal/analyzer/tlsh" 13 | "github.com/re-centris/re-centris-go/internal/common/logger" 14 | "go.uber.org/zap" 15 | "golang.org/x/sync/errgroup" 16 | ) 17 | 18 | // FileInfo represents information about an analyzed file 19 | type FileInfo struct { 20 | Path string 21 | Language string 22 | Hash *tlsh.TLSH 23 | Size int64 24 | } 25 | 26 | // AnalyzerOptions contains options for the analyzer 27 | type AnalyzerOptions struct { 28 | MaxWorkers int 29 | Languages map[string][]string // map of language to file extensions 30 | } 31 | 32 | // Analyzer handles code analysis 33 | type Analyzer struct { 34 | opts AnalyzerOptions 35 | } 36 | 37 | // New creates a new Analyzer 38 | func New(opts AnalyzerOptions) *Analyzer { 39 | return &Analyzer{opts: opts} 40 | } 41 | 42 | // AnalyzeFile analyzes a single file and returns its FileInfo 43 | func (a *Analyzer) AnalyzeFile(ctx context.Context, path string) (*FileInfo, error) { 44 | // Get file extension 45 | ext := 
strings.ToLower(filepath.Ext(path)) 46 | 47 | // Find language for this extension 48 | var language string 49 | for lang, exts := range a.opts.Languages { 50 | for _, e := range exts { 51 | if e == ext { 52 | language = lang 53 | break 54 | } 55 | } 56 | if language != "" { 57 | break 58 | } 59 | } 60 | 61 | if language == "" { 62 | return nil, fmt.Errorf("unsupported file extension: %s", ext) 63 | } 64 | 65 | // Open and read file 66 | file, err := os.Open(path) 67 | if err != nil { 68 | return nil, fmt.Errorf("failed to open file: %v", err) 69 | } 70 | defer file.Close() 71 | 72 | // Get file size 73 | stat, err := file.Stat() 74 | if err != nil { 75 | return nil, fmt.Errorf("failed to get file stats: %v", err) 76 | } 77 | 78 | // Read file content 79 | content, err := io.ReadAll(file) 80 | if err != nil { 81 | return nil, fmt.Errorf("failed to read file: %v", err) 82 | } 83 | 84 | // Calculate TLSH hash 85 | hash, err := tlsh.New(content) 86 | if err != nil { 87 | return nil, fmt.Errorf("failed to calculate TLSH hash: %v", err) 88 | } 89 | 90 | return &FileInfo{ 91 | Path: path, 92 | Language: language, 93 | Hash: hash, 94 | Size: stat.Size(), 95 | }, nil 96 | } 97 | 98 | // AnalyzeDirectory analyzes all files in a directory and its subdirectories 99 | func (a *Analyzer) AnalyzeDirectory(ctx context.Context, dir string) ([]*FileInfo, error) { 100 | var ( 101 | files []*FileInfo 102 | filesMux sync.Mutex 103 | ) 104 | 105 | // Create error group with context and worker limit 106 | g, ctx := errgroup.WithContext(ctx) 107 | g.SetLimit(a.opts.MaxWorkers) 108 | 109 | // Walk through directory 110 | err := filepath.Walk(dir, func(path string, info os.FileInfo, err error) error { 111 | if err != nil { 112 | return err 113 | } 114 | 115 | // Skip directories 116 | if info.IsDir() { 117 | return nil 118 | } 119 | 120 | // Check if context is cancelled 121 | select { 122 | case <-ctx.Done(): 123 | return ctx.Err() 124 | default: 125 | } 126 | 127 | // Process file in goroutine 128 | g.Go(func() error { 129 | fileInfo, err := a.AnalyzeFile(ctx, path) 130 | if err != nil { 131 | if err == tlsh.ErrDataTooSmall { 132 | // Skip files that are too small 133 | return nil 134 | } 135 | logger.Error("Failed to analyze file", 136 | zap.String("path", path), 137 | zap.Error(err)) 138 | return err 139 | } 140 | 141 | // Add file info to results 142 | filesMux.Lock() 143 | files = append(files, fileInfo) 144 | filesMux.Unlock() 145 | 146 | return nil 147 | }) 148 | 149 | return nil 150 | }) 151 | 152 | if err != nil { 153 | return nil, fmt.Errorf("failed to walk directory: %v", err) 154 | } 155 | 156 | // Wait for all goroutines to complete 157 | if err := g.Wait(); err != nil { 158 | return nil, fmt.Errorf("error while analyzing files: %v", err) 159 | } 160 | 161 | return files, nil 162 | } 163 | 164 | // FindSimilarFiles finds files similar to the target file 165 | func (a *Analyzer) FindSimilarFiles(target *FileInfo, candidates []*FileInfo, threshold int) []*FileInfo { 166 | var similar []*FileInfo 167 | 168 | for _, candidate := range candidates { 169 | // Skip same file 170 | if target.Path == candidate.Path { 171 | continue 172 | } 173 | 174 | // Skip files with different languages 175 | if target.Language != candidate.Language { 176 | continue 177 | } 178 | 179 | // Calculate distance 180 | distance := target.Hash.Distance(candidate.Hash) 181 | if distance <= threshold { 182 | similar = append(similar, candidate) 183 | } 184 | } 185 | 186 | return similar 187 | } 
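analyzer.go wires these pieces into the basic pipeline: AnalyzeDirectory hashes every supported file under a directory, and FindSimilarFiles ranks candidates against a target by TLSH distance. A minimal sketch of driving it (illustrative only: the directory path, worker count, and distance cutoff of 30 are assumptions, not repository defaults):

    package main

    import (
        "context"
        "fmt"

        "github.com/re-centris/re-centris-go/internal/analyzer"
        "github.com/re-centris/re-centris-go/internal/common/logger"
        "go.uber.org/zap"
    )

    func main() {
        logger.Init(false) // the analyzer logs through the shared zap logger

        a := analyzer.New(analyzer.AnalyzerOptions{
            MaxWorkers: 4,
            Languages:  map[string][]string{"cpp": {".c", ".cc", ".cpp", ".cxx", ".h", ".hpp"}},
        })

        files, err := a.AnalyzeDirectory(context.Background(), "./repos")
        if err != nil {
            logger.Fatal("analysis failed", zap.Error(err))
        }

        if len(files) > 1 {
            // Use the first file as the target; 30 is an illustrative TLSH
            // distance cutoff (lower distance means more similar).
            for _, match := range a.FindSimilarFiles(files[0], files[1:], 30) {
                fmt.Println(match.Path)
            }
        }
    }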
-------------------------------------------------------------------------------- /re-centris-go/internal/analyzer/parser/cpp/parser.go: -------------------------------------------------------------------------------- 1 | package cpp 2 | 3 | import ( 4 | "bufio" 5 | "fmt" 6 | "io" 7 | "regexp" 8 | "strings" 9 | 10 | "github.com/re-centris/re-centris-go/internal/analyzer/parser" 11 | "github.com/re-centris/re-centris-go/internal/analyzer/tlsh" 12 | ) 13 | 14 | var ( 15 | // Function declaration pattern 16 | funcPattern = regexp.MustCompile(`^[\s]*(?:virtual\s+)?(?:static\s+)?(?:inline\s+)?(?:explicit\s+)?(?:[\w:]+[\s*&]+)?[\w:~]+[\s*&]*\s*[\w:]+\s*\([^)]*\)\s*(?:const\s*)?(?:noexcept\s*)?(?:override\s*)?(?:final\s*)?(?:=\s*0\s*)?(?:=\s*default\s*)?(?:=\s*delete\s*)?(?:\s*{\s*)?$`) 17 | 18 | // Class declaration pattern 19 | classPattern = regexp.MustCompile(`^[\s]*(?:class|struct)\s+\w+(?:\s*:\s*(?:public|protected|private)\s+\w+(?:\s*,\s*(?:public|protected|private)\s+\w+)*)?(?:\s*{\s*)?$`) 20 | ) 21 | 22 | // CPPParser implements the Parser interface for C/C++ 23 | type CPPParser struct{} 24 | 25 | // New creates a new C/C++ parser 26 | func New() *CPPParser { 27 | return &CPPParser{} 28 | } 29 | 30 | // GetLanguage returns the language name 31 | func (p *CPPParser) GetLanguage() string { 32 | return "cpp" 33 | } 34 | 35 | // GetExtensions returns supported file extensions 36 | func (p *CPPParser) GetExtensions() []string { 37 | return []string{".c", ".cc", ".cpp", ".cxx", ".h", ".hpp"} 38 | } 39 | 40 | // Parse parses C/C++ source code and extracts functions 41 | func (p *CPPParser) Parse(reader io.Reader) ([]parser.Function, error) { 42 | var ( 43 | functions []parser.Function 44 | scanner = bufio.NewScanner(reader) 45 | lineNum = 0 46 | inFunc = false 47 | inClass = false 48 | curFunc parser.Function 49 | content strings.Builder 50 | ) 51 | 52 | // Running count of open braces 53 | braceCount := 0 54 | 55 | for scanner.Scan() { 56 | lineNum++ 57 | line := scanner.Text() 58 | trimmedLine := strings.TrimSpace(line) 59 | 60 | // Skip empty lines and comments 61 | if trimmedLine == "" || strings.HasPrefix(trimmedLine, "//") { 62 | continue 63 | } 64 | 65 | // Handle multi-line comments 66 | if strings.HasPrefix(trimmedLine, "/*") { 67 | for scanner.Scan() { 68 | lineNum++ 69 | if strings.Contains(scanner.Text(), "*/") { 70 | break 71 | } 72 | } 73 | continue 74 | } 75 | 76 | // Track braces 77 | braceCount += strings.Count(line, "{") - strings.Count(line, "}") 78 | 79 | // Check for class/struct declarations 80 | if classPattern.MatchString(line) { 81 | inClass = true 82 | continue 83 | } 84 | 85 | // Check for function declarations 86 | if !inFunc && funcPattern.MatchString(line) { 87 | inFunc = true 88 | curFunc = parser.Function{ 89 | Name: extractFunctionName(line), 90 | StartLine: lineNum, 91 | Content: line + "\n", 92 | } 93 | continue 94 | } 95 | 96 | // Inside function 97 | if inFunc { 98 | content.WriteString(line) 99 | content.WriteString("\n") 100 | 101 | // Function ends when braces are balanced 102 | if braceCount == 0 { 103 | curFunc.EndLine = lineNum 104 | curFunc.Content += content.String() // append the body to the declaration line 105 | 106 | // Calculate hash 107 | hash, err := tlsh.New([]byte(curFunc.Content)) 108 | if err == nil { 109 | curFunc.Hash = hash.String() 110 | } 111 | 112 | functions = append(functions, curFunc) 113 | inFunc = false 114 | content.Reset() 115 | } 116 | } 117 | 118 | // Reset class state when closing brace is found 119 | if inClass && braceCount == 0 { 120 | inClass = false 121 | } 122 | } 123 | 124 | if err := scanner.Err(); err != nil { 125 | return nil, fmt.Errorf("error scanning C/C++ code: %v", err) 126 | } 127 | 128 | return functions, nil 129 | } 130 | 131 | // extractFunctionName extracts the function name from the declaration 132 | func extractFunctionName(line string) string { 133 | // Remove return type and parameters 134 | line = strings.TrimSpace(line) 135 | if idx := strings.Index(line, "("); idx > 0 { 136 | line = strings.TrimSpace(line[:idx]) 137 | } 138 | 139 | // Get the last word before parameters 140 | parts := strings.Fields(line) 141 | if len(parts) > 0 { 142 | return parts[len(parts)-1] 143 | } 144 | 145 | return "" 146 | } --------------------------------------------------------------------------------
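The parser operates on any io.Reader and returns one parser.Function per extracted function, carrying its name, line range, and a TLSH hash of its text. A short usage sketch (illustrative, not part of the repository):

    package main

    import (
        "fmt"
        "strings"

        "github.com/re-centris/re-centris-go/internal/analyzer/parser/cpp"
    )

    func main() {
        src := "int add(int a, int b) {\n    return a + b;\n}\n"
        functions, err := cpp.New().Parse(strings.NewReader(src))
        if err != nil {
            panic(err)
        }
        for _, fn := range functions {
            // Hash may be empty here: TLSH needs at least 50 bytes of input.
            fmt.Printf("%s lines %d-%d hash=%q\n", fn.Name, fn.StartLine, fn.EndLine, fn.Hash)
        }
    }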
/re-centris-go/internal/analyzer/parser/cpp/parser_test.go: -------------------------------------------------------------------------------- 1 | package cpp 2 | 3 | import ( 4 | "strings" 5 | "testing" 6 | ) 7 | 8 | func TestCPPParser_GetLanguage(t *testing.T) { 9 | parser := New() 10 | if lang := parser.GetLanguage(); lang != "cpp" { 11 | t.Errorf("GetLanguage() = %v, want cpp", lang) 12 | } 13 | } 14 | 15 | func TestCPPParser_GetExtensions(t *testing.T) { 16 | parser := New() 17 | exts := parser.GetExtensions() 18 | expected := []string{".c", ".cc", ".cpp", ".cxx", ".h", ".hpp"} 19 | 20 | if len(exts) != len(expected) { 21 | t.Errorf("GetExtensions() returned %d extensions, want %d", len(exts), len(expected)) 22 | } 23 | 24 | for i, ext := range expected { 25 | if exts[i] != ext { 26 | t.Errorf("GetExtensions()[%d] = %v, want %v", i, exts[i], ext) 27 | } 28 | } 29 | } 30 | 31 | func TestCPPParser_Parse(t *testing.T) { 32 | tests := []struct { 33 | name string 34 | code string 35 | wantFunctions int 36 | wantNames []string 37 | }{ 38 | { 39 | name: "simple function", 40 | code: ` 41 | int add(int a, int b) { 42 | return a + b; 43 | } 44 | `, 45 | wantFunctions: 1, 46 | wantNames: []string{"add"}, 47 | }, 48 | { 49 | name: "class method", 50 | code: ` 51 | class Calculator { 52 | public: 53 | int add(int a, int b) { 54 | return a + b; 55 | } 56 | virtual void process() = 0; 57 | }; 58 | `, 59 | wantFunctions: 2, 60 | wantNames: []string{"add", "process"}, 61 | }, 62 | { 63 | name: "multiple functions", 64 | code: ` 65 | void init() {} 66 | int calculate(double x) { 67 | return static_cast<int>(x); 68 | } 69 | namespace test { 70 | void helper() {} 71 | } 72 | `, 73 | wantFunctions: 3, 74 | wantNames: []string{"init", "calculate", "helper"}, 75 | }, 76 | { 77 | name: "complex function", 78 | code: ` 79 | template <typename T> 80 | static inline T* createObject(const std::string& name) noexcept { 81 | return new T(name); 82 | } 83 | `, 84 | wantFunctions: 1, 85 | wantNames: []string{"createObject"}, 86 | }, 87 | } 88 | 89 | parser := New() 90 | for _, tt := range tests { 91 | t.Run(tt.name, func(t *testing.T) { 92 | reader := strings.NewReader(tt.code) 93 | functions, err := parser.Parse(reader) 94 | 95 | if err != nil { 96 | t.Errorf("Parse() error = %v", err) 97 | return 98 | } 99 | 100 | if len(functions) != tt.wantFunctions { 101 | t.Errorf("Parse() got %v functions, want %v", len(functions), tt.wantFunctions) 102 | return 103 | } 104 | 105 | for i, wantName := range tt.wantNames { 106 | if i >= len(functions) { 107 | t.Errorf("Missing function %v", wantName) 108 | continue 109 | } 110 | if functions[i].Name != wantName { 111 | t.Errorf("Function[%d].Name = %v, want %v", i, functions[i].Name, wantName) 112 | } 113 | if functions[i].Hash == "" { 114 | t.Errorf("Function[%d].Hash is empty", i) 115 | } 116 | } 117 | }) 118 | } 119 | } 120 | 121 | func TestCPPParser_ParseEdgeCases(t *testing.T) { 122 | tests := []struct { 123 | name string 124 | code string 125 | wantErr bool 126 | }{ 127 | { 128 | name: "empty code", 129 | code: "", 130 | wantErr: false, 131 | }, 132 | { 133 | name: "only comments", 134 | code: ` 135 | // This is a comment 136 | /* This is a 137 | multi-line comment */ 138 | `, 139 | wantErr: false, 140 | }, 141 | { 142 | name: "incomplete function", 143 | code: ` 144 | int add(int a, int b) { 145 | return a + b; 146 | // missing closing brace 147 | `, 148 | wantErr: false, // parser should handle this gracefully 149 | }, 150 | { 151 | name: "nested functions", 152 | code: ` 153 | void outer() { 154 | void inner() { 155 | // nested function (invalid in C++) 156 | } 157 | } 158 | `, 159 | wantErr: false, 160 | }, 161 | } 162 | 163 | parser := New() 164 | for _, tt := range tests { 165 | t.Run(tt.name, func(t *testing.T) { 166 | reader := strings.NewReader(tt.code) 167 | _, err := parser.Parse(reader) 168 | if (err != nil) != tt.wantErr { 169 | t.Errorf("Parse() error = %v, wantErr %v", err, tt.wantErr) 170 | } 171 | }) 172 | } 173 | } 174 | 175 | func BenchmarkCPPParser_Parse(b *testing.B) { 176 | code := ` 177 | class Example { 178 | public: 179 | void method1() { } 180 | int method2(int x) { return x * 2; } 181 | virtual void method3() = 0; 182 | }; 183 | 184 | namespace test { 185 | void function1() { 186 | // some code 187 | } 188 | 189 | int function2(double x) { 190 | return static_cast<int>(x); 191 | } 192 | } 193 | ` 194 | 195 | parser := New() 196 | b.ResetTimer() 197 | 198 | for i := 0; i < b.N; i++ { 199 | reader := strings.NewReader(code) 200 | _, _ = parser.Parse(reader) 201 | } 202 | } -------------------------------------------------------------------------------- /re-centris-go/internal/analyzer/parser/parser.go: -------------------------------------------------------------------------------- 1 | package parser 2 | 3 | import ( 4 | "io" 5 | ) 6 | 7 | // Function represents a parsed function 8 | type Function struct { 9 | Name string 10 | StartLine int 11 | EndLine int 12 | Content string 13 | Hash string 14 | } 15 | 16 | // Parser defines the interface for language-specific parsers 17 | type Parser interface { 18 | // Parse parses the source code and returns extracted functions 19 | Parse(reader io.Reader) ([]Function, error) 20 | 21 | // GetLanguage returns the language name 22 | GetLanguage() string 23 | 24 | // GetExtensions returns supported file extensions 25 | GetExtensions() []string 26 | } 27 | 28 | // Registry maintains a map of language parsers 29 | type Registry struct { 30 | parsers map[string]Parser 31 | } 32 | 33 | // NewRegistry creates a new parser registry 34 | func NewRegistry() *Registry { 35 | return &Registry{ 36 | parsers: make(map[string]Parser), 37 | } 38 | } 39 | 40 | // Register registers a parser for a language 41 | func (r *Registry) Register(parser Parser) { 42 | r.parsers[parser.GetLanguage()] = parser 43 | } 44 | 45 | // Get returns a parser for the given language 46 | func (r *Registry) Get(language string) (Parser, bool) { 47 | parser, ok := r.parsers[language] 48 | return parser, ok 49 | } 50 | 51 | // GetByExtension returns a parser for the given file extension 52 | func (r *Registry) GetByExtension(ext string) (Parser, bool) { 53 | for _, parser := range r.parsers { 54 | for _, e := range parser.GetExtensions() { 55 | if e == ext { 56 | return parser, true 57 | } 58 | } 59 | }
60 | return nil, false 61 | } -------------------------------------------------------------------------------- /re-centris-go/internal/analyzer/tlsh/errors.go: -------------------------------------------------------------------------------- 1 | package tlsh 2 | 3 | import "errors" 4 | 5 | var ( 6 | // ErrDataTooSmall is returned when input data is too small for TLSH calculation 7 | ErrDataTooSmall = errors.New("input data must be at least 50 bytes") 8 | 9 | // ErrInvalidHash is returned when trying to parse an invalid TLSH hash string 10 | ErrInvalidHash = errors.New("invalid TLSH hash format") 11 | 12 | // ErrNilHash is returned when trying to operate on a nil TLSH hash 13 | ErrNilHash = errors.New("nil TLSH hash") 14 | ) -------------------------------------------------------------------------------- /re-centris-go/internal/analyzer/tlsh/tlsh.go: -------------------------------------------------------------------------------- 1 | package tlsh 2 | 3 | import ( 4 | "crypto/sha256" 5 | "encoding/hex" 6 | "math" 7 | "sort" 8 | ) 9 | 10 | const ( 11 | bucketCount = 256 12 | windowSize = 5 13 | minDataLength = 50 14 | ) 15 | 16 | // TLSH represents a Trend Micro Locality Sensitive Hash 17 | type TLSH struct { 18 | Checksum byte 19 | LValue byte 20 | Q1Ratio byte 21 | Q2Ratio byte 22 | QRatios [2]byte 23 | Buckets [bucketCount]byte 24 | DataLength int 25 | } 26 | 27 | // New creates a new TLSH hash from a byte slice 28 | func New(data []byte) (*TLSH, error) { 29 | if len(data) < minDataLength { 30 | return nil, ErrDataTooSmall 31 | } 32 | 33 | tlsh := &TLSH{ 34 | DataLength: len(data), 35 | } 36 | 37 | // Calculate sliding window 38 | buckets := make([]int, bucketCount) 39 | for i := 0; i < len(data)-windowSize; i++ { 40 | window := data[i : i+windowSize] 41 | triplet := (int(window[0]) << 16) | (int(window[2]) << 8) | int(window[4]) 42 | bucket := triplet % bucketCount 43 | buckets[bucket]++ 44 | } 45 | 46 | // Calculate quartiles 47 | sortedBuckets := make([]int, len(buckets)) 48 | copy(sortedBuckets, buckets) 49 | sort.Ints(sortedBuckets) 50 | 51 | q1Pos := len(sortedBuckets) / 4 52 | q2Pos := len(sortedBuckets) / 2 53 | q3Pos := (3 * len(sortedBuckets)) / 4 54 | 55 | q1 := sortedBuckets[q1Pos] 56 | q2 := sortedBuckets[q2Pos] 57 | q3 := sortedBuckets[q3Pos] 58 | 59 | // Calculate ratios; guard against q3 == 0 (highly repetitive input), which would divide by zero 60 | if q3 > 0 { 61 | tlsh.Q1Ratio = byte((float64(q1) / float64(q3)) * 16) 62 | tlsh.Q2Ratio = byte((float64(q2) / float64(q3)) * 16) 63 | } 64 | 65 | // Calculate final bucket values 66 | for i := 0; i < bucketCount; i++ { 67 | if buckets[i] <= q1 { 68 | tlsh.Buckets[i] = 0 69 | } else if buckets[i] <= q2 { 70 | tlsh.Buckets[i] = 1 71 | } else if buckets[i] <= q3 { 72 | tlsh.Buckets[i] = 2 73 | } else { 74 | tlsh.Buckets[i] = 3 75 | } 76 | } 77 | 78 | // Calculate checksum 79 | h := sha256.New() 80 | h.Write(data) 81 | tlsh.Checksum = h.Sum(nil)[0] 82 | 83 | // Calculate L-Value (log base 2 of the file size) 84 | tlsh.LValue = byte(math.Log2(float64(len(data)))) 85 | 86 | return tlsh, nil 87 | } 88 | 89 | // Distance calculates the distance between two TLSH hashes 90 | func (t *TLSH) Distance(other *TLSH) int { 91 | if t == nil || other == nil { 92 | return -1 93 | } 94 | 95 | // Calculate L-Value difference (convert to float64 before subtracting: unsigned byte arithmetic would wrap around) 96 | lDiff := math.Abs(float64(t.LValue) - float64(other.LValue)) 97 | 98 | // Calculate bucket difference 99 | bucketDiff := 0 100 | for i := 0; i < bucketCount; i++ { 101 | bucketDiff += int(math.Abs(float64(t.Buckets[i]) - float64(other.Buckets[i]))) 102 | } 103 | 104 | // Calculate quartile ratio difference 105 | q1Diff := math.Abs(float64(t.Q1Ratio) - float64(other.Q1Ratio)) 106 | q2Diff := math.Abs(float64(t.Q2Ratio) - float64(other.Q2Ratio)) 107 | 108 | // Weighted sum of differences 109 | return int(lDiff*12 + float64(bucketDiff) + (q1Diff+q2Diff)*12) 110 | } 111 | 112 | // String returns the hex representation of the TLSH hash 113 | func (t *TLSH) String() string { 114 | if t == nil { 115 | return "" 116 | } 117 | 118 | result := make([]byte, bucketCount/2+4) 119 | result[0] = t.Checksum 120 | result[1] = t.LValue 121 | result[2] = t.Q1Ratio 122 | result[3] = t.Q2Ratio 123 | 124 | // Pack buckets (2 buckets per byte) 125 | for i := 0; i < bucketCount/2; i++ { 126 | result[i+4] = (t.Buckets[i*2] << 4) | t.Buckets[i*2+1] 127 | } 128 | 129 | return hex.EncodeToString(result) 130 | } --------------------------------------------------------------------------------
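Because the hash is locality-sensitive, nearly identical inputs should land at small distances while unrelated inputs drift apart, and inputs shorter than 50 bytes are rejected with ErrDataTooSmall. A short sketch of the comparison workflow (illustrative, not part of the repository):

    package main

    import (
        "fmt"

        "github.com/re-centris/re-centris-go/internal/analyzer/tlsh"
    )

    func main() {
        a, err := tlsh.New([]byte("This is a test string that is long enough to generate a TLSH hash"))
        if err != nil {
            panic(err) // tlsh.ErrDataTooSmall for inputs under 50 bytes
        }
        b, _ := tlsh.New([]byte("This is a test string that is long enough to generate a TLSH hash!"))

        fmt.Println(a.String())    // hex-encoded digest
        fmt.Println(a.Distance(b)) // typically small for near-identical inputs
        fmt.Println(a.Distance(a)) // 0
    }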
/re-centris-go/internal/analyzer/tlsh/tlsh_test.go: -------------------------------------------------------------------------------- 1 | package tlsh 2 | 3 | import ( 4 | "testing" 5 | ) 6 | 7 | func TestTLSH(t *testing.T) { 8 | tests := []struct { 9 | name string 10 | data []byte 11 | wantErr bool 12 | distance int // distance with itself should be 0 13 | }{ 14 | { 15 | name: "normal text", 16 | data: []byte("This is a test string that is long enough to generate a TLSH hash"), 17 | wantErr: false, 18 | distance: 0, 19 | }, 20 | { 21 | name: "too short", 22 | data: []byte("too short"), 23 | wantErr: true, 24 | distance: -1, 25 | }, 26 | { 27 | name: "repeated content", 28 | data: []byte("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"), 29 | wantErr: false, 30 | distance: 0, 31 | }, 32 | } 33 | 34 | for _, tt := range tests { 35 | t.Run(tt.name, func(t *testing.T) { 36 | hash1, err1 := New(tt.data) 37 | if (err1 != nil) != tt.wantErr { 38 | t.Errorf("New() error = %v, wantErr %v", err1, tt.wantErr) 39 | return 40 | } 41 | if tt.wantErr { 42 | return 43 | } 44 | 45 | // Test distance with itself 46 | if dist := hash1.Distance(hash1); dist != tt.distance { 47 | t.Errorf("Distance with itself = %v, want %v", dist, tt.distance) 48 | } 49 | 50 | // Test string representation 51 | if str := hash1.String(); str == "" { 52 | t.Error("String() returned empty string") 53 | } 54 | 55 | // Test with modified data 56 | modifiedData := make([]byte, len(tt.data)) 57 | copy(modifiedData, tt.data) 58 | modifiedData[len(modifiedData)-1]++ // modify last byte 59 | hash2, _ := New(modifiedData) 60 | 61 | // Distance should be non-zero for different data 62 | if dist := hash1.Distance(hash2); dist == 0 { 63 | t.Error("Distance should be non-zero for different data") 64 | } 65 | }) 66 | } 67 | } 68 | 69 | func TestTLSHEdgeCases(t *testing.T) { 70 | tests := []struct { 71 | name string 72 | data []byte 73 | wantErr bool 74 | }{ 75 | { 76 | name: "nil data", 77 | data: nil, 78 | wantErr: true, 79 | }, 80 | { 81 | name: "empty data", 82 | data: []byte{}, 83 | wantErr: true, 84 | }, 85 | { 86 | name: "exactly minimum length", 87 | data: make([]byte, minDataLength), 88 | wantErr: false, 89 | }, 90 | { 91 | name: "one byte less than minimum", 92 | data: make([]byte, minDataLength-1), 93 | wantErr: true, 94 | }, 95 | } 96 | 97 | for _, tt := range tests { 98 | t.Run(tt.name, func(t *testing.T) { 99 | _, err := New(tt.data) 100 | if (err != nil) != tt.wantErr { 101 | t.Errorf("New() error = %v, wantErr %v", err, tt.wantErr) 102 | } 103 | }) 104 | } 105 | } 106 | 107 | func BenchmarkTLSH(b *testing.B) { 108 | data := []byte(`This is a test string that is long enough to generate a TLSH hash. 109 | We need to make it even longer to ensure we have enough data for meaningful benchmarks. 110 | Adding more text to make it more realistic and provide better performance measurements.`) 111 | 112 | b.ResetTimer() 113 | for i := 0; i < b.N; i++ { 114 | _, _ = New(data) 115 | } 116 | } -------------------------------------------------------------------------------- /re-centris-go/internal/cmd/analyze.go: -------------------------------------------------------------------------------- 1 | package cmd 2 | 3 | import ( 4 | "context" 5 | 6 | "github.com/re-centris/re-centris-go/internal/analyzer" 7 | "github.com/re-centris/re-centris-go/internal/common/logger" 8 | "github.com/spf13/cobra" 9 | "github.com/spf13/viper" 10 | "go.uber.org/zap" 11 | ) 12 | 13 | var analyzeCmd = &cobra.Command{ 14 | Use: "analyze [directory]", 15 | Short: "Analyze source code files", 16 | Long: `Analyze source code files in a directory to calculate TLSH hashes 17 | and extract function information.`, 18 | Args: cobra.ExactArgs(1), 19 | RunE: runAnalyze, 20 | } 21 | 22 | func init() { 23 | rootCmd.AddCommand(analyzeCmd) 24 | 25 | analyzeCmd.Flags().StringP("output", "o", "./analysis", "Output directory for analysis results") 26 | analyzeCmd.Flags().IntP("workers", "w", 5, "Number of parallel workers") 27 | 28 | viper.BindPFlag("analyze.output", analyzeCmd.Flags().Lookup("output")) 29 | viper.BindPFlag("analyze.workers", analyzeCmd.Flags().Lookup("workers")) 30 | } 31 | 32 | func runAnalyze(cmd *cobra.Command, args []string) error { 33 | // Get target directory 34 | targetDir := args[0] 35 | 36 | // Create analyzer options 37 | opts := analyzer.AnalyzerOptions{ 38 | MaxWorkers: viper.GetInt("analyze.workers"), 39 | Languages: map[string][]string{ 40 | "cpp": {".c", ".cc", ".cpp", ".cxx", ".h", ".hpp"}, 41 | "java": {".java"}, 42 | "python": {".py"}, 43 | }, 44 | } 45 | 46 | // Create analyzer 47 | a := analyzer.New(opts) 48 | 49 | // Analyze directory 50 | logger.Info("Starting code analysis", 51 | zap.String("directory", targetDir)) 52 | 53 | files, err := a.AnalyzeDirectory(context.Background(), targetDir) 54 | if err != nil { 55 | return err 56 | } 57 | 58 | logger.Info("Code analysis completed", 59 | zap.Int("total_files", len(files))) 60 | 61 | return nil 62 | } -------------------------------------------------------------------------------- /re-centris-go/internal/cmd/clone.go: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /re-centris-go/internal/cmd/detect.go: -------------------------------------------------------------------------------- 1 | package cmd 2 | 3 | import ( 4 | "context" 5 | "github.com/re-centris/re-centris-go/internal/detector" 6 | "github.com/re-centris/re-centris-go/internal/common/logger" 7 | "github.com/spf13/cobra" 8 | "github.com/spf13/viper" 9 | "go.uber.org/zap" 10 | ) 11 | 12 | var detectCmd = &cobra.Command{ 13 | Use: "detect [target-files...]", 14 | Short: "Detect code similarities", 15 | Long: `Detect code similarities between target files and known files 16 | using TLSH hash comparison.`, 17 | Args: cobra.MinimumNArgs(1), 18 | RunE: runDetect, 19 | } 20 | 21 | func init() { 22 | rootCmd.AddCommand(detectCmd) 23 | 24 | detectCmd.Flags().StringP("known-files", "k", "./known-files", "Directory containing known files") 25 | detectCmd.Flags().StringP("output", "o", "detection-results.json", "Output file for detection results") 26 | detectCmd.Flags().IntP("workers", "w", 5, 
"Number of parallel workers") 27 | detectCmd.Flags().Float64P("threshold", "t", 0.8, "Similarity threshold (0.0-1.0)") 28 | 29 | viper.BindPFlag("detect.known_files", detectCmd.Flags().Lookup("known-files")) 30 | viper.BindPFlag("detect.output", detectCmd.Flags().Lookup("output")) 31 | viper.BindPFlag("detect.workers", detectCmd.Flags().Lookup("workers")) 32 | viper.BindPFlag("detect.threshold", detectCmd.Flags().Lookup("threshold")) 33 | } 34 | 35 | func runDetect(cmd *cobra.Command, args []string) error { 36 | // Create detector options 37 | opts := detector.DetectorOptions{ 38 | MaxWorkers: viper.GetInt("detect.workers"), 39 | SimilarityThreshold: viper.GetFloat64("detect.threshold"), 40 | KnownFilesDir: viper.GetString("detect.known_files"), 41 | Languages: map[string][]string{ 42 | "cpp": {".c", ".cc", ".cpp", ".cxx", ".h", ".hpp"}, 43 | "java": {".java"}, 44 | "python": {".py"}, 45 | }, 46 | } 47 | 48 | // Create detector 49 | d := detector.New(opts) 50 | 51 | // Detect similarities 52 | logger.Info("Starting similarity detection", 53 | zap.Int("target_files", len(args)), 54 | zap.String("known_files_dir", opts.KnownFilesDir)) 55 | 56 | results, err := d.DetectSimilarity(context.Background(), args) 57 | if err != nil { 58 | return err 59 | } 60 | 61 | // Save results 62 | outputFile := viper.GetString("detect.output") 63 | if err := d.SaveResults(results, outputFile); err != nil { 64 | return err 65 | } 66 | 67 | logger.Info("Similarity detection completed", 68 | zap.String("output_file", outputFile)) 69 | 70 | return nil 71 | } -------------------------------------------------------------------------------- /re-centris-go/internal/cmd/root.go: -------------------------------------------------------------------------------- 1 | package cmd 2 | 3 | import ( 4 | "fmt" 5 | "os" 6 | 7 | "github.com/spf13/cobra" 8 | "github.com/spf13/viper" 9 | ) 10 | 11 | var ( 12 | cfgFile string 13 | rootCmd = &cobra.Command{ 14 | Use: "re-centris", 15 | Short: "Re-Centris is a code analysis and dependency detection tool", 16 | Long: `Re-Centris is a tool based on TLSH (Trend Micro Locality Sensitive Hash) 17 | for analyzing source code and detecting dependencies. It can identify open source 18 | components used in codebases, detect code clones, and analyze dependencies.`, 19 | } 20 | ) 21 | 22 | // Execute adds all child commands to the root command and sets flags appropriately. 
23 | func Execute() error { 24 | return rootCmd.Execute() 25 | } 26 | 27 | func init() { 28 | cobra.OnInitialize(initConfig) 29 | 30 | rootCmd.PersistentFlags().StringVar(&cfgFile, "config", "", "config file (default is $HOME/.re-centris.yaml)") 31 | } 32 | 33 | func initConfig() { 34 | if cfgFile != "" { 35 | viper.SetConfigFile(cfgFile) 36 | } else { 37 | home, err := os.UserHomeDir() 38 | cobra.CheckErr(err) 39 | 40 | viper.AddConfigPath(home) 41 | viper.SetConfigType("yaml") 42 | viper.SetConfigName(".re-centris") 43 | } 44 | 45 | viper.AutomaticEnv() 46 | 47 | if err := viper.ReadInConfig(); err == nil { 48 | fmt.Fprintln(os.Stderr, "Using config file:", viper.ConfigFileUsed()) 49 | } 50 | } -------------------------------------------------------------------------------- /re-centris-go/internal/collector/clone/clone.go: -------------------------------------------------------------------------------- 1 | package clone 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "os" 7 | "os/exec" 8 | "path/filepath" 9 | "strings" 10 | 11 | "github.com/re-centris/re-centris-go/internal/common/logger" 12 | "go.uber.org/zap" 13 | "golang.org/x/sync/errgroup" 14 | ) 15 | 16 | // RepoInfo contains information about a repository 17 | type RepoInfo struct { 18 | Author string 19 | Name string 20 | URL string 21 | } 22 | 23 | // CloneOptions contains options for cloning repositories 24 | type CloneOptions struct { 25 | TargetDir string 26 | MaxWorkers int 27 | } 28 | 29 | // ParseRepoURL parses a GitHub repository URL and returns RepoInfo 30 | func ParseRepoURL(url string) (*RepoInfo, error) { 31 | parts := strings.Split(url, "/") 32 | if len(parts) < 2 { 33 | return nil, fmt.Errorf("invalid repository URL: %s", url) 34 | } 35 | 36 | name := parts[len(parts)-1] 37 | author := parts[len(parts)-2] 38 | 39 | // Remove .git suffix if present 40 | name = strings.TrimSuffix(name, ".git") 41 | 42 | return &RepoInfo{ 43 | Author: author, 44 | Name: name, 45 | URL: url, 46 | }, nil 47 | } 48 | 49 | // CloneRepository clones a single repository 50 | func CloneRepository(ctx context.Context, info *RepoInfo, targetDir string) error { 51 | folderName := fmt.Sprintf("%s%%%s", info.Author, info.Name) 52 | targetPath := filepath.Join(targetDir, folderName) 53 | 54 | // Check if repository already exists 55 | if _, err := os.Stat(targetPath); !os.IsNotExist(err) { 56 | logger.Info("Repository already exists, skipping", 57 | zap.String("repo", folderName)) 58 | return nil 59 | } 60 | 61 | // Prepare git clone command 62 | cmd := exec.CommandContext(ctx, "git", "clone", 63 | "--depth", "1", 64 | "--single-branch", 65 | "--no-tags", 66 | info.URL, 67 | targetPath, 68 | ) 69 | 70 | // Execute command 71 | if output, err := cmd.CombinedOutput(); err != nil { 72 | return fmt.Errorf("failed to clone repository %s: %v\nOutput: %s", 73 | info.URL, err, string(output)) 74 | } 75 | 76 | logger.Info("Successfully cloned repository", 77 | zap.String("repo", folderName)) 78 | return nil 79 | } 80 | 81 | // CloneRepositories clones multiple repositories in parallel 82 | func CloneRepositories(ctx context.Context, urls []string, opts CloneOptions) error { 83 | // Create target directory if it doesn't exist 84 | if err := os.MkdirAll(opts.TargetDir, 0755); err != nil { 85 | return fmt.Errorf("failed to create target directory: %v", err) 86 | } 87 | 88 | // Create error group with context 89 | g, ctx := errgroup.WithContext(ctx) 90 | g.SetLimit(opts.MaxWorkers) 91 | 92 | // Process each repository URL 93 | for _, url := range urls { 94 | url := url // Create new variable for goroutine 95 | g.Go(func() error { 96 | info, err := ParseRepoURL(url) 97 | if err != nil { 98 | logger.Error("Failed to parse repository URL", 99 | zap.String("url", url), 100 | zap.Error(err)) 101 | return err 102 | } 103 | 104 | return CloneRepository(ctx, info, opts.TargetDir) 105 | }) 106 | } 107 | 108 | // Wait for all goroutines to complete 109 | if err := g.Wait(); err != nil { 110 | return fmt.Errorf("error while cloning repositories: %v", err) 111 | } 112 | 113 | return nil 114 | } --------------------------------------------------------------------------------
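ParseRepoURL only inspects the last two path segments, so the URLs in osscollector/sample parse directly, and each repository is cloned into a folder named author%name under the target directory. A sketch of driving the cloner (illustrative values, not part of the repository):

    package main

    import (
        "context"
        "fmt"

        "github.com/re-centris/re-centris-go/internal/collector/clone"
        "github.com/re-centris/re-centris-go/internal/common/logger"
        "go.uber.org/zap"
    )

    func main() {
        logger.Init(false) // the cloner logs through the shared zap logger

        info, _ := clone.ParseRepoURL("https://github.com/redis/redis.git")
        fmt.Printf("%s%%%s\n", info.Author, info.Name) // redis%redis

        err := clone.CloneRepositories(context.Background(),
            []string{"https://github.com/redis/redis.git"},
            clone.CloneOptions{TargetDir: "./repos", MaxWorkers: 5})
        if err != nil {
            logger.Fatal("clone failed", zap.Error(err))
        }
    }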
/re-centris-go/internal/common/cache/cache.go: -------------------------------------------------------------------------------- 1 | package cache 2 | 3 | import ( 4 | "container/list" 5 | "sync" 6 | ) 7 | 8 | // Cache is a thread-safe LRU cache 9 | type Cache struct { 10 | capacity int 11 | items map[string]*list.Element 12 | queue *list.List 13 | mutex sync.RWMutex 14 | } 15 | 16 | // item represents a cache item 17 | type item struct { 18 | key string 19 | value interface{} 20 | } 21 | 22 | // New creates a new cache with the given capacity 23 | func New(capacity int) *Cache { 24 | return &Cache{ 25 | capacity: capacity, 26 | items: make(map[string]*list.Element), 27 | queue: list.New(), 28 | } 29 | } 30 | 31 | // Get retrieves a value from the cache 32 | func (c *Cache) Get(key string) (interface{}, bool) { 33 | // Take the write lock for the whole lookup: MoveToFront mutates the 34 | // queue, and re-locking after an RUnlock would let a concurrent 35 | // Delete or eviction slip in between the lookup and the move. 36 | c.mutex.Lock() 37 | defer c.mutex.Unlock() 38 | if element, exists := c.items[key]; exists { 39 | c.queue.MoveToFront(element) 40 | return element.Value.(*item).value, true 41 | } 42 | return nil, false 43 | } 44 | 45 | // Set adds or updates a value in the cache 46 | func (c *Cache) Set(key string, value interface{}) { 47 | c.mutex.Lock() 48 | defer c.mutex.Unlock() 49 | 50 | // If key exists, update its value and move to front 51 | if element, exists := c.items[key]; exists { 52 | c.queue.MoveToFront(element) 53 | element.Value.(*item).value = value 54 | return 55 | } 56 | 57 | // Add new item 58 | element := c.queue.PushFront(&item{key: key, value: value}) 59 | c.items[key] = element 60 | 61 | // Remove oldest item if cache is full 62 | if c.queue.Len() > c.capacity { 63 | oldest := c.queue.Back() 64 | if oldest != nil { 65 | c.queue.Remove(oldest) 66 | delete(c.items, oldest.Value.(*item).key) 67 | } 68 | } 69 | } 70 | 71 | // Delete removes a value from the cache 72 | func (c *Cache) Delete(key string) { 73 | c.mutex.Lock() 74 | defer c.mutex.Unlock() 75 | 76 | if element, exists := c.items[key]; exists { 77 | c.queue.Remove(element) 78 | delete(c.items, key) 79 | } 80 | } 81 | 82 | // Clear removes all items from the cache 83 | func (c *Cache) Clear() { 84 | c.mutex.Lock() 85 | defer c.mutex.Unlock() 86 | 87 | c.items = make(map[string]*list.Element) 88 | c.queue = list.New() 89 | } 90 | 91 | // Len returns the number of items in the cache 92 | func (c *Cache) Len() int { 93 | c.mutex.RLock() 94 | defer c.mutex.RUnlock() 95 | return len(c.items) 96 | } 97 | 98 | // Keys returns all keys in the cache 99 | func (c *Cache) Keys() []string { 100 | c.mutex.RLock() 101 | defer c.mutex.RUnlock() 102 | 103 | keys := make([]string, 0, len(c.items)) 104 | for key := range c.items { 105 | keys = append(keys, key) 106 | } 107 | return keys 108 | } --------------------------------------------------------------------------------
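Eviction happens at the back of the queue once capacity is exceeded, and every hit is moved to the front, so the least recently used entry is always the victim. A small demonstration (illustrative, not part of the repository):

    package main

    import (
        "fmt"

        "github.com/re-centris/re-centris-go/internal/common/cache"
    )

    func main() {
        c := cache.New(2)
        c.Set("a", 1)
        c.Set("b", 2)
        c.Get("a")    // "a" becomes most recently used
        c.Set("c", 3) // over capacity: evicts "b", the least recently used key

        _, okA := c.Get("a")
        _, okB := c.Get("b")
        fmt.Println(okA, okB, c.Len()) // true false 2
    }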
/re-centris-go/internal/common/logger/logger.go: -------------------------------------------------------------------------------- 1 | package logger 2 | 3 | import ( 4 | "os" 5 | 6 | "go.uber.org/zap" 7 | "go.uber.org/zap/zapcore" 8 | ) 9 | 10 | var log *zap.Logger 11 | 12 | // Init initializes the logger 13 | func Init(debug bool) { 14 | config := zap.NewProductionConfig() 15 | if debug { 16 | config.Level = zap.NewAtomicLevelAt(zap.DebugLevel) 17 | } 18 | 19 | config.OutputPaths = []string{"stdout", "re-centris.log"} 20 | config.EncoderConfig.EncodeTime = zapcore.ISO8601TimeEncoder 21 | 22 | var err error 23 | log, err = config.Build() 24 | if err != nil { 25 | os.Exit(1) 26 | } 27 | } 28 | 29 | // Debug logs a debug message 30 | func Debug(msg string, fields ...zap.Field) { 31 | log.Debug(msg, fields...) 32 | } 33 | 34 | // Info logs an info message 35 | func Info(msg string, fields ...zap.Field) { 36 | log.Info(msg, fields...) 37 | } 38 | 39 | // Warn logs a warning message 40 | func Warn(msg string, fields ...zap.Field) { 41 | log.Warn(msg, fields...) 42 | } 43 | 44 | // Error logs an error message 45 | func Error(msg string, fields ...zap.Field) { 46 | log.Error(msg, fields...) 47 | } 48 | 49 | // Fatal logs a fatal message and exits 50 | func Fatal(msg string, fields ...zap.Field) { 51 | log.Fatal(msg, fields...) 52 | } 53 | 54 | // Sync flushes any buffered log entries 55 | func Sync() error { 56 | return log.Sync() 57 | } -------------------------------------------------------------------------------- /re-centris-go/internal/common/monitor/monitor.go: -------------------------------------------------------------------------------- 1 | package monitor 2 | 3 | import ( 4 | "runtime" 5 | "sync" 6 | "time" 7 | 8 | "github.com/re-centris/re-centris-go/internal/common/logger" 9 | "go.uber.org/zap" 10 | ) 11 | 12 | // Stats represents performance statistics 13 | type Stats struct { 14 | Goroutines int 15 | Memory uint64 16 | CPU float64 17 | StartTime time.Time 18 | Operations uint64 19 | } 20 | 21 | // Monitor handles performance monitoring 22 | type Monitor struct { 23 | stats *Stats 24 | mu sync.RWMutex // guards stats; kept out of Stats so GetStats can return a plain copy 25 | interval time.Duration 26 | done chan struct{} 27 | } 28 | 29 | // New creates a new performance monitor 30 | func New(interval time.Duration) *Monitor { 31 | return &Monitor{ 32 | stats: &Stats{ 33 | StartTime: time.Now(), 34 | }, 35 | interval: interval, 36 | done: make(chan struct{}), 37 | } 38 | } 39 | 40 | // Start starts the monitoring 41 | func (m *Monitor) Start() { 42 | go m.monitor() 43 | } 44 | 45 | // Stop stops the monitoring 46 | func (m *Monitor) Stop() { 47 | close(m.done) 48 | } 49 | 50 | // GetStats returns current statistics 51 | func (m *Monitor) GetStats() Stats { 52 | m.mu.RLock() 53 | defer m.mu.RUnlock() 54 | return *m.stats 55 | } 56 | 57 | // IncrementOperations increments the operation counter 58 | func (m *Monitor) IncrementOperations() { 59 | m.mu.Lock() 60 | m.stats.Operations++ 61 | m.mu.Unlock() 62 | } 63 | 64 | // monitor periodically collects performance metrics 65 | func (m *Monitor) monitor() { 66 | ticker := time.NewTicker(m.interval) 67 | defer ticker.Stop() 68 | 69 | for { 70 | select { 71 | case <-ticker.C: 72 | m.collectMetrics() 73 | case <-m.done: 74 | return 75 | } 76 | } 77 | } 78 | 79 | // collectMetrics collects current performance metrics 80 | func (m *Monitor) collectMetrics() { 81 | m.mu.Lock() 82 | defer m.mu.Unlock() 83 | 84 | // Get number of goroutines 85 | m.stats.Goroutines = runtime.NumGoroutine() 86 | 87 | // Get 
memory statistics 88 | var memStats runtime.MemStats 89 | runtime.ReadMemStats(&memStats) 90 | m.stats.Memory = memStats.Alloc 91 | 92 | // Log current metrics 93 | logger.Info("Performance metrics", 94 | zap.Int("goroutines", m.stats.Goroutines), 95 | zap.Uint64("memory_bytes", m.stats.Memory), 96 | zap.Uint64("operations", m.stats.Operations), 97 | zap.Duration("uptime", time.Since(m.stats.StartTime)), 98 | ) 99 | } 100 | 101 | // CheckMemoryLimit checks if memory usage is within limit 102 | func (m *Monitor) CheckMemoryLimit(limit float64) bool { 103 | var memStats runtime.MemStats 104 | runtime.ReadMemStats(&memStats) 105 | 106 | totalMemory := float64(memStats.Sys) 107 | usedMemory := float64(memStats.Alloc) 108 | memoryUsage := usedMemory / totalMemory 109 | 110 | if memoryUsage > limit { 111 | logger.Warn("Memory usage exceeds limit", 112 | zap.Float64("usage", memoryUsage), 113 | zap.Float64("limit", limit)) 114 | return false 115 | } 116 | 117 | return true 118 | } -------------------------------------------------------------------------------- /re-centris-go/internal/config/config.go: -------------------------------------------------------------------------------- 1 | package config 2 | 3 | // Config represents the main configuration structure 4 | type Config struct { 5 | Paths PathConfig `yaml:"paths"` 6 | Performance PerformanceConfig `yaml:"performance"` 7 | Languages LanguagesConfig `yaml:"languages"` 8 | } 9 | 10 | // PathConfig contains all path-related configurations 11 | type PathConfig struct { 12 | RepoPath string `yaml:"repo_path"` 13 | TagDatePath string `yaml:"tag_date_path"` 14 | ResultPath string `yaml:"result_path"` 15 | } 16 | 17 | // PerformanceConfig contains performance-related settings 18 | type PerformanceConfig struct { 19 | MaxWorkers int `yaml:"max_workers"` 20 | CacheSize int `yaml:"cache_size"` 21 | MemoryLimit float64 `yaml:"memory_limit"` 22 | } 23 | 24 | // LanguagesConfig contains settings for supported languages 25 | type LanguagesConfig struct { 26 | CPP LanguageSettings `yaml:"cpp"` 27 | Java LanguageSettings `yaml:"java"` 28 | Python LanguageSettings `yaml:"python"` 29 | } 30 | 31 | // LanguageSettings contains settings for a specific language 32 | type LanguageSettings struct { 33 | Enabled bool `yaml:"enabled"` 34 | Extensions []string `yaml:"extensions"` 35 | } 36 | 37 | // DefaultConfig returns a default configuration 38 | func DefaultConfig() *Config { 39 | return &Config{ 40 | Paths: PathConfig{ 41 | RepoPath: "./repos", 42 | TagDatePath: "./data/repo_date", 43 | ResultPath: "./data/repo_functions", 44 | }, 45 | Performance: PerformanceConfig{ 46 | MaxWorkers: 0, // 0 means use number of CPU cores 47 | CacheSize: 1000, 48 | MemoryLimit: 0.8, 49 | }, 50 | Languages: LanguagesConfig{ 51 | CPP: LanguageSettings{ 52 | Enabled: true, 53 | Extensions: []string{".c", ".cc", ".cpp", ".cxx", ".h", ".hpp"}, 54 | }, 55 | Java: LanguageSettings{ 56 | Enabled: false, 57 | Extensions: []string{".java"}, 58 | }, 59 | Python: LanguageSettings{ 60 | Enabled: false, 61 | Extensions: []string{".py"}, 62 | }, 63 | }, 64 | } 65 | } -------------------------------------------------------------------------------- /re-centris-go/internal/detector/detector.go: -------------------------------------------------------------------------------- 1 | package detector 2 | 3 | import ( 4 | "context" 5 | "encoding/json" 6 | "fmt" 7 | "os" 8 | "path/filepath" 9 | "sort" 10 | "sync" 11 | 12 | "github.com/re-centris/re-centris-go/internal/analyzer" 13 | 
"github.com/re-centris/re-centris-go/internal/common/logger" 14 | "golang.org/x/sync/errgroup" 15 | ) 16 | 17 | // DetectionResult represents the result of a code similarity detection 18 | type DetectionResult struct { 19 | TargetFile string `json:"target_file"` 20 | Matches []Match `json:"matches"` 21 | TotalFiles int `json:"total_files"` 22 | MatchCount int `json:"match_count"` 23 | } 24 | 25 | // Match represents a single match in the detection result 26 | type Match struct { 27 | File string `json:"file"` 28 | Similarity float64 `json:"similarity"` 29 | Distance int `json:"distance"` 30 | } 31 | 32 | // DetectorOptions contains options for the detector 33 | type DetectorOptions struct { 34 | MaxWorkers int 35 | SimilarityThreshold float64 36 | Languages map[string][]string 37 | KnownFilesDir string 38 | } 39 | 40 | // Detector handles code similarity detection 41 | type Detector struct { 42 | opts DetectorOptions 43 | analyzer *analyzer.Analyzer 44 | } 45 | 46 | // New creates a new Detector 47 | func New(opts DetectorOptions) *Detector { 48 | return &Detector{ 49 | opts: opts, 50 | analyzer: analyzer.New(analyzer.AnalyzerOptions{ 51 | MaxWorkers: opts.MaxWorkers, 52 | Languages: opts.Languages, 53 | }), 54 | } 55 | } 56 | 57 | // DetectSimilarity detects code similarity between target files and known files 58 | func (d *Detector) DetectSimilarity(ctx context.Context, targetFiles []string) ([]*DetectionResult, error) { 59 | // Load known files 60 | knownFiles, err := d.loadKnownFiles(ctx) 61 | if err != nil { 62 | return nil, fmt.Errorf("failed to load known files: %v", err) 63 | } 64 | 65 | // Process target files in parallel 66 | var ( 67 | results []*DetectionResult 68 | resultsMux sync.Mutex 69 | ) 70 | 71 | g, ctx := errgroup.WithContext(ctx) 72 | g.SetLimit(d.opts.MaxWorkers) 73 | 74 | for _, targetFile := range targetFiles { 75 | targetFile := targetFile // Create new variable for goroutine 76 | g.Go(func() error { 77 | // Analyze target file 78 | fileInfo, err := d.analyzer.AnalyzeFile(ctx, targetFile) 79 | if err != nil { 80 | logger.Error("Failed to analyze target file", 81 | zap.String("file", targetFile), 82 | zap.Error(err)) 83 | return err 84 | } 85 | 86 | // Find similar files 87 | similar := d.analyzer.FindSimilarFiles(fileInfo, knownFiles, 88 | int(100 * (1 - d.opts.SimilarityThreshold))) 89 | 90 | // Create matches 91 | matches := make([]Match, len(similar)) 92 | for i, s := range similar { 93 | distance := fileInfo.Hash.Distance(s.Hash) 94 | similarity := 1.0 - float64(distance)/100.0 95 | matches[i] = Match{ 96 | File: s.Path, 97 | Similarity: similarity, 98 | Distance: distance, 99 | } 100 | } 101 | 102 | // Sort matches by similarity (descending) 103 | sort.Slice(matches, func(i, j int) bool { 104 | return matches[i].Similarity > matches[j].Similarity 105 | }) 106 | 107 | // Create result 108 | result := &DetectionResult{ 109 | TargetFile: targetFile, 110 | Matches: matches, 111 | TotalFiles: len(knownFiles), 112 | MatchCount: len(matches), 113 | } 114 | 115 | // Add to results 116 | resultsMux.Lock() 117 | results = append(results, result) 118 | resultsMux.Unlock() 119 | 120 | return nil 121 | }) 122 | } 123 | 124 | if err := g.Wait(); err != nil { 125 | return nil, fmt.Errorf("error while detecting similarities: %v", err) 126 | } 127 | 128 | return results, nil 129 | } 130 | 131 | // loadKnownFiles loads all known files from the specified directory 132 | func (d *Detector) loadKnownFiles(ctx context.Context) ([]*analyzer.FileInfo, error) { 133 | return 
135 | 136 | // SaveResults saves detection results to a JSON file 137 | func (d *Detector) SaveResults(results []*DetectionResult, outputPath string) error { 138 | // Create parent directories if they don't exist 139 | if err := os.MkdirAll(filepath.Dir(outputPath), 0755); err != nil { 140 | return fmt.Errorf("failed to create directories: %v", err) 141 | } 142 | 143 | // Marshal results to JSON 144 | data, err := json.MarshalIndent(results, "", " ") 145 | if err != nil { 146 | return fmt.Errorf("failed to marshal results: %v", err) 147 | } 148 | 149 | // Write to file 150 | if err := os.WriteFile(outputPath, data, 0644); err != nil { 151 | return fmt.Errorf("failed to write results: %v", err) 152 | } 153 | 154 | return nil 155 | } -------------------------------------------------------------------------------- /re-centris-go/internal/preprocessor/preprocessor.go: -------------------------------------------------------------------------------- 1 | package preprocessor 2 | 3 | import ( 4 | "context" 5 | "encoding/json" 6 | "fmt" 7 | "os" 8 | "path/filepath" 9 | 10 | "github.com/re-centris/re-centris-go/internal/analyzer" 11 | "github.com/re-centris/re-centris-go/internal/common/logger" 12 | "go.uber.org/zap" 13 | "golang.org/x/sync/errgroup" 14 | ) 15 | 16 | // FileMetadata contains metadata about a processed file 17 | type FileMetadata struct { 18 | Path string `json:"path"` 19 | Language string `json:"language"` 20 | Hash string `json:"hash"` 21 | Size int64 `json:"size"` 22 | Functions []FunctionInfo `json:"functions,omitempty"` 23 | } 24 | 25 | // FunctionInfo contains information about a function 26 | type FunctionInfo struct { 27 | Name string `json:"name"` 28 | StartLine int `json:"start_line"` 29 | EndLine int `json:"end_line"` 30 | Hash string `json:"hash"` 31 | } 32 | 33 | // PreprocessorOptions contains options for the preprocessor 34 | type PreprocessorOptions struct { 35 | MaxWorkers int 36 | OutputDir string 37 | Languages map[string][]string 38 | MinFileSize int64 39 | MaxFileSize int64 40 | } 41 | 42 | // Preprocessor handles file preprocessing 43 | type Preprocessor struct { 44 | opts PreprocessorOptions 45 | analyzer *analyzer.Analyzer 46 | } 47 | 48 | // New creates a new Preprocessor 49 | func New(opts PreprocessorOptions) *Preprocessor { 50 | return &Preprocessor{ 51 | opts: opts, 52 | analyzer: analyzer.New(analyzer.AnalyzerOptions{ 53 | MaxWorkers: opts.MaxWorkers, 54 | Languages: opts.Languages, 55 | }), 56 | } 57 | } 58 | 59 | // ProcessDirectory processes all files in a directory 60 | func (p *Preprocessor) ProcessDirectory(ctx context.Context, dir string) error { 61 | // Create output directory if it doesn't exist 62 | if err := os.MkdirAll(p.opts.OutputDir, 0755); err != nil { 63 | return fmt.Errorf("failed to create output directory: %v", err) 64 | } 65 | 66 | // Analyze all files in directory 67 | files, err := p.analyzer.AnalyzeDirectory(ctx, dir) 68 | if err != nil { 69 | return fmt.Errorf("failed to analyze directory: %v", err) 70 | } 71 | 72 | // Process files in parallel 73 | g, ctx := errgroup.WithContext(ctx) 74 | g.SetLimit(p.opts.MaxWorkers) 75 | 76 | for _, file := range files { 77 | file := file // Create new variable for goroutine 78 | g.Go(func() error { 79 | // Skip files that are too small or too large 80 | if file.Size < p.opts.MinFileSize || 81 | (p.opts.MaxFileSize > 0 && file.Size > p.opts.MaxFileSize) { 82 | return nil 83 | } 84 | 85 | metadata := &FileMetadata{ 86 | Path: file.Path, 87 | 
Language: file.Language, 88 | Hash: file.Hash.String(), 89 | Size: file.Size, 90 | } 91 | 92 | // Extract functions if supported 93 | if funcs, err := p.extractFunctions(file); err == nil { 94 | metadata.Functions = funcs 95 | } 96 | 97 | // Save metadata 98 | if err := p.saveMetadata(metadata); err != nil { 99 | logger.Error("Failed to save metadata", 100 | zap.String("path", file.Path), 101 | zap.Error(err)) 102 | return err 103 | } 104 | 105 | return nil 106 | }) 107 | } 108 | 109 | return g.Wait() 110 | } 111 | 112 | // extractFunctions extracts function information from a file 113 | func (p *Preprocessor) extractFunctions(file *analyzer.FileInfo) ([]FunctionInfo, error) { 114 | // TODO: Implement function extraction using language-specific parsers 115 | // This is a placeholder that should be replaced with actual implementation 116 | return nil, nil 117 | } 118 | 119 | // saveMetadata saves file metadata to JSON file 120 | func (p *Preprocessor) saveMetadata(metadata *FileMetadata) error { 121 | // Create output filename based on file path 122 | relPath, err := filepath.Rel("/", metadata.Path) 123 | if err != nil { 124 | relPath = metadata.Path 125 | } 126 | outPath := filepath.Join(p.opts.OutputDir, 127 | fmt.Sprintf("%s.json", filepath.ToSlash(relPath))) 128 | 129 | // Create parent directories if they don't exist 130 | if err := os.MkdirAll(filepath.Dir(outPath), 0755); err != nil { 131 | return fmt.Errorf("failed to create directories: %v", err) 132 | } 133 | 134 | // Marshal metadata to JSON 135 | data, err := json.MarshalIndent(metadata, "", " ") 136 | if err != nil { 137 | return fmt.Errorf("failed to marshal metadata: %v", err) 138 | } 139 | 140 | // Write to file 141 | if err := os.WriteFile(outPath, data, 0644); err != nil { 142 | return fmt.Errorf("failed to write metadata: %v", err) 143 | } 144 | 145 | return nil 146 | } -------------------------------------------------------------------------------- /re-centris-go/tests/integration/clone_analyze_test.go: -------------------------------------------------------------------------------- 1 | package integration 2 | 3 | import ( 4 | "os" 5 | "path/filepath" 6 | "testing" 7 | 8 | "github.com/re-centris/re-centris-go/internal/analyzer" 9 | "github.com/re-centris/re-centris-go/internal/collector/clone" 10 | "github.com/re-centris/re-centris-go/internal/common/config" 11 | ) 12 | 13 | func TestCloneAndAnalyze(t *testing.T) { 14 | // Skip if running in CI environment 15 | if os.Getenv("CI") != "" { 16 | t.Skip("Skipping integration test in CI environment") 17 | } 18 | 19 | // Create temporary directories 20 | tmpDir, err := os.MkdirTemp("", "re-centris-test-*") 21 | if err != nil { 22 | t.Fatalf("Failed to create temp dir: %v", err) 23 | } 24 | defer os.RemoveAll(tmpDir) 25 | 26 | repoDir := filepath.Join(tmpDir, "repos") 27 | analysisDir := filepath.Join(tmpDir, "analysis") 28 | 29 | // Create test configuration 30 | cfg := &config.Config{ 31 | Clone: config.CloneConfig{ 32 | OutputPath: repoDir, 33 | Workers: 2, 34 | }, 35 | Analysis: config.AnalysisConfig{ 36 | OutputPath: analysisDir, 37 | Workers: 2, 38 | }, 39 | Languages: config.LanguagesConfig{ 40 | CPP: config.LanguageConfig{ 41 | Enabled: true, 42 | Extensions: []string{".cpp", ".h"}, 43 | }, 44 | }, 45 | } 46 | 47 | // Test repository to clone (use a small, public repo) 48 | testRepo := "https://github.com/google/googletest.git" 49 | 50 | // Initialize cloner 51 | cloner := clone.New(cfg) 52 | 53 | // Clone repository 54 | err = cloner.Clone([]string{testRepo}) 55 | if 
/re-centris-go/tests/integration/clone_analyze_test.go:
--------------------------------------------------------------------------------
1 | package integration
2 | 
3 | import (
4 | 	"os"
5 | 	"path/filepath"
6 | 	"testing"
7 | 
8 | 	"github.com/re-centris/re-centris-go/internal/analyzer"
9 | 	"github.com/re-centris/re-centris-go/internal/collector/clone"
10 | 	"github.com/re-centris/re-centris-go/internal/config"
11 | )
12 | 
13 | func TestCloneAndAnalyze(t *testing.T) {
14 | 	// Skip if running in CI environment
15 | 	if os.Getenv("CI") != "" {
16 | 		t.Skip("Skipping integration test in CI environment")
17 | 	}
18 | 
19 | 	// Create temporary directories
20 | 	tmpDir, err := os.MkdirTemp("", "re-centris-test-*")
21 | 	if err != nil {
22 | 		t.Fatalf("Failed to create temp dir: %v", err)
23 | 	}
24 | 	defer os.RemoveAll(tmpDir)
25 | 
26 | 	repoDir := filepath.Join(tmpDir, "repos")
27 | 	analysisDir := filepath.Join(tmpDir, "analysis")
28 | 
29 | 	// Create test configuration
30 | 	cfg := &config.Config{
31 | 		Clone: config.CloneConfig{
32 | 			OutputPath: repoDir,
33 | 			Workers:    2,
34 | 		},
35 | 		Analysis: config.AnalysisConfig{
36 | 			OutputPath: analysisDir,
37 | 			Workers:    2,
38 | 		},
39 | 		Languages: config.LanguagesConfig{
40 | 			CPP: config.LanguageConfig{
41 | 				Enabled:    true,
42 | 				Extensions: []string{".cpp", ".h"},
43 | 			},
44 | 		},
45 | 	}
46 | 
47 | 	// Test repository to clone (use a small, public repo)
48 | 	testRepo := "https://github.com/google/googletest.git"
49 | 
50 | 	// Initialize cloner
51 | 	cloner := clone.New(cfg)
52 | 
53 | 	// Clone repository
54 | 	err = cloner.Clone([]string{testRepo})
55 | 	if err != nil {
56 | 		t.Fatalf("Failed to clone repository: %v", err)
57 | 	}
58 | 
59 | 	// Verify repository was cloned
60 | 	if _, err := os.Stat(repoDir); os.IsNotExist(err) {
61 | 		t.Errorf("Repository directory was not created")
62 | 	}
63 | 
64 | 	// Initialize analyzer
65 | 	analyzer := analyzer.New(cfg)
66 | 
67 | 	// Analyze cloned repository
68 | 	err = analyzer.Analyze(repoDir)
69 | 	if err != nil {
70 | 		t.Fatalf("Failed to analyze repository: %v", err)
71 | 	}
72 | 
73 | 	// Verify analysis output
74 | 	if _, err := os.Stat(analysisDir); os.IsNotExist(err) {
75 | 		t.Errorf("Analysis directory was not created")
76 | 	}
77 | 
78 | 	// Check for analysis results
79 | 	files, err := filepath.Glob(filepath.Join(analysisDir, "*.json"))
80 | 	if err != nil {
81 | 		t.Fatalf("Failed to list analysis files: %v", err)
82 | 	}
83 | 	if len(files) == 0 {
84 | 		t.Error("No analysis results were generated")
85 | 	}
86 | }
87 | 
88 | func TestAnalyzeWithInvalidInput(t *testing.T) {
89 | 	tmpDir, err := os.MkdirTemp("", "re-centris-test-*")
90 | 	if err != nil {
91 | 		t.Fatalf("Failed to create temp dir: %v", err)
92 | 	}
93 | 	defer os.RemoveAll(tmpDir)
94 | 
95 | 	cfg := &config.Config{
96 | 		Analysis: config.AnalysisConfig{
97 | 			OutputPath: tmpDir,
98 | 			Workers:    1,
99 | 		},
100 | 	}
101 | 
102 | 	analyzer := analyzer.New(cfg)
103 | 
104 | 	// Test with non-existent directory
105 | 	err = analyzer.Analyze("/nonexistent/path")
106 | 	if err == nil {
107 | 		t.Error("Expected error when analyzing non-existent directory")
108 | 	}
109 | 
110 | 	// Test with empty directory
111 | 	emptyDir := filepath.Join(tmpDir, "empty")
112 | 	if err := os.MkdirAll(emptyDir, 0755); err != nil {
113 | 		t.Fatalf("Failed to create empty directory: %v", err)
114 | 	}
115 | 
116 | 	err = analyzer.Analyze(emptyDir)
117 | 	if err != nil {
118 | 		t.Errorf("Unexpected error analyzing empty directory: %v", err)
119 | 	}
120 | }
--------------------------------------------------------------------------------
/re-centris-go/tests/security/security_test.go:
--------------------------------------------------------------------------------
1 | package security
2 | 
3 | import (
4 | 	"fmt"
5 | 	"os"
6 | 	"path/filepath"
7 | 	"strings"
8 | 	"sync"
9 | 	"testing"
10 | 	"time"
11 | 
12 | 	"github.com/re-centris/re-centris-go/internal/analyzer"
13 | 	"github.com/re-centris/re-centris-go/internal/config"
14 | 	"github.com/re-centris/re-centris-go/internal/common/monitor"
15 | )
16 | 
17 | func TestPathTraversal(t *testing.T) {
18 | 	tmpDir, err := os.MkdirTemp("", "re-centris-security-*")
19 | 	if err != nil {
20 | 		t.Fatalf("Failed to create temp dir: %v", err)
21 | 	}
22 | 	defer os.RemoveAll(tmpDir)
23 | 
24 | 	maliciousPaths := []string{
25 | 		"../../../etc/passwd",
26 | 		"..\\..\\..\\Windows\\System32",
27 | 		"/etc/shadow",
28 | 		"C:\\Windows\\System32\\config",
29 | 		filepath.Join(tmpDir, ".."),
30 | 	}
31 | 
32 | 	cfg := &config.Config{
33 | 		Analysis: config.AnalysisConfig{
34 | 			OutputPath: tmpDir,
35 | 		},
36 | 	}
37 | 
38 | 	analyzer := analyzer.New(cfg)
39 | 
40 | 	for _, path := range maliciousPaths {
41 | 		err := analyzer.Analyze(path)
42 | 		if err == nil {
43 | 			t.Errorf("Expected error for malicious path: %s", path)
44 | 		}
45 | 	}
46 | }
47 | 
48 | func TestMemoryLimit(t *testing.T) {
49 | 	mon := monitor.New(100 * time.Millisecond)
50 | 	mon.Start()
51 | 	defer mon.Stop()
52 | 
53 | 	// Allocate memory gradually
54 | 	var slices [][]byte
55 | 	defer func() {
56 | 		slices = nil
57 | 	}()
58 | 
59 | 	// Try to allocate memory until we hit the limit
60 | 	for i := 0; i < 100; i++ {
61 | 		if !mon.CheckMemoryLimit(0.8) { // 
80% memory limit
62 | 			// Memory limit reached, test passed
63 | 			return
64 | 		}
65 | 		// Allocate 1MB
66 | 		slices = append(slices, make([]byte, 1024*1024))
67 | 	}
68 | 
69 | 	t.Error("Memory limit was not enforced")
70 | }
71 | 
72 | func TestConcurrentAccess(t *testing.T) {
73 | 	tmpDir, err := os.MkdirTemp("", "re-centris-security-*")
74 | 	if err != nil {
75 | 		t.Fatalf("Failed to create temp dir: %v", err)
76 | 	}
77 | 	defer os.RemoveAll(tmpDir)
78 | 
79 | 	cfg := &config.Config{
80 | 		Analysis: config.AnalysisConfig{
81 | 			OutputPath: tmpDir,
82 | 			Workers:    4,
83 | 		},
84 | 	}
85 | 
86 | 	analyzer := analyzer.New(cfg)
87 | 
88 | 	// Create test files
89 | 	testFiles := make([]string, 10)
90 | 	for i := range testFiles {
91 | 		file := filepath.Join(tmpDir, fmt.Sprintf("test%d.cpp", i))
92 | 		if err := os.WriteFile(file, []byte("int main() { return 0; }"), 0644); err != nil {
93 | 			t.Fatalf("Failed to create test file: %v", err)
94 | 		}
95 | 		testFiles[i] = file
96 | 	}
97 | 
98 | 	// Test concurrent access
99 | 	var wg sync.WaitGroup
100 | 	errors := make(chan error, len(testFiles))
101 | 
102 | 	for _, file := range testFiles {
103 | 		wg.Add(1)
104 | 		go func(f string) {
105 | 			defer wg.Done()
106 | 			if err := analyzer.Analyze(f); err != nil {
107 | 				errors <- err
108 | 			}
109 | 		}(file)
110 | 	}
111 | 
112 | 	// Wait for all goroutines to finish
113 | 	wg.Wait()
114 | 	close(errors)
115 | 
116 | 	// Check for errors
117 | 	for err := range errors {
118 | 		t.Errorf("Concurrent analysis error: %v", err)
119 | 	}
120 | }
121 | 
122 | func TestResourceExhaustion(t *testing.T) {
123 | 	tmpDir, err := os.MkdirTemp("", "re-centris-security-*")
124 | 	if err != nil {
125 | 		t.Fatalf("Failed to create temp dir: %v", err)
126 | 	}
127 | 	defer os.RemoveAll(tmpDir)
128 | 
129 | 	cfg := &config.Config{
130 | 		Analysis: config.AnalysisConfig{
131 | 			OutputPath: tmpDir,
132 | 			Workers:    1000, // Excessive number of workers
133 | 		},
134 | 	}
135 | 
136 | 	analyzer := analyzer.New(cfg)
137 | 
138 | 	// Create a large file
139 | 	largeFile := filepath.Join(tmpDir, "large.cpp")
140 | 	f, err := os.Create(largeFile)
141 | 	if err != nil {
142 | 		t.Fatalf("Failed to create large file: %v", err)
143 | 	}
144 | 
145 | 	// Write 100MB of data
146 | 	data := make([]byte, 1024)
147 | 	for i := 0; i < 1024*100; i++ {
148 | 		if _, err := f.Write(data); err != nil {
149 | 			f.Close()
150 | 			t.Fatalf("Failed to write to large file: %v", err)
151 | 		}
152 | 	}
153 | 	f.Close()
154 | 
155 | 	// Set timeout for the test
156 | 	done := make(chan bool)
157 | 	go func() {
158 | 		err := analyzer.Analyze(largeFile)
159 | 		if err != nil {
160 | 			t.Logf("Analysis error (expected): %v", err)
161 | 		}
162 | 		done <- true
163 | 	}()
164 | 
165 | 	select {
166 | 	case <-done:
167 | 		// Test completed within timeout
168 | 	case <-time.After(30 * time.Second):
169 | 		t.Error("Analysis took too long, possible resource exhaustion")
170 | 	}
171 | }
172 | 
173 | func TestInputValidation(t *testing.T) {
174 | 	tmpDir, err := os.MkdirTemp("", "re-centris-security-*")
175 | 	if err != nil {
176 | 		t.Fatalf("Failed to create temp dir: %v", err)
177 | 	}
178 | 	defer os.RemoveAll(tmpDir)
179 | 
180 | 	cfg := &config.Config{
181 | 		Analysis: config.AnalysisConfig{
182 | 			OutputPath: tmpDir,
183 | 		},
184 | 	}
185 | 
186 | 	analyzer := analyzer.New(cfg)
187 | 
188 | 	invalidInputs := []struct {
189 | 		name string
190 | 		path string
191 | 	}{
192 | 		{"empty path", ""},
193 | 		{"space only", " "},
194 | 		{"invalid chars", string([]byte{0x00, 0x01, 0x02})},
195 | 		{"very long path", strings.Repeat("a", 4096)},
196 | 	}
197 | 
198 | 	for _, tc := range invalidInputs
 {
199 | 		t.Run(tc.name, func(t *testing.T) {
200 | 			err := analyzer.Analyze(tc.path)
201 | 			if err == nil {
202 | 				t.Errorf("Expected error for invalid input: %s", tc.name)
203 | 			}
204 | 		})
205 | 	}
206 | }
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | # Core dependencies
2 | tlsh==4.8.2
3 | PyYAML>=6.0.1
4 | psutil==5.9.5
5 | chardet>=4.0.0
6 | 
7 | # Data processing
8 | numpy>=1.20.0
9 | pandas>=1.3.0
10 | 
11 | # Web API
12 | flask>=2.0.0
13 | flask-cors>=3.0.10
14 | gunicorn>=20.1.0
15 | 
16 | # Database
17 | sqlalchemy>=1.4.0
18 | 
19 | # Testing tools
20 | pytest>=7.4.0
21 | pytest-cov>=4.1.0
22 | pytest-xdist>=3.3.0
23 | pytest-benchmark>=4.0.0
24 | pytest-mock>=3.11.0
25 | pytest-timeout>=2.1.0
26 | pytest-randomly>=3.13.0
27 | coverage>=7.3.0
28 | codecov>=2.1.0
29 | html-testRunner==1.2.1
30 | 
31 | # Code quality
32 | flake8>=6.1.0
33 | black>=23.7.0
34 | isort>=5.12.0
35 | mypy>=1.5.0
36 | pylint>=2.17.0
37 | bandit>=1.7.0
38 | safety>=2.3.0
39 | 
40 | # Documentation
41 | sphinx>=7.1.0
42 | sphinx-rtd-theme>=1.3.0
43 | sphinx-autodoc-typehints>=1.24.0
44 | sphinx-copybutton>=0.5.0
45 | 
46 | # Development and debugging
47 | ipython>=8.14.0
48 | ipdb>=0.13.0
49 | debugpy>=1.6.0
50 | build>=1.0.0
51 | twine>=4.0.0
52 | wheel>=0.41.0
53 | 
54 | # Performance profiling
55 | memory_profiler>=0.61.0
56 | line_profiler>=4.1.0
57 | py-spy>=0.3.0
58 | 
59 | # Type stubs
60 | types-PyYAML>=6.0.12.12
61 | types-psutil==5.9.5.17
62 | types-requests>=2.31.0
63 | types-setuptools>=68.0.0
64 | 
65 | # Code parsing
66 | javalang==0.13.0
67 | libclang==16.0.6
68 | 
69 | # Code clone detection
70 | scikit-learn>=1.0.0
71 | gensim>=4.0.0
72 | nltk>=3.6.0
73 | 
74 | # Version prediction
75 | scipy>=1.7.0
76 | statsmodels>=0.13.0
77 | xgboost>=1.5.0
78 | lightgbm>=3.3.0
79 | catboost>=1.0.0
--------------------------------------------------------------------------------
/scripts/deploy.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # Exit on the first error
4 | set -e
5 | 
6 | # Echo commands as they run
7 | set -x
8 | 
9 | # Check required environment variables
10 | if [ -z "$DOCKER_USERNAME" ] || [ -z "$DOCKER_PASSWORD" ]; then
11 |     echo "Error: DOCKER_USERNAME or DOCKER_PASSWORD not set"
12 |     exit 1
13 | fi
14 | 
15 | # Log in to Docker Hub
16 | echo "$DOCKER_PASSWORD" | docker login -u "$DOCKER_USERNAME" --password-stdin
17 | 
18 | # Build images
19 | docker-compose build
20 | 
21 | # Run tests and record the outcome explicitly ("set -e" would otherwise abort the script on failure before the check below ever ran)
22 | docker-compose run web pytest && tests_passed=true || tests_passed=false
23 | 
24 | # Push images only if the tests passed
25 | if [ "$tests_passed" = true ]; then
26 |     docker-compose push
27 | 
28 |     # Deploy to production
29 |     if [ "$DEPLOY_ENV" = "production" ]; then
30 |         # Back up the database
31 |         docker-compose exec postgres pg_dump -U re_centris re_centris > backup.sql
32 | 
33 |         # Stop the old containers
34 |         docker-compose down
35 | 
36 |         # Start the new containers
37 |         docker-compose up -d
38 | 
39 |         # Wait for services to start
40 |         sleep 30
41 | 
42 |         # Check service health
43 |         docker-compose ps | grep "Up" || {
44 |             echo "Error: Service failed to start"
45 |             docker-compose logs
46 |             exit 1
47 |         }
48 | 
49 |         # Run database migrations
50 |         docker-compose exec web python manage.py db upgrade
51 | 
52 |         echo "Deployment successful!"
53 |     else
54 |         echo "Skipping production deployment"
55 |     fi
56 | else
57 |     echo "Tests failed, aborting deployment"
58 |     exit 1
59 | fi
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
1 | """Re-Centris test package
2 | 
3 | This package contains all test cases for the Re-Centris project, including:
4 | 1. Unit tests - exercise each module's functionality in isolation
5 | 2. Integration tests - exercise interactions between modules
6 | 3. Performance tests - exercise system performance and resource usage
7 | 
8 | Author: byRen2002
9 | Last modified: March 2025
10 | License: MIT License
11 | """
--------------------------------------------------------------------------------
/tests/core/test_cache.py:
--------------------------------------------------------------------------------
1 | """Cache system test module
2 | 
3 | This module contains unit tests for the Cache class.
4 | 
5 | Author: byRen2002
6 | Last modified: March 2025
7 | License: MIT License
8 | """
9 | 
10 | import unittest
11 | import time
12 | from unittest.mock import patch, MagicMock
13 | import tempfile
14 | import os
15 | 
16 | from core.cache import Cache
17 | 
18 | class TestCache(unittest.TestCase):
19 |     """Test cases for the Cache class"""
20 | 
21 |     def setUp(self):
22 |         """Set up test fixtures"""
23 |         self.cache_size = 5
24 |         self.expire_time = 1  # expire after 1 second
25 |         self.cache = Cache(self.cache_size, self.expire_time)
26 | 
27 |     def test_basic_operations(self):
28 |         """Test basic cache operations"""
29 |         # Set and get
30 |         self.cache.set("key1", "value1")
31 |         self.assertEqual(self.cache.get("key1"), "value1")
32 | 
33 |         # Missing key
34 |         self.assertIsNone(self.cache.get("nonexistent"))
35 | 
36 |     def test_cache_size_limit(self):
37 |         """Test the cache size limit"""
38 |         # Add more items than the limit allows
39 |         for i in range(self.cache_size + 2):
40 |             self.cache.set(f"key{i}", f"value{i}")
41 | 
42 |         # The cache must not exceed its size limit
43 |         self.assertLessEqual(len(self.cache.cache), self.cache_size)
44 | 
45 |         # The oldest entry should have been evicted
46 |         self.assertIsNone(self.cache.get("key0"))
47 |         self.assertIsNotNone(self.cache.get(f"key{self.cache_size+1}"))
48 | 
49 |     def test_expiration(self):
50 |         """Test entry expiration"""
51 |         self.cache.set("expire_key", "expire_value")
52 | 
53 |         # Wait for the entry to expire
54 |         time.sleep(self.expire_time + 0.1)
55 | 
56 |         # The entry should be gone
57 |         self.assertIsNone(self.cache.get("expire_key"))
58 | 
59 |     def test_clear(self):
60 |         """Test clearing the cache"""
61 |         # Add some entries
62 |         self.cache.set("key1", "value1")
63 |         self.cache.set("key2", "value2")
64 | 
65 |         # Clear the cache
66 |         self.cache.clear()
67 | 
68 |         # The cache should be empty
69 |         self.assertEqual(len(self.cache.cache), 0)
70 |         self.assertEqual(len(self.cache.access_times), 0)
71 | 
72 |     def test_update_access_time(self):
73 |         """Test access-time updates"""
74 |         self.cache.set("key", "value")
75 |         first_access = self.cache.access_times["key"]
76 | 
77 |         # Wait briefly
78 |         time.sleep(0.1)
79 | 
80 |         # Access again
81 |         self.cache.get("key")
82 |         second_access = self.cache.access_times["key"]
83 | 
84 |         # The access time should have been updated
85 |         self.assertGreater(second_access, first_access)
86 | 
87 |     def test_persistence(self):
88 |         """Test cache persistence"""
89 |         # Create a temporary directory
90 |         temp_dir = tempfile.mkdtemp()
91 |         cache_file = os.path.join(temp_dir, "cache.db")
92 | 
93 |         try:
94 |             # Create a persistent cache
95 |             persistent_cache = Cache(
96 |                 self.cache_size,
97 |                 self.expire_time,
98 |                 persistent=True,
99 |                 cache_file=cache_file
100 |             )
101 | 
102 |             # Add data
103 |             persistent_cache.set("persist_key", "persist_value")
104 | 
105 |             # Close the cache
106 |             persistent_cache.close()
107 | 
108 |             # Recreate the cache and verify the data survived
109 |             new_cache = Cache(
110 |                 self.cache_size,
111 |                 self.expire_time,
112 |                 persistent=True,
113 |                 cache_file=cache_file
114 |             )
115 | 
116 |             self.assertEqual(new_cache.get("persist_key"), "persist_value")
117 | 
118 |         finally:
119 |             # Clean up
120 |             if os.path.exists(cache_file):
121 |                 os.remove(cache_file)
122 |             os.rmdir(temp_dir)
123 | 
124 |     def test_thread_safety(self):
125 |         """Test thread safety"""
126 |         import threading
127 | 
128 |         def worker():
129 |             for i in range(100):
130 |                 self.cache.set(f"thread_key_{i}", f"thread_value_{i}")
131 |                 self.cache.get(f"thread_key_{i}")
132 | 
133 |         # Several threads operate on the cache concurrently
134 |         threads = [threading.Thread(target=worker) for _ in range(4)]
135 | 
136 |         # Start all threads
137 |         for thread in threads:
138 |             thread.start()
139 | 
140 |         # Wait for all threads to finish
141 |         for thread in threads:
142 |             thread.join()
143 | 
144 |         # The cache should still be in a consistent state
145 |         self.assertLessEqual(len(self.cache.cache), self.cache_size)
146 | 
147 |     def test_invalid_inputs(self):
148 |         """Test handling of invalid constructor arguments"""
149 |         # Invalid cache size
150 |         with self.assertRaises(ValueError):
151 |             Cache(-1, self.expire_time)
152 | 
153 |         # Invalid expiry time
154 |         with self.assertRaises(ValueError):
155 |             Cache(self.cache_size, -1)
156 | 
157 |     def test_memory_management(self):
158 |         """Test memory management"""
159 |         large_data = "x" * 1024 * 1024  # 1MB of data
160 | 
161 |         # Add a large amount of data
162 |         for i in range(10):
163 |             self.cache.set(f"large_key_{i}", large_data)
164 | 
165 |         # The size limit should still hold
166 |         self.assertLessEqual(len(self.cache.cache), self.cache_size)
167 | 
168 | if __name__ == '__main__':
169 |     unittest.main()
--------------------------------------------------------------------------------
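The tests above pin down the behaviour expected of core/cache.Cache: bounded size with eviction of the least-recently-used entry, per-entry expiry, thread safety, and optional persistence. A minimal sketch of the in-memory core consistent with those tests; this is inferred from the test suite, not copied from the repository's implementation (which also handles persistence and richer validation):

import threading
import time

class LRUCacheSketch:
    """Bounded, expiring, thread-safe cache (illustrative only)."""

    def __init__(self, max_size: int, expire_time: float):
        if max_size <= 0 or expire_time <= 0:
            raise ValueError("max_size and expire_time must be positive")
        self.max_size = max_size
        self.expire_time = expire_time
        self.cache = {}          # key -> value
        self.access_times = {}   # key -> last access timestamp
        self._lock = threading.Lock()

    def set(self, key, value):
        with self._lock:
            if key not in self.cache and len(self.cache) >= self.max_size:
                # Evict the least recently used entry
                oldest = min(self.access_times, key=self.access_times.get)
                self.cache.pop(oldest)
                self.access_times.pop(oldest)
            self.cache[key] = value
            self.access_times[key] = time.time()

    def get(self, key):
        with self._lock:
            if key not in self.cache:
                return None
            if time.time() - self.access_times[key] > self.expire_time:
                # Expired: drop the entry and report a miss
                self.cache.pop(key)
                self.access_times.pop(key)
                return None
            self.access_times[key] = time.time()
            return self.cache[key]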
/tests/core/test_config_manager.py:
--------------------------------------------------------------------------------
1 | """Configuration manager test module
2 | 
3 | This module contains unit tests for the ConfigManager class.
4 | 
5 | Author: byRen2002
6 | Last modified: March 2025
7 | License: MIT License
8 | """
9 | 
10 | import unittest
11 | import os
12 | import tempfile
13 | import yaml
14 | from unittest.mock import patch, MagicMock
15 | 
16 | from core.config_manager import ConfigManager
17 | 
18 | class TestConfigManager(unittest.TestCase):
19 |     """Test cases for the ConfigManager class"""
20 | 
21 |     def setUp(self):
22 |         """Set up test fixtures"""
23 |         # Create a temporary config file
24 |         self.temp_dir = tempfile.mkdtemp()
25 |         self.config_file = os.path.join(self.temp_dir, "config.yaml")
26 | 
27 |         # Test configuration data
28 |         self.test_config = {
29 |             "paths": {
30 |                 "repo": "/path/to/repo",
31 |                 "results": "/path/to/results",
32 |                 "logs": "/path/to/logs"
33 |             },
34 |             "performance": {
35 |                 "max_workers": 4,
36 |                 "cache_size": 1000,
37 |                 "memory_limit": 1024,
38 |                 "timeout": 300
39 |             },
40 |             "logging": {
41 |                 "level": "INFO",
42 |                 "max_size": 10,
43 |                 "backup_count": 5
44 |             }
45 |         }
46 | 
47 |         # Write the test config
48 |         with open(self.config_file, 'w') as f:
49 |             yaml.dump(self.test_config, f)
50 | 
51 |         # Instantiate ConfigManager
52 |         self.config_manager = ConfigManager(self.config_file)
53 | 
54 |     def tearDown(self):
55 |         """Clean up after tests"""
56 |         # Remove temp files and directory
57 |         if os.path.exists(self.config_file):
58 |             os.remove(self.config_file)
59 |         os.rmdir(self.temp_dir)
60 | 
61 |     def test_load_config(self):
62 |         """Test config loading"""
63 |         # Values should be loaded correctly
64 |         self.assertEqual(
65 |             self.config_manager.get("paths.repo"),
66 |             "/path/to/repo"
67 |         )
68 |         self.assertEqual(
69 |             self.config_manager.get("performance.max_workers"),
70 |             4
71 |         )
72 | 
73 |     def test_get_nested_value(self):
74 |         """Test fetching nested config values"""
75 |         # Multi-level nesting
76 |         self.assertEqual(
77 |             self.config_manager.get("paths.repo"),
78 |             "/path/to/repo"
79 |         )
80 | 
81 |         # Missing path
82 |         self.assertIsNone(
83 |             self.config_manager.get("nonexistent.path")
84 |         )
85 | 
86 |         # Default value
87 |         self.assertEqual(
88 |             self.config_manager.get("nonexistent.path", "default"),
89 |             "default"
90 |         )
91 | 
92 |     def test_set_value(self):
93 |         """Test setting config values"""
94 |         # Set a new value
95 |         self.config_manager.set("paths.new_path", "/new/path")
96 | 
97 |         # It should be readable
98 |         self.assertEqual(
99 |             self.config_manager.get("paths.new_path"),
100 |             "/new/path"
101 |         )
102 | 
103 |         # Update an existing value
104 |         self.config_manager.set("paths.repo", "/updated/path")
105 |         self.assertEqual(
106 |             self.config_manager.get("paths.repo"),
107 |             "/updated/path"
108 |         )
109 | 
110 |     def test_save_config(self):
111 |         """Test saving the config"""
112 |         # Modify the config
113 |         self.config_manager.set("paths.new_path", "/new/path")
114 | 
115 |         # Save it
116 |         self.config_manager.save()
117 | 
118 |         # Reload and verify
119 |         new_config = ConfigManager(self.config_file)
120 |         self.assertEqual(
121 |             new_config.get("paths.new_path"),
122 |             "/new/path"
123 |         )
124 | 
125 |     def test_environment_override(self):
126 |         """Test environment-variable overrides"""
127 |         with patch.dict('os.environ', {
128 |             'RE_CENTRIS_PATHS_REPO': '/env/path',
129 |             'RE_CENTRIS_PERFORMANCE_MAX_WORKERS': '8'
130 |         }):
131 |             # Reload the config
132 |             config = ConfigManager(self.config_file)
133 | 
134 |             # Environment values should win
135 |             self.assertEqual(config.get("paths.repo"), "/env/path")
136 |             self.assertEqual(config.get("performance.max_workers"), 8)
137 | 
138 |     def test_validation(self):
139 |         """Test config validation"""
140 |         # Required fields
141 |         invalid_config = {"paths": {}}
142 |         with tempfile.NamedTemporaryFile(mode='w', delete=False) as f:
143 |             yaml.dump(invalid_config, f)
144 | 
145 |         with self.assertRaises(ValueError):
146 |             ConfigManager(f.name)
147 | 
148 |         os.unlink(f.name)
149 | 
150 |     def test_type_conversion(self):
151 |         """Test type conversion"""
152 |         # Numeric conversion
153 |         self.assertIsInstance(
154 |             self.config_manager.get("performance.max_workers"),
155 |             int
156 |         )
157 | 
158 |         # Boolean conversion
159 |         self.config_manager.set("feature.enabled", "true")
160 |         self.assertIsInstance(
161 |             self.config_manager.get("feature.enabled"),
162 |             bool
163 |         )
164 | 
165 |     def test_merge_configs(self):
166 |         """Test merging configs"""
167 |         # A second config file
168 |         other_config = {
169 |             "paths": {
170 |                 "temp": "/path/to/temp"
171 |             },
172 |             "new_section": {
173 |                 "key": "value"
174 |             }
175 |         }
176 | 
177 |         other_file = os.path.join(self.temp_dir, "other.yaml")
178 |         with open(other_file, 'w') as f:
179 |             yaml.dump(other_config, f)
180 | 
181 |         # Merge
182 |         self.config_manager.merge(other_file)
183 | 
184 |         # Verify the merge
185 |         self.assertEqual(
186 |             self.config_manager.get("paths.temp"),
187 |             "/path/to/temp"
188 |         )
189 |         self.assertEqual(
190 |             self.config_manager.get("new_section.key"),
191 |             "value"
192 |         )
193 | 
194 |         # Clean up
195 |         os.remove(other_file)
196 | 
197 | if __name__ == '__main__':
198 |     unittest.main()
--------------------------------------------------------------------------------
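test_environment_override above relies on a naming convention: RE_CENTRIS_<SECTION>_<KEY> overrides section.key, with values coerced to the target type. A sketch of that lookup, assuming the prefix and dot-path convention shown in the test (the helper name is illustrative, not from the repository):

import os

def env_override(dot_path: str, default=None, prefix: str = "RE_CENTRIS"):
    """Return the environment override for a config path like 'paths.repo'."""
    env_key = prefix + "_" + dot_path.replace(".", "_").upper()
    raw = os.environ.get(env_key)
    if raw is None:
        return default
    # Coerce common scalar types; other strings pass through unchanged.
    if raw.lower() in ("true", "false"):
        return raw.lower() == "true"
    try:
        return int(raw)
    except ValueError:
        return raw

# e.g. with RE_CENTRIS_PERFORMANCE_MAX_WORKERS=8 set,
# env_override("performance.max_workers") returns the integer 8.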
/tests/core/test_memory_optimizer.py:
--------------------------------------------------------------------------------
1 | """Memory optimizer test module
2 | 
3 | This module contains unit tests for the MemoryOptimizer class.
4 | 
5 | Author: byRen2002
6 | Last modified: March 2025
7 | License: MIT License
8 | """
9 | 
10 | import unittest
11 | import os
12 | import tempfile
13 | import psutil, yaml
14 | from unittest.mock import patch, MagicMock
15 | 
16 | from core.memory_optimizer import MemoryOptimizer
17 | from core.config_manager import ConfigManager
18 | 
19 | class TestMemoryOptimizer(unittest.TestCase):
20 |     """Test cases for the MemoryOptimizer class"""
21 | 
22 |     def setUp(self):
23 |         """Set up test fixtures"""
24 |         # Create a temporary config file
25 |         self.temp_dir = tempfile.mkdtemp()
26 |         self.config_file = os.path.join(self.temp_dir, "config.yaml")
27 | 
28 |         # Test configuration data
29 |         self.test_config = {
30 |             "memory": {
31 |                 "limit": 1024 * 1024 * 1024,      # 1GB
32 |                 "threshold": 0.8,                  # 80%
33 |                 "cleanup_threshold": 0.9,          # 90%
34 |                 "min_free": 512 * 1024 * 1024      # 512MB
35 |             }
36 |         }
37 | 
38 |         # Write the test config
39 |         with open(self.config_file, 'w') as f:
40 |             yaml.dump(self.test_config, f)
41 | 
42 |         # Instantiate ConfigManager
43 |         self.config_manager = ConfigManager(self.config_file)
44 | 
45 |         # Instantiate MemoryOptimizer
46 |         self.memory_optimizer = MemoryOptimizer(self.config_manager)
47 | 
48 |     def tearDown(self):
49 |         """Clean up after tests"""
50 |         # Remove temp files and directory
51 |         if os.path.exists(self.config_file):
52 |             os.remove(self.config_file)
53 |         os.rmdir(self.temp_dir)
54 | 
55 | 
56 |     def test_memory_check(self):
"""测试内存检查""" 57 | # 模拟内存使用情况 58 | with patch('psutil.virtual_memory') as mock_memory: 59 | # 模拟内存充足的情况 60 | mock_memory.return_value = MagicMock( 61 | total=8 * 1024 * 1024 * 1024, # 8GB总内存 62 | available=4 * 1024 * 1024 * 1024 # 4GB可用内存 63 | ) 64 | 65 | # 验证内存检查通过 66 | self.assertTrue( 67 | self.memory_optimizer.check_memory_available( 68 | 1024 * 1024 * 1024 # 需要1GB内存 69 | ) 70 | ) 71 | 72 | # 模拟内存不足的情况 73 | mock_memory.return_value = MagicMock( 74 | total=8 * 1024 * 1024 * 1024, # 8GB总内存 75 | available=256 * 1024 * 1024 # 256MB可用内存 76 | ) 77 | 78 | # 验证内存检查失败 79 | self.assertFalse( 80 | self.memory_optimizer.check_memory_available( 81 | 1024 * 1024 * 1024 # 需要1GB内存 82 | ) 83 | ) 84 | 85 | def test_memory_cleanup(self): 86 | """测试内存清理""" 87 | # 创建一些大对象来占用内存 88 | large_objects = [] 89 | for _ in range(5): 90 | large_objects.append(bytearray(100 * 1024 * 1024)) # 每个100MB 91 | 92 | # 记录清理前的内存使用 93 | before_cleanup = psutil.Process().memory_info().rss 94 | 95 | # 执行内存清理 96 | self.memory_optimizer.cleanup() 97 | 98 | # 记录清理后的内存使用 99 | after_cleanup = psutil.Process().memory_info().rss 100 | 101 | # 验证内存使用减少 102 | self.assertLess(after_cleanup, before_cleanup) 103 | 104 | def test_memory_monitoring(self): 105 | """测试内存监控""" 106 | # 启动监控 107 | self.memory_optimizer.start_monitoring() 108 | 109 | # 验证监控线程已启动 110 | self.assertTrue(self.memory_optimizer.is_monitoring()) 111 | 112 | # 停止监控 113 | self.memory_optimizer.stop_monitoring() 114 | 115 | # 验证监控线程已停止 116 | self.assertFalse(self.memory_optimizer.is_monitoring()) 117 | 118 | def test_memory_limit_enforcement(self): 119 | """测试内存限制执行""" 120 | # 测试超出内存限制 121 | with self.assertRaises(MemoryError): 122 | # 尝试分配超过限制的内存 123 | self.memory_optimizer.allocate_memory( 124 | self.test_config['memory']['limit'] * 2 125 | ) 126 | 127 | def test_memory_stats(self): 128 | """测试内存统计""" 129 | # 获取内存统计信息 130 | stats = self.memory_optimizer.get_memory_stats() 131 | 132 | # 验证统计信息的完整性 133 | self.assertIn('total', stats) 134 | self.assertIn('available', stats) 135 | self.assertIn('used', stats) 136 | self.assertIn('free', stats) 137 | self.assertIn('percent', stats) 138 | 139 | def test_optimization_strategies(self): 140 | """测试优化策略""" 141 | # 测试不同的优化级别 142 | strategies = [ 143 | 'minimal', # 最小优化 144 | 'moderate', # 中等优化 145 | 'aggressive' # 激进优化 146 | ] 147 | 148 | for strategy in strategies: 149 | # 设置优化策略 150 | self.memory_optimizer.set_optimization_strategy(strategy) 151 | 152 | # 验证策略设置成功 153 | self.assertEqual( 154 | self.memory_optimizer.get_current_strategy(), 155 | strategy 156 | ) 157 | 158 | def test_memory_pressure_handling(self): 159 | """测试内存压力处理""" 160 | # 模拟高内存压力情况 161 | with patch('psutil.virtual_memory') as mock_memory: 162 | mock_memory.return_value = MagicMock( 163 | percent=95.0 # 95%内存使用率 164 | ) 165 | 166 | # 触发内存压力处理 167 | self.memory_optimizer.handle_memory_pressure() 168 | 169 | # 验证是否触发了清理操作 170 | self.assertTrue(self.memory_optimizer.cleanup_triggered) 171 | 172 | def test_concurrent_memory_operations(self): 173 | """测试并发内存操作""" 174 | import threading 175 | 176 | def memory_worker(): 177 | # 执行一些内存操作 178 | for _ in range(10): 179 | # 分配和释放内存 180 | data = bytearray(10 * 1024 * 1024) # 10MB 181 | self.memory_optimizer.track_allocation(len(data)) 182 | del data 183 | self.memory_optimizer.track_deallocation(10 * 1024 * 1024) 184 | 185 | # 创建多个线程 186 | threads = [threading.Thread(target=memory_worker) for _ in range(4)] 187 | 188 | # 启动所有线程 189 | for thread in threads: 190 | thread.start() 191 | 192 | # 等待所有线程完成 193 | for thread in 
/tests/core/test_parallel_manager.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import time
3 | from typing import List
4 | from core.parallel_manager import ParallelManager
5 | 
6 | class TestParallelManager(unittest.TestCase):
7 |     """ParallelManager unit tests"""
8 | 
9 |     def setUp(self):
10 |         """Set up test fixtures"""
11 |         self.manager = ParallelManager(max_workers=2)
12 | 
13 |     def tearDown(self):
14 |         """Clean up after tests"""
15 |         self.manager.close_all()
16 | 
17 |     def test_process_items_empty(self):
18 |         """Process an empty list"""
19 |         result = self.manager.process_items([], lambda x: x)
20 |         self.assertEqual(result, [])
21 | 
22 |     def test_process_items_single_chunk(self):
23 |         """Process a single chunk"""
24 |         def square_numbers(nums: List[int]) -> List[int]:
25 |             return [x * x for x in nums]
26 | 
27 |         items = [1, 2, 3, 4, 5]
28 |         result = self.manager.process_items(
29 |             items=items,
30 |             process_func=square_numbers,
31 |             chunk_size=5
32 |         )
33 |         self.assertEqual(result, [1, 4, 9, 16, 25])
34 | 
35 |     def test_process_items_multiple_chunks(self):
36 |         """Process multiple chunks"""
37 |         def sum_numbers(nums: List[int]) -> int:
38 |             return sum(nums)
39 | 
40 |         items = list(range(100))
41 |         result = self.manager.process_items(
42 |             items=items,
43 |             process_func=sum_numbers,
44 |             chunk_size=10
45 |         )
46 |         self.assertEqual(sum(result), sum(range(100)))
47 | 
48 |     def test_process_items_with_threads(self):
49 |         """Process using a thread pool"""
50 |         def slow_increment(nums: List[int]) -> List[int]:
51 |             time.sleep(0.1)  # Simulate a slow operation
52 |             return [x + 1 for x in nums]
53 | 
54 |         items = list(range(10))
55 |         result = self.manager.process_items(
56 |             items=items,
57 |             process_func=slow_increment,
58 |             use_threads=True,
59 |             chunk_size=2
60 |         )
61 |         self.assertEqual(result, [x + 1 for x in range(10)])
62 | 
63 |     def test_process_items_with_progress(self):
64 |         """Process with a progress callback"""
65 |         progress_updates = []
66 | 
67 |         def track_progress(current: int, total: int):
68 |             progress_updates.append((current, total))
69 | 
70 |         def double_numbers(nums: List[int]) -> List[int]:
71 |             return [x * 2 for x in nums]
72 | 
73 |         items = list(range(5))
74 |         result = self.manager.process_items_with_progress(
75 |             items=items,
76 |             process_func=double_numbers,
77 |             progress_callback=track_progress,
78 |             chunk_size=1
79 |         )
80 | 
81 |         self.assertEqual(result, [x * 2 for x in range(5)])
82 |         self.assertEqual(len(progress_updates), 5)
83 |         self.assertEqual(progress_updates[-1], (5, 5))
84 | 
85 |     def test_error_handling(self):
86 |         """Test error handling"""
87 |         def failing_func(nums: List[int]) -> List[int]:
88 |             raise ValueError("test error")
89 | 
90 |         items = list(range(5))
91 |         result = self.manager.process_items(
92 |             items=items,
93 |             process_func=failing_func
94 |         )
95 |         self.assertEqual(result, [])
96 | 
97 |     def test_pool_management(self):
98 |         """Test pool management"""
99 |         # Process pool creation and shutdown
100 |         self.manager.process_items(
101 |             items=[1, 2, 3],
102 |             process_func=lambda x: x,
103 |             pool_name="test_pool"
104 |         )
105 |         self.assertIn("test_pool", self.manager._process_pools)
106 | 
107 |         # Closing a specific pool
108 |         self.manager.close_pool("test_pool")
109 |         self.assertNotIn("test_pool", self.manager._process_pools)
110 | 
111 |         # Closing all pools
112 |         self.manager.process_items(
113 |             items=[1, 2, 3],
            process_func=lambda x: x,
115 |             pool_name="another_pool"
116 |         )
117 |         self.manager.close_all()
118 |         self.assertEqual(len(self.manager._process_pools), 0)
119 |         self.assertEqual(len(self.manager._thread_pools), 0)
120 | 
121 |     def test_large_data_processing(self):
122 |         """Test large-data processing"""
123 |         items = list(range(10000))
124 | 
125 |         def process_chunk(nums: List[int]) -> List[int]:
126 |             return [x * x for x in nums]
127 | 
128 |         result = self.manager.process_items(
129 |             items=items,
130 |             process_func=process_chunk,
131 |             chunk_size=100
132 |         )
133 | 
134 |         self.assertEqual(len(result), 10000)
135 |         self.assertEqual(result[0], 0)
136 |         self.assertEqual(result[-1], 9999 * 9999)
137 | 
138 |     def test_concurrent_processing(self):
139 |         """Test concurrent processing"""
140 |         start_time = time.time()
141 | 
142 |         def slow_process(nums: List[int]) -> List[int]:
143 |             time.sleep(0.1)  # Simulate a slow operation
144 |             return nums
145 | 
146 |         items = list(range(20))
147 |         self.manager.process_items(
148 |             items=items,
149 |             process_func=slow_process,
150 |             chunk_size=2
151 |         )
152 | 
153 |         duration = time.time() - start_time
154 |         # 10 chunks x 0.1s is about 1s serially; with 2 workers this should take roughly half that
155 |         self.assertLess(duration, 1.0)  # serial execution would need about 1 second
156 | 
157 | if __name__ == '__main__':
158 |     unittest.main()
--------------------------------------------------------------------------------
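The behaviour pinned down above — split items into chunks, run process_func per chunk in a process or thread pool, flatten list results, and turn worker errors into an empty result — can be sketched as follows. This is a simplified illustration of the pattern, not the repository's ParallelManager (which also manages named pools and progress callbacks); note that with a process pool, process_func must be picklable (a top-level function, not a lambda):

from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor

def process_items_sketch(items, process_func, max_workers=2,
                         chunk_size=10, use_threads=False):
    """Apply process_func to chunks of items in parallel and merge results."""
    if not items:
        return []
    chunks = [items[i:i + chunk_size] for i in range(0, len(items), chunk_size)]
    pool_cls = ThreadPoolExecutor if use_threads else ProcessPoolExecutor
    results = []
    try:
        with pool_cls(max_workers=max_workers) as pool:
            for chunk_result in pool.map(process_func, chunks):
                # Chunk functions may return a list (extended into the output)
                # or a scalar (appended), matching both test styles above.
                if isinstance(chunk_result, list):
                    results.extend(chunk_result)
                else:
                    results.append(chunk_result)
    except Exception:
        # Mirrors test_error_handling: a failing worker yields an empty result
        return []
    return results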
/tests/core/test_performance_monitor.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/tests/core/test_resource_manager.py:
--------------------------------------------------------------------------------
1 | """Resource manager test module
2 | 
3 | This module contains unit tests for the ResourceManager class.
4 | 
5 | Author: byRen2002
6 | Last modified: March 2025
7 | License: MIT License
8 | """
9 | 
10 | import unittest
11 | import os
12 | import tempfile
13 | import shutil, time, yaml
14 | from unittest.mock import patch, MagicMock
15 | 
16 | from core.resource_manager import ResourceManager
17 | from core.config_manager import ConfigManager
18 | 
19 | class TestResourceManager(unittest.TestCase):
20 |     """Test cases for the ResourceManager class"""
21 | 
22 |     def setUp(self):
23 |         """Set up test fixtures"""
24 |         # Create temporary directories
25 |         self.temp_dir = tempfile.mkdtemp()
26 |         self.test_repo_path = os.path.join(self.temp_dir, "repos")
27 |         self.test_cache_path = os.path.join(self.temp_dir, "cache")
28 | 
29 |         # Test configuration
30 |         self.config = {
31 |             "paths": {
32 |                 "repo": self.test_repo_path,
33 |                 "cache": self.test_cache_path
34 |             },
35 |             "limits": {
36 |                 "max_repo_size": 1024 * 1024 * 100,   # 100MB
37 |                 "max_cache_size": 1024 * 1024 * 500   # 500MB
38 |             }
39 |         }
40 | 
41 |         # Write the config file
42 |         self.config_file = os.path.join(self.temp_dir, "config.yaml")
43 |         with open(self.config_file, 'w') as f:
44 |             yaml.dump(self.config, f)
45 | 
46 |         # Instantiate ConfigManager
47 |         self.config_manager = ConfigManager(self.config_file)
48 | 
49 |         # Instantiate ResourceManager
50 |         self.resource_manager = ResourceManager(self.config_manager)
51 | 
52 |     def tearDown(self):
53 |         """Clean up after tests"""
54 |         # Remove the temp dir and its contents
55 |         shutil.rmtree(self.temp_dir)
56 | 
57 |     def test_init_directories(self):
58 |         """Test directory initialization"""
59 |         # Both directories should have been created
60 |         self.assertTrue(os.path.exists(self.test_repo_path))
61 |         self.assertTrue(os.path.exists(self.test_cache_path))
62 | 
63 |     def test_check_disk_space(self):
64 |         """Test the disk space check"""
65 |         # Simulate insufficient disk space
66 |         with patch('psutil.disk_usage') as mock_disk_usage:
67 |             mock_disk_usage.return_value = MagicMock(
68 |                 free=1024 * 1024  # 1MB free
69 |             )
70 | 
71 |             with self.assertRaises(RuntimeError):
72 |                 self.resource_manager.check_disk_space(
73 |                     self.test_repo_path,
74 |                     required_space=1024 * 1024 * 10  # require 10MB
75 |                 )
76 | 
77 |     def test_cleanup_old_files(self):
78 |         """Test cleanup of old files"""
79 |         # Create test files
80 |         test_files = []
81 |         for i in range(5):
82 |             file_path = os.path.join(self.test_cache_path, f"test_{i}.txt")
83 |             with open(file_path, 'w') as f:
84 |                 f.write("test data")
85 |             test_files.append(file_path)
86 | 
87 |         # Age the files' access times
88 |         for i, file_path in enumerate(test_files):
89 |             access_time = time.time() - (i + 1) * 86400  # i+1 days ago
90 |             os.utime(file_path, (access_time, access_time))
91 | 
92 |         # Remove files older than 3 days
93 |         self.resource_manager.cleanup_old_files(
94 |             self.test_cache_path,
95 |             days=3
96 |         )
97 | 
98 |         # Verify the result
99 |         remaining_files = os.listdir(self.test_cache_path)
100 |         self.assertEqual(len(remaining_files), 3)  # 3 files should remain
101 | 
102 |     def test_monitor_resource_usage(self):
103 |         """Test resource usage monitoring"""
104 |         # Create some files to consume space
105 |         for i in range(10):
106 |             file_path = os.path.join(self.test_cache_path, f"large_{i}.txt")
107 |             with open(file_path, 'wb') as f:
108 |                 f.write(b'0' * 1024 * 1024)  # write 1MB
109 | 
110 |         # Query resource usage
111 |         usage = self.resource_manager.get_resource_usage()
112 | 
113 |         # Check the returned structure
114 |         self.assertIn('disk_usage', usage)
115 |         self.assertIn('memory_usage', usage)
116 |         self.assertIn('cpu_usage', usage)
117 | 
118 |     def test_resource_limits(self):
119 |         """Test resource limits"""
120 |         # Exceeding the repository size limit should raise
121 |         large_data = b'0' * (self.config['limits']['max_repo_size'] + 1024)
122 | 
123 |         with self.assertRaises(ValueError):
124 |             self.resource_manager.check_size_limit(
125 |                 len(large_data),
126 |                 'repo'
127 |             )
128 | 
129 |     def test_file_operations(self):
130 |         """Test file operations"""
131 |         # File write
132 |         test_data = b"test content"
133 |         test_file = os.path.join(self.test_cache_path, "test.txt")
134 | 
135 |         self.resource_manager.write_file(test_file, test_data)
136 |         self.assertTrue(os.path.exists(test_file))
137 | 
138 |         # File read
139 |         read_data = self.resource_manager.read_file(test_file)
140 |         self.assertEqual(read_data, test_data)
141 | 
142 |         # File delete
143 |         self.resource_manager.delete_file(test_file)
144 |         self.assertFalse(os.path.exists(test_file))
145 | 
146 |     def test_path_validation(self):
147 |         """Test path validation"""
148 |         # Paths that must be rejected
149 |         invalid_paths = [
150 |             "../outside.txt",
151 |             "/absolute/path/file.txt",
152 |             "../../etc/passwd"
153 |         ]
154 | 
155 |         for path in invalid_paths:
156 |             with self.assertRaises(ValueError):
157 |                 self.resource_manager.validate_path(path)
158 | 
159 |     def test_concurrent_access(self):
160 |         """Test concurrent access"""
161 |         import threading
162 | 
163 |         def worker():
164 |             # Perform some file operations
165 |             for i in range(10):
166 |                 file_path = os.path.join(
167 |                     self.test_cache_path,
168 |                     f"thread_{threading.get_ident()}_{i}.txt"
169 |                 )
170 |                 self.resource_manager.write_file(file_path, b"test")
171 |                 self.resource_manager.read_file(file_path)
172 |                 self.resource_manager.delete_file(file_path)
173 | 
174 |         # Create several threads
175 |         threads = [threading.Thread(target=worker) for _ in range(4)]
176 | 
177 |         # Start all threads
178 |         for thread in threads:
179 |             thread.start()
180 | 
181 |         # Wait for all threads to finish
182 |         for thread in threads:
183 |             thread.join()
184 | 
185 |         # No files should be left behind
186 |         remaining_files = os.listdir(self.test_cache_path)
187 |         self.assertEqual(len(remaining_files), 0)
188 | 
189 | if __name__ == '__main__':
190 |     unittest.main()
--------------------------------------------------------------------------------
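test_path_validation above expects validate_path to raise ValueError for parent-directory escapes and absolute paths — the same class of inputs the Go security suite probes in TestPathTraversal. A sketch of such a guard (an assumed shape; the repository's implementation may differ):

import os

def validate_path(path: str, base_dir: str = ".") -> str:
    """Reject absolute paths and any path escaping base_dir; return the resolved path."""
    if not path or os.path.isabs(path):
        raise ValueError(f"absolute or empty path not allowed: {path!r}")
    base = os.path.realpath(base_dir)
    resolved = os.path.realpath(os.path.join(base, path))
    # realpath collapses ".." segments, so an escape is visible as a prefix mismatch
    if resolved != base and not resolved.startswith(base + os.sep):
        raise ValueError(f"path escapes base directory: {path!r}")
    return resolved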
/tests/detector/test_detector.py:
--------------------------------------------------------------------------------
1 | """Detector test module
2 | 
3 | This module contains unit tests for the Detector class.
4 | 
5 | Author: byRen2002
6 | Last modified: March 2025
7 | License: MIT License
8 | """
9 | 
10 | import unittest
11 | import os
12 | import tempfile
13 | import json
14 | import tlsh, yaml, shutil, time
15 | from unittest.mock import patch, MagicMock
16 | 
17 | from detector.Detector import Detector
18 | from core.config_manager import ConfigManager
19 | 
20 | class TestDetector(unittest.TestCase):
21 |     """Test cases for the Detector class"""
22 | 
23 |     def setUp(self):
24 |         """Set up test fixtures"""
25 |         # Create temporary test directories
26 |         self.temp_dir = tempfile.mkdtemp()
27 |         self.test_data_dir = os.path.join(self.temp_dir, "test_data")
28 |         os.makedirs(self.test_data_dir)
29 | 
30 |         # Test configuration
31 |         self.config_file = os.path.join(self.temp_dir, "config.yaml")
32 |         self.test_config = {
33 |             "paths": {
34 |                 "repo": os.path.join(self.test_data_dir, "repos"),
35 |                 "results": os.path.join(self.test_data_dir, "results"),
36 |                 "components": os.path.join(self.test_data_dir, "components"),
37 |                 "logs": os.path.join(self.test_data_dir, "logs")
38 |             },
39 |             "detection": {
40 |                 "tlsh_threshold": 30,
41 |                 "similarity_threshold": 0.8,
42 |                 "min_component_size": 100,
43 |                 "max_workers": 4
44 |             },
45 |             "logging": {
46 |                 "level": "INFO",
47 |                 "file": "detector.log"
48 |             }
49 |         }
50 | 
51 |         with open(self.config_file, 'w') as f:
52 |             yaml.dump(self.test_config, f)
53 | 
54 |         # Create required directories
55 |         for path in self.test_config["paths"].values():
56 |             os.makedirs(path, exist_ok=True)
57 | 
58 |         # Generate test data
59 |         self._create_test_data()
60 | 
61 |         # Instantiate the Detector
62 |         self.config_manager = ConfigManager(self.config_file)
63 |         self.detector = Detector(self.config_manager)
64 | 
65 |     def tearDown(self):
66 |         """Clean up after tests"""
67 |         # Remove the temp dir and its contents
68 |         shutil.rmtree(self.temp_dir)
69 | 
70 |     def _create_test_data(self):
71 |         """Create test data"""
72 |         # Component database
73 |         component_db = os.path.join(
74 |             self.test_config["paths"]["components"],
75 |             "test_component.json"
76 |         )
77 | 
78 |         test_functions = {
79 |             tlsh.hash(b"function1"): {
80 |                 "name": "test_func1",
81 |                 "file": "test1.py",
82 |                 "component": "component1"
83 |             },
84 |             tlsh.hash(b"function2"): {
85 |                 "name": "test_func2",
86 |                 "file": "test2.py",
87 |                 "component": "component2"
88 |             }
89 |         }
90 | 
91 |         with open(component_db, 'w') as f:
92 |             json.dump(test_functions, f)
93 | 
94 |         # Test source file
95 |         test_code = os.path.join(
96 |             self.test_config["paths"]["repo"],
97 |             "test_code.py"
98 |         )
99 | 
100 |         with open(test_code, 'w') as f:
101 |             f.write("def test_func1():\n    return 'test1'\n\n")
102 |             f.write("def test_func2():\n    return 'test2'\n")
103 | 
104 |     def test_initialization(self):
105 |         """Test initialization"""
106 |         # Config should be loaded
107 |         self.assertIsNotNone(self.detector.config)
108 | 
109 |         # Logging should be set up
110 |         self.assertTrue(os.path.exists(
111 |             os.path.join(self.test_config["paths"]["logs"], "detector.log")
112 |         ))
113 | 
114 |     def test_tlsh_computation(self):
115 |         """Test TLSH computation"""
116 |         # Hash a small test function
117 |         code = "def test_function():\n    return 'test'\n"
118 |         hash_value = self.detector.compute_tlsh(code)
119 | 
120 |         # The digest should be a non-empty string
121 |         self.assertIsInstance(hash_value, str)
122 |         self.assertGreater(len(hash_value), 0)
123 | 
124 |     def test_component_detection(self):
125 |         """Test component detection"""
126 |         # Run detection
127 |         results = self.detector.detect(
128 |             os.path.join(self.test_config["paths"]["repo"], "test_code.py")
129 |         )
130 | 
131 |         # Verify the detection results
132 |         self.assertIsInstance(results, dict)
133 |         self.assertIn("matches", results)
134 |         self.assertIn("statistics", results)
135 | 
136 |     def test_similarity_calculation(self):
137 |         """Test similarity calculation"""
138 |         # TLSH distance between two similar functions
139 |         code1 = "def test_function():\n    return 'test1'\n"
return 'test2'\n" 141 | 142 | hash1 = self.detector.compute_tlsh(code1) 143 | hash2 = self.detector.compute_tlsh(code2) 144 | 145 | diff = self.detector.compute_tlsh_diff(hash1, hash2) 146 | 147 | # 验证差异值在合理范围内 148 | self.assertIsInstance(diff, int) 149 | self.assertGreaterEqual(diff, 0) 150 | self.assertLessEqual(diff, 1000) 151 | 152 | def test_parallel_processing(self): 153 | """测试并行处理""" 154 | # 创建多个测试文件 155 | for i in range(10): 156 | test_file = os.path.join( 157 | self.test_config["paths"]["repo"], 158 | f"test_code_{i}.py" 159 | ) 160 | with open(test_file, 'w') as f: 161 | f.write(f"def test_func_{i}():\n return 'test{i}'\n") 162 | 163 | # 执行并行检测 164 | results = self.detector.detect_batch( 165 | self.test_config["paths"]["repo"] 166 | ) 167 | 168 | # 验证结果 169 | self.assertEqual(len(results), 10) 170 | 171 | def test_cache_mechanism(self): 172 | """测试缓存机制""" 173 | # 第一次检测 174 | file_path = os.path.join( 175 | self.test_config["paths"]["repo"], 176 | "test_code.py" 177 | ) 178 | 179 | start_time = time.time() 180 | first_result = self.detector.detect(file_path) 181 | first_time = time.time() - start_time 182 | 183 | # 第二次检测(应该使用缓存) 184 | start_time = time.time() 185 | second_result = self.detector.detect(file_path) 186 | second_time = time.time() - start_time 187 | 188 | # 验证结果一致性和性能提升 189 | self.assertEqual(first_result, second_result) 190 | self.assertLess(second_time, first_time) 191 | 192 | def test_error_handling(self): 193 | """测试错误处理""" 194 | # 测试不存在的文件 195 | with self.assertRaises(FileNotFoundError): 196 | self.detector.detect("nonexistent_file.py") 197 | 198 | # 测试无效的组件数据库 199 | with open(os.path.join( 200 | self.test_config["paths"]["components"], 201 | "invalid.json" 202 | ), 'w') as f: 203 | f.write("invalid json") 204 | 205 | with self.assertRaises(json.JSONDecodeError): 206 | self.detector.load_component_db("invalid.json") 207 | 208 | def test_memory_management(self): 209 | """测试内存管理""" 210 | import psutil 211 | process = psutil.Process() 212 | 213 | # 记录初始内存使用 214 | initial_memory = process.memory_info().rss 215 | 216 | # 处理大量数据 217 | for i in range(100): 218 | test_file = os.path.join( 219 | self.test_config["paths"]["repo"], 220 | f"large_test_{i}.py" 221 | ) 222 | with open(test_file, 'w') as f: 223 | for j in range(1000): 224 | f.write(f"def test_func_{i}_{j}():\n return 'test'\n") 225 | 226 | self.detector.detect_batch(self.test_config["paths"]["repo"]) 227 | 228 | # 记录最终内存使用 229 | final_memory = process.memory_info().rss 230 | 231 | # 验证内存增长在合理范围内 232 | memory_growth = (final_memory - initial_memory) / (1024 * 1024) # MB 233 | self.assertLess(memory_growth, 1000) # 内存增长应小于1GB 234 | 235 | def test_performance_monitoring(self): 236 | """测试性能监控""" 237 | # 启用性能监控 238 | self.detector.enable_performance_monitoring() 239 | 240 | # 执行一些操作 241 | self.detector.detect( 242 | os.path.join(self.test_config["paths"]["repo"], "test_code.py") 243 | ) 244 | 245 | # 获取性能统计 246 | stats = self.detector.get_performance_stats() 247 | 248 | # 验证统计信息 249 | self.assertIn("processing_time", stats) 250 | self.assertIn("memory_usage", stats) 251 | self.assertIn("cpu_usage", stats) 252 | 253 | def test_result_export(self): 254 | """测试结果导出""" 255 | # 执行检测 256 | results = self.detector.detect( 257 | os.path.join(self.test_config["paths"]["repo"], "test_code.py") 258 | ) 259 | 260 | # 导出结果 261 | export_file = os.path.join( 262 | self.test_config["paths"]["results"], 263 | "test_results.json" 264 | ) 265 | self.detector.export_results(results, export_file) 266 | 267 | # 验证导出文件 268 | 
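The TLSH workflow these tests exercise is small enough to show end to end. A short example using the py-tlsh API that the test file itself imports — tlsh.hash over bytes, tlsh.diff between two digests. Note that TLSH needs a reasonable amount of input (roughly 50+ bytes with enough variation) to produce a digest; too-short or too-uniform input yields the placeholder value "TNULL":

import tlsh

code1 = b"def add(a, b):\n    total = a + b\n    print('sum', total)\n    return total\n"
code2 = b"def add(x, y):\n    total = x + y\n    print('sum', total)\n    return total\n"

h1 = tlsh.hash(code1)
h2 = tlsh.hash(code2)

# Smaller distance means more similar; detection applies a threshold to this
# value (e.g. the tlsh_threshold of 30 in the test config above).
distance = tlsh.diff(h1, h2)
print(h1, h2, distance)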
/tests/detector/test_version_predictor.py:
--------------------------------------------------------------------------------
1 | """Version predictor test module
2 | 
3 | This module implements test cases for the version predictor.
4 | 
5 | Author: byRen2002
6 | Last modified: March 2025
7 | License: MIT License
8 | """
9 | 
10 | import unittest
11 | from datetime import datetime, timedelta
12 | import numpy as np
13 | from detector.version_predictor import VersionPredictor
14 | 
15 | class TestVersionPredictor(unittest.TestCase):
16 |     """Version predictor tests"""
17 | 
18 |     def setUp(self):
19 |         """Set up test fixtures"""
20 |         self.predictor = VersionPredictor()
21 | 
22 |         # Generate test data
23 |         self.training_data = self._generate_test_data()
24 |         self.version_dates = self._generate_version_dates()
25 | 
26 |     def _generate_test_data(self):
27 |         """Generate test samples"""
28 |         return [
29 |             {
30 |                 'lines_added': 100,
31 |                 'lines_deleted': 50,
32 |                 'files_changed': 5,
33 |                 'commit_frequency': 10,
34 |                 'author_experience': 100,
35 |                 'commit_time': datetime.now() - timedelta(days=30),
36 |                 'content': 'def test_function():\n    pass'
37 |             },
38 |             {
39 |                 'lines_added': 200,
40 |                 'lines_deleted': 100,
41 |                 'files_changed': 10,
42 |                 'commit_frequency': 15,
43 |                 'author_experience': 150,
44 |                 'commit_time': datetime.now() - timedelta(days=20),
45 |                 'content': 'class TestClass:\n    def method(self):\n        pass'
46 |             }
47 |         ]
48 | 
49 |     def _generate_version_dates(self):
50 |         """Generate version dates"""
51 |         base_date = datetime.now() - timedelta(days=60)
52 |         return [
53 |             base_date + timedelta(days=i*15)
54 |             for i in range(5)
55 |         ]
56 | 
57 |     def test_initialization(self):
58 |         """Test initialization"""
59 |         self.assertIsNotNone(self.predictor)
60 |         self.assertIsNotNone(self.predictor.models)
61 |         self.assertIsNotNone(self.predictor.scaler)
62 | 
63 |     def test_feature_extraction(self):
64 |         """Test feature extraction"""
65 |         features = self.predictor._extract_features(self.training_data)
66 | 
67 |         self.assertIsInstance(features, np.ndarray)
68 |         self.assertEqual(len(features), len(self.training_data))
69 | 
70 |     def test_time_feature_extraction(self):
71 |         """Test temporal feature extraction"""
72 |         time_features = self.predictor._extract_time_features(
73 |             self.training_data[0]
74 |         )
75 | 
76 |         self.assertIsInstance(time_features, list)
77 |         self.assertEqual(len(time_features), 4)  # four temporal features
78 | 
79 |     def test_time_interval_computation(self):
80 |         """Test time-interval computation"""
81 |         intervals = self.predictor._compute_time_intervals(
82 |             self.version_dates
83 |         )
84 | 
85 |         self.assertIsInstance(intervals, np.ndarray)
86 |         self.assertEqual(len(intervals), len(self.version_dates) - 1)
87 | 
88 |     def test_model_training(self):
89 |         """Test model training"""
90 |         self.predictor.train(
91 |             self.training_data,
92 |             self.version_dates
93 |         )
94 | 
95 |         # Every model should now be fitted
96 |         for model in self.predictor.models.values():
97 |             self.assertTrue(hasattr(model, 'predict'))
98 | 
99 |     def test_version_prediction(self):
100 |         """Test version prediction"""
101 |         # Train first
102 |         self.predictor.train(
103 |             self.training_data,
104 |             self.version_dates
105 |         )
106 | 
107 |         # Predict
108 |         prediction = self.predictor.predict(self.training_data)
109 | 
110 |         self.assertIsInstance(prediction, dict)
111 |         self.assertIn('predicted_interval', prediction)
112 |         self.assertIn('confidence_interval', prediction)
113 |         self.assertIn('model_contributions', prediction)
114 | 
115 |     def 
test_model_update(self):
116 |         """Test model updating"""
117 |         # Train first
118 |         self.predictor.train(
119 |             self.training_data,
120 |             self.version_dates
121 |         )
122 | 
123 |         # Prepare new data
124 |         new_data = [{
125 |             'lines_added': 150,
126 |             'lines_deleted': 75,
127 |             'files_changed': 7,
128 |             'commit_frequency': 12,
129 |             'author_experience': 120,
130 |             'commit_time': datetime.now() - timedelta(days=10),
131 |             'content': 'def new_function():\n    return True'
132 |         }]
133 | 
134 |         new_date = datetime.now()
135 | 
136 |         # Update the model
137 |         self.predictor.update(new_data, new_date)
138 | 
139 |         # Prediction should still work after the update
140 |         prediction = self.predictor.predict(new_data)
141 |         self.assertIsInstance(prediction, dict)
142 | 
143 |     def test_model_evaluation(self):
144 |         """Test model evaluation"""
145 |         # Train first
146 |         self.predictor.train(
147 |             self.training_data,
148 |             self.version_dates
149 |         )
150 | 
151 |         # Prepare evaluation data
152 |         test_data = self._generate_test_data()
153 |         test_dates = [
154 |             datetime.now() + timedelta(days=i*15)
155 |             for i in range(3)
156 |         ]
157 | 
158 |         # Evaluate
159 |         metrics = self.predictor.evaluate(test_data, test_dates)
160 | 
161 |         self.assertIsInstance(metrics, dict)
162 |         for model_metrics in metrics.values():
163 |             self.assertIn('mse', model_metrics)
164 |             self.assertIn('rmse', model_metrics)
165 |             self.assertIn('r2', model_metrics)
166 | 
167 |     def test_confidence_interval(self):
168 |         """Test confidence-interval computation"""
169 |         X = np.array([[1, 2, 3], [4, 5, 6]])
170 |         predictions = [10, 12, 15]
171 | 
172 |         interval = self.predictor._compute_confidence_interval(
173 |             X,
174 |             predictions
175 |         )
176 | 
177 |         self.assertIsInstance(interval, tuple)
178 |         self.assertEqual(len(interval), 2)
179 |         self.assertLess(interval[0], interval[1])
180 | 
181 |     def test_error_handling(self):
182 |         """Test error handling"""
183 |         # Empty input
184 |         empty_prediction = self.predictor.predict([])
185 |         self.assertEqual(empty_prediction, {})
186 | 
187 |         # Invalid input
188 |         invalid_data = [{'invalid_key': 'value'}]
189 |         features = self.predictor._extract_features(invalid_data)
190 |         self.assertEqual(len(features), 1)
191 | 
192 |         # Invalid dates
193 |         invalid_dates = []
194 |         intervals = self.predictor._compute_time_intervals(invalid_dates)
195 |         self.assertEqual(len(intervals), 0)
196 | 
197 | if __name__ == '__main__':
198 |     unittest.main()
--------------------------------------------------------------------------------
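test_confidence_interval above only pins down the shape of _compute_confidence_interval: a (low, high) tuple with low < high. A common construction consistent with that, sketched with a normal approximation over the per-model predictions (the predictor's actual formula may differ):

import numpy as np

def compute_confidence_interval(predictions, z: float = 1.96):
    """Normal-approximation interval around the mean of model predictions."""
    preds = np.asarray(predictions, dtype=float)
    mean = preds.mean()
    # z = 1.96 corresponds to a two-sided 95% interval
    margin = z * preds.std(ddof=1) / np.sqrt(len(preds))
    return (mean - margin, mean + margin)

# e.g. compute_confidence_interval([10, 12, 15]) -> roughly (9.5, 15.2)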
/tests/integration/test_clone_detection.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import os
3 | import tempfile
4 | import shutil
5 | import json
6 | from preprocessor.preprocessor import Preprocessor
7 | from detector.Detector import Detector
8 | 
9 | class TestCloneDetection(unittest.TestCase):
10 |     """Clone detection integration tests"""
11 | 
12 |     @classmethod
13 |     def setUpClass(cls):
14 |         """Initialize the test class"""
15 |         # Create a temporary working directory
16 |         cls.work_dir = tempfile.mkdtemp()
17 | 
18 |         # Build the test project layout
19 |         cls._create_test_project()
20 | 
21 |     @classmethod
22 |     def tearDownClass(cls):
23 |         """Clean up the test class"""
24 |         shutil.rmtree(cls.work_dir)
25 | 
26 |     @classmethod
27 |     def _create_test_project(cls):
28 |         """Create the test project"""
29 |         # Directory layout
30 |         dirs = [
31 |             "input/project1/src",
32 |             "input/project2/src",
33 |             "preprocessor/result",
34 |             "preprocessor/initialSigs",
35 |             "preprocessor/componentDB",
36 |             "preprocessor/metaInfos",
37 |             "detector/result"
38 |         ]
39 | 
40 |         for dir_path in dirs:
41 |             os.makedirs(os.path.join(cls.work_dir, dir_path))
42 | 
43 |         # Source files
44 |         cls._create_test_files()
45 | 
46 |     @classmethod
47 |     def _create_test_files(cls):
48 |         """Create test files"""
49 |         # Source file for project 1
50 |         project1_file = os.path.join(cls.work_dir, "input/project1/src/main.cpp")
51 |         with open(project1_file, 'w') as f:
52 |             f.write("""
53 | int add(int a, int b) {
54 |     return a + b;
55 | }
56 | 
57 | int subtract(int a, int b) {
58 |     return a - b;
59 | }
60 | 
61 | int main() {
62 |     int x = 10, y = 5;
63 |     printf("%d\\n", add(x, y));
64 |     printf("%d\\n", subtract(x, y));
65 |     return 0;
66 | }
67 | """)
68 | 
69 |         # Source file for project 2 (contains cloned code)
70 |         project2_file = os.path.join(cls.work_dir, "input/project2/src/calculator.cpp")
71 |         with open(project2_file, 'w') as f:
72 |             f.write("""
73 | // Cloned add function
74 | int add(int a, int b) {
75 |     return a + b;
76 | }
77 | 
78 | // Modified subtract function
79 | int subtract(int x, int y) {
80 |     int result = x - y;
81 |     return result;
82 | }
83 | 
84 | // New multiply function
85 | int multiply(int a, int b) {
86 |     return a * b;
87 | }
88 | 
89 | int main() {
90 |     int a = 20, b = 10;
91 |     printf("%d\\n", add(a, b));
92 |     printf("%d\\n", subtract(a, b));
93 |     printf("%d\\n", multiply(a, b));
94 |     return 0;
95 | }
96 | """)
97 | 
98 |     def setUp(self):
99 |         """Prepare each test"""
100 |         # Initialize preprocessor and detector
101 |         self.preprocessor = Preprocessor()
102 |         self.detector = Detector()
103 | 
104 |         # Point both at the working directory
105 |         self.preprocessor.config.set_base_path(self.work_dir)
106 |         self.detector.base_path = self.work_dir
107 | 
108 |     def test_end_to_end_clone_detection(self):
109 |         """End-to-end clone detection"""
110 |         try:
111 |             # 1. Run preprocessing
112 |             self.preprocessor.run()
113 | 
114 |             # Verify the preprocessing output
115 |             self._verify_preprocessing()
116 | 
117 |             # 2. Run clone detection
118 |             self.detector.detect(
119 |                 os.path.join(self.work_dir, "input/project2"),
120 |                 "project2"
121 |             )
122 | 
123 |             # Verify the detection output
124 |             self._verify_detection()
125 | 
126 |         except Exception as e:
127 |             self.fail(f"End-to-end test failed: {str(e)}")
128 | 
129 |     def _verify_preprocessing(self):
130 |         """Verify the preprocessing output"""
131 |         # Initial signatures
132 |         initial_sigs_file = os.path.join(
133 |             self.work_dir,
134 |             "preprocessor/initialSigs/initialSigs.json"
135 |         )
136 |         self.assertTrue(os.path.exists(initial_sigs_file))
137 | 
138 |         with open(initial_sigs_file, 'r') as f:
139 |             sigs = json.load(f)
140 |             self.assertGreater(len(sigs), 0)
141 | 
142 |         # Component database
143 |         comp_db_dir = os.path.join(self.work_dir, "preprocessor/componentDB")
144 |         self.assertTrue(os.path.exists(comp_db_dir))
145 |         self.assertGreater(len(os.listdir(comp_db_dir)), 0)
146 | 
147 |     def _verify_detection(self):
148 |         """Verify the detection output"""
149 |         # Result file
150 |         result_file = os.path.join(
151 |             self.work_dir,
152 |             "detector/result/result_project2"
153 |         )
154 |         self.assertTrue(os.path.exists(result_file))
155 | 
156 |         with open(result_file, 'r') as f:
157 |             results = f.readlines()
158 |             self.assertGreater(len(results), 0)
159 | 
160 |             # Parse the results
161 |             for result in results:
162 |                 parts = result.strip().split('\t')
163 |                 self.assertEqual(len(parts), 7)  # seven tab-separated fields
164 | 
165 |                 # Field checks
166 |                 project, repo, version, used, unused, modified, str_change = parts
167 |                 self.assertEqual(project, "project2")
168 |                 self.assertGreater(int(used), 0)  # at least one used function expected
169 | 
170 |     def test_incremental_detection(self):
171 |         """Incremental detection"""
172 |         # First pass
173 |         self.preprocessor.run()
174 |         self.detector.detect(
175 |             os.path.join(self.work_dir, "input/project2"),
176 |             "project2"
177 |         )
178 | 
179 |         # Modify the source
180 |         project2_file = os.path.join(self.work_dir, "input/project2/src/calculator.cpp")
181 |         with open(project2_file, 'a') as f:
182 |             f.write("""
183 | // Newly added divide function
184 | float divide(int a, int b) {
185 |     return a / (float)b;
186 | }
187 | """)
188 | 
189 |         # Second pass
190 |         self.preprocessor.run()
        self.detector.detect(
192 |             os.path.join(self.work_dir, "input/project2"),
193 |             "project2"
194 |         )
195 | 
196 |         # Verify that the results changed
197 |         result_file = os.path.join(
198 |             self.work_dir,
199 |             "detector/result/result_project2"
200 |         )
201 |         with open(result_file, 'r') as f:
202 |             results = f.readlines()
203 |             last_result = results[-1].strip().split('\t')
204 |             self.assertGreater(int(last_result[3]), 0)  # used
205 |             self.assertGreater(int(last_result[4]), 0)  # unused
206 | 
207 |     def test_error_conditions(self):
208 |         """Error conditions"""
209 |         # Empty project
210 |         empty_dir = os.path.join(self.work_dir, "input/empty_project")
211 |         os.makedirs(empty_dir)
212 | 
213 |         try:
214 |             self.detector.detect(empty_dir, "empty_project")
215 |         except Exception as e:
216 |             self.fail(f"Empty project handling failed: {str(e)}")
217 | 
218 |         # Invalid file
219 |         invalid_dir = os.path.join(self.work_dir, "input/invalid_project")
220 |         os.makedirs(invalid_dir)
221 |         with open(os.path.join(invalid_dir, "invalid.cpp"), 'w') as f:
222 |             f.write("This is not valid C++ code")
223 | 
224 |         try:
225 |             self.detector.detect(invalid_dir, "invalid_project")
226 |         except Exception as e:
227 |             self.fail(f"Invalid file handling failed: {str(e)}")
228 | 
229 |     def test_performance(self):
230 |         """Performance test"""
231 |         import time
232 | 
233 |         # Build a large test project
234 |         large_project_dir = os.path.join(self.work_dir, "input/large_project/src")
235 |         os.makedirs(large_project_dir)
236 | 
237 |         # Generate many source files
238 |         for i in range(100):
239 |             with open(os.path.join(large_project_dir, f"file{i}.cpp"), 'w') as f:
240 |                 f.write(f"""
241 | int func{i}(int x) {{
242 |     return x * {i};
243 | }}
244 | """)
245 | 
246 |         # Measure processing time
247 |         start_time = time.time()
248 | 
249 |         self.preprocessor.run()
250 |         self.detector.detect(
251 |             os.path.join(self.work_dir, "input/large_project"),
252 |             "large_project"
253 |         )
254 | 
255 |         duration = time.time() - start_time
256 | 
257 |         # Should finish within 60 seconds
258 |         self.assertLess(duration, 60)
259 | 
260 | if __name__ == '__main__':
261 |     unittest.main()
--------------------------------------------------------------------------------
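_verify_detection above fixes the detector's result format: one tab-separated line per detection with seven fields. A small illustration of consuming that format, with field names taken from the unpacking in the test (the example input values are made up):

def parse_result_line(line: str) -> dict:
    """Parse one detector result line into named fields."""
    project, repo, version, used, unused, modified, str_change = line.strip().split("\t")
    return {
        "project": project,
        "repo": repo,
        "version": version,
        "used": int(used),        # functions reused unchanged
        "unused": int(unused),    # functions from the component not present
        "modified": int(modified),
        "str_change": str_change,
    }

# e.g. parse_result_line("project2\tgoogletest\tv1.0\t3\t1\t1\t0.12")["used"] == 3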
/tests/preprocessor/test_java_processor.py:
--------------------------------------------------------------------------------
1 | """Java processor test module
2 | 
3 | This module contains unit tests for the JavaProcessor class.
4 | 
5 | Author: byRen2002
6 | Last modified: March 2025
7 | License: MIT License
8 | """
9 | 
10 | import unittest
11 | import os
12 | import tempfile
13 | import shutil
14 | from preprocessor.language_processors.java_processor import JavaProcessor
15 | 
16 | class TestJavaProcessor(unittest.TestCase):
17 |     """Test cases for the JavaProcessor class"""
18 | 
19 |     def setUp(self):
20 |         """Set up test fixtures"""
21 |         self.processor = JavaProcessor()
22 |         self.temp_dir = tempfile.mkdtemp()
23 | 
24 |         # Create a test Java file
25 |         self.test_file = os.path.join(self.temp_dir, "TestClass.java")
26 |         self._create_test_file()
27 | 
28 |     def tearDown(self):
29 |         """Clean up after tests"""
30 |         shutil.rmtree(self.temp_dir)
31 | 
32 |     def _create_test_file(self):
33 |         """Create the test Java file"""
34 |         test_code = '''
35 | package com.example.test;
36 | 
37 | import java.util.List;
38 | import java.util.ArrayList;
39 | 
40 | public class TestClass extends BaseClass implements TestInterface {
41 |     private String name;
42 |     private int age;
43 | 
44 |     public TestClass(String name, int age) {
45 |         this.name = name;
46 |         this.age = age;
47 |     }
48 | 
49 |     public String getName() {
50 |         return name;
51 |     }
52 | 
53 |     public void setName(String name) {
54 |         this.name = name;
55 |     }
56 | 
57 |     public int calculateComplexity(int n) {
58 |         int result = 0;
59 |         if (n > 0) {
60 |             for (int i = 0; i < n; i++) {
61 |                 if (i % 2 == 0) {
62 |                     result += i;
63 |                 } else {
64 |                     result -= i;
65 |                 }
66 |                 while (result > 100) {
67 |                     result /= 2;
68 |                 }
69 |             }
70 |         }
71 |         return result;
72 |     }
73 | 
74 |     private List<String> processItems(List<String> items) {
75 |         List<String> results = new ArrayList<>();
76 |         for (String item : items) {
77 |             if (item != null && !item.isEmpty()) {
78 |                 results.add(item.toUpperCase());
79 |             }
80 |         }
81 |         return results;
82 |     }
83 | }
84 | '''
85 |         with open(self.test_file, 'w', encoding='utf-8') as f:
86 |             f.write(test_code)
87 | 
88 |     def test_extract_methods(self):
89 |         """Test method extraction"""
90 |         methods = self.processor.extract_methods(self.test_file)
91 | 
92 |         # Method count
93 |         self.assertEqual(len(methods), 5)  # constructor + 4 methods
94 | 
95 |         # Method names
96 |         method_names = [m['name'] for m in methods]
97 |         expected_names = [
98 |             'TestClass',  # constructor
99 |             'getName',
100 |             'setName',
101 |             'calculateComplexity',
102 |             'processItems'
103 |         ]
104 |         self.assertEqual(sorted(method_names), sorted(expected_names))
105 | 
106 |         # Per-method attributes
107 |         for method in methods:
108 |             self.assertIn('name', method)
109 |             self.assertIn('content', method)
110 |             self.assertIn('start_line', method)
111 |             self.assertIn('modifiers', method)
112 |             self.assertIn('return_type', method)
113 |             self.assertIn('parameters', method)
114 | 
115 |     def test_method_content(self):
116 |         """Test method body extraction"""
117 |         methods = self.processor.extract_methods(self.test_file)
118 | 
119 |         # Locate the calculateComplexity method
120 |         complex_method = next(
121 |             m for m in methods if m['name'] == 'calculateComplexity'
122 |         )
123 | 
124 |         # Verify the body
125 |         self.assertIn('if (n > 0)', complex_method['content'])
126 |         self.assertIn('for (int i = 0', complex_method['content'])
127 |         self.assertIn('while (result > 100)', complex_method['content'])
128 | 
129 |     def test_return_type(self):
130 |         """Test return-type extraction"""
131 |         methods = self.processor.extract_methods(self.test_file)
132 | 
133 |         # Various return types
134 |         return_types = {m['name']: m['return_type'] for m in methods}
135 |         self.assertEqual(return_types['getName'], 'String')
136 |         self.assertEqual(return_types['setName'], 'void')
137 |         self.assertEqual(return_types['calculateComplexity'], 'int')
138 | 
139 |     def test_parameters(self):
140 |         """Test parameter extraction"""
141 |         methods = self.processor.extract_methods(self.test_file)
142 | 
143 |         # Constructor parameters
144 |         constructor = next(m for m in methods if m['name'] == 'TestClass')
145 |         self.assertEqual(len(constructor['parameters']), 2)
146 |         self.assertEqual(constructor['parameters'][0]['type'], 'String')
147 |         self.assertEqual(constructor['parameters'][0]['name'], 'name')
148 |         self.assertEqual(constructor['parameters'][1]['type'], 'int')
149 |         self.assertEqual(constructor['parameters'][1]['name'], 'age')
150 | 
151 |     def test_complexity_analysis(self):
152 |         """Test complexity analysis"""
153 |         methods = self.processor.extract_methods(self.test_file)
154 | 
155 |         # Analyze the complexity of calculateComplexity
156 |         complex_method = next(
157 |             m for m in methods if m['name'] == 'calculateComplexity'
158 |         )
159 |         metrics = self.processor.analyze_complexity(complex_method['content'])
160 | 
161 |         # Verify the metrics
162 |         self.assertGreater(metrics['cyclomatic_complexity'], 1)
163 |         self.assertGreater(metrics['cognitive_complexity'], 0)
164 |         self.assertGreater(metrics['nesting_depth'], 1)
165 | 
166 |     def test_class_info(self):
167 |         """Test class-info extraction"""
168 |         class_info = self.processor.extract_class_info(self.test_file)
169 | 
170 |         # Basic info
171 |         self.assertEqual(class_info['name'], 'TestClass')
172 |         self.assertEqual(class_info['package'], 'com.example.test')
173 | 
174 |         # Inheritance and implemented interfaces

    def test_class_info(self):
        """Test class info extraction"""
        class_info = self.processor.extract_class_info(self.test_file)

        # Verify the basic information
        self.assertEqual(class_info['name'], 'TestClass')
        self.assertEqual(class_info['package'], 'com.example.test')

        # Verify inheritance and implementation
        self.assertEqual(class_info['extends'], 'BaseClass')
        self.assertIn('TestInterface', class_info['implements'])

        # Verify the imports
        self.assertIn('java.util.List', class_info['imports'])
        self.assertIn('java.util.ArrayList', class_info['imports'])

    def test_method_signature(self):
        """Test method signature generation"""
        methods = self.processor.extract_methods(self.test_file)

        # Verify the signatures of the extracted methods
        for method in methods:
            signature = self.processor.get_method_signature(method)
            self.assertIsInstance(signature, str)
            self.assertGreater(len(signature), 0)

            if method['name'] == 'calculateComplexity':
                self.assertIn('public int calculateComplexity(int n)', signature)

    def test_code_normalization(self):
        """Test code normalization"""
        test_code = '''
        public void testMethod() {
            // This is a comment
            String name = "test"; /* Another comment */
            if (name.equals("test")) {
                System.out.println("Hello");
            }
        }
        '''

        normalized = self.processor.normalize_code(test_code)

        # Verify the normalization result
        self.assertNotIn('//', normalized)   # line comments removed
        self.assertNotIn('/*', normalized)   # block comments removed
        self.assertNotIn('  ', normalized)   # redundant whitespace collapsed
        self.assertEqual(normalized.count('"'), 2)  # string literals normalized
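
    # For reference, a normalizer covering the first three assertions above
    # could be a few regex passes. A minimal sketch, assuming string literals
    # are collapsed to an empty placeholder; the real normalize_code evidently
    # goes further, since the suite also pins the total quote count:
    @staticmethod
    def _normalize_sketch(code):
        import re
        code = re.sub(r'/\*.*?\*/', '', code, flags=re.DOTALL)  # block comments
        code = re.sub(r'//[^\n]*', '', code)                    # line comments
        code = re.sub(r'"(?:\\.|[^"\\])*"', '""', code)         # literals -> ""
        return re.sub(r'\s+', ' ', code).strip()                # collapse spaces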

    def test_error_handling(self):
        """Test error handling"""
        # A nonexistent file yields no methods
        methods = self.processor.extract_methods("nonexistent.java")
        self.assertEqual(len(methods), 0)

        # Invalid Java code yields no methods
        invalid_file = os.path.join(self.temp_dir, "Invalid.java")
        with open(invalid_file, 'w') as f:
            f.write("invalid java code")

        methods = self.processor.extract_methods(invalid_file)
        self.assertEqual(len(methods), 0)

    def test_large_file(self):
        """Test handling of a large file"""
        # Create a file containing many methods
        large_file = os.path.join(self.temp_dir, "LargeClass.java")
        with open(large_file, 'w') as f:
            f.write("public class LargeClass {\n")
            for i in range(100):
                f.write(f'''
    public void method{i}() {{
        System.out.println("Method {i}");
    }}
''')
            f.write("}")

        # Verify that the large file can be processed
        methods = self.processor.extract_methods(large_file)
        self.assertEqual(len(methods), 100)

if __name__ == '__main__':
    unittest.main()

--------------------------------------------------------------------------------
/tests/preprocessor/test_preprocessor.py:
--------------------------------------------------------------------------------
"""Preprocessor test module

This module contains unit tests for the preprocessor classes.

Author: byRen2002
Last modified: March 2025
License: MIT License
"""

import unittest
import os
import tempfile
import shutil
import json
from unittest.mock import patch, MagicMock

from preprocessor.preprocessor import (
    PreprocessorConfig,
    SignatureProcessor,
    MetaInfoManager,
    CodeSegmenter
)

class TestPreprocessor(unittest.TestCase):
    """Test cases for the preprocessor classes"""

    def setUp(self):
        """Set up before each test"""
        # Create a temporary test directory
        self.temp_dir = tempfile.mkdtemp()
        self.test_data_dir = os.path.join(self.temp_dir, "test_data")
        os.makedirs(self.test_data_dir)

        # Create the test repository directory structure
        self.repo_dir = os.path.join(self.test_data_dir, "repos")
        self.repo_date_dir = os.path.join(self.test_data_dir, "repo_date")
        self.repo_func_dir = os.path.join(self.test_data_dir, "repo_functions")

        for dir_path in [self.repo_dir, self.repo_date_dir,
                         self.repo_func_dir]:
            os.makedirs(dir_path)

        # Create the test configuration
        self.config = PreprocessorConfig()
        self.config.current_path = self.test_data_dir
        self.config.tag_date_path = self.repo_date_dir
        self.config.result_path = self.repo_func_dir

        # Create test data
        self._create_test_data()

    def tearDown(self):
        """Clean up after each test"""
        # Remove the temporary directory and its contents
        shutil.rmtree(self.temp_dir)

    def _create_test_data(self):
        """Create test data"""
        # Create the version date file
        repo_date_file = os.path.join(self.repo_date_dir, "test_repo")
        with open(repo_date_file, 'w') as f:
            f.write("2024-01-01 tag: v1.0\n")
            f.write("2024-02-01 tag: v1.1\n")
            f.write("2024-03-01 tag: v2.0\n")

        # Create the function signature files
        repo_func_dir = os.path.join(self.repo_func_dir, "test_repo")
        os.makedirs(repo_func_dir)

        versions = ["v1.0", "v1.1", "v2.0"]
        for version in versions:
            func_file = os.path.join(repo_func_dir, f"fuzzy_{version}.hidx")
            with open(func_file, 'w') as f:
                f.write("hash\tfunction\tfile\n")
                f.write("hash1\tfunc1\tfile1.py\n")
                f.write("hash2\tfunc2\tfile2.py\n")
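
    # The fuzzy_*.hidx fixtures above follow a simple tab-separated layout:
    # a header row, then one "hash<TAB>function<TAB>file" record per line.
    # A minimal reader sketch for that layout; illustrative only, the real
    # parser lives in the preprocessor and is not shown here:
    @staticmethod
    def _read_hidx(path):
        records = []
        with open(path) as f:
            next(f)  # skip the header row
            for line in f:
                func_hash, func_name, file_name = line.rstrip('\n').split('\t')
                records.append((func_hash, func_name, file_name))
        return records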

    def test_config_initialization(self):
        """Test configuration initialization"""
        # Verify that the working directories were created
        self.assertTrue(os.path.exists(self.config.ver_idx_path))
        self.assertTrue(os.path.exists(self.config.initial_db_path))
        self.assertTrue(os.path.exists(self.config.final_db_path))
        self.assertTrue(os.path.exists(self.config.meta_path))

    def test_signature_processing(self):
        """Test signature processing"""
        processor = SignatureProcessor(self.config)

        # Process the test repository
        processor.process_single_repo("test_repo")

        # Verify the output files
        self.assertTrue(
            os.path.exists(
                os.path.join(self.config.func_date_path, "test_repo_funcdate")
            )
        )
        self.assertTrue(
            os.path.exists(
                os.path.join(self.config.ver_idx_path, "test_repo_idx")
            )
        )
        self.assertTrue(
            os.path.exists(
                os.path.join(self.config.initial_db_path, "test_repo_sig")
            )
        )

        # Verify the version index content
        with open(os.path.join(self.config.ver_idx_path, "test_repo_idx")) as f:
            ver_idx = json.load(f)
            self.assertEqual(len(ver_idx), 3)  # there should be 3 versions

    def test_meta_info_management(self):
        """Test meta info management"""
        # Process the signatures first
        processor = SignatureProcessor(self.config)
        processor.process_single_repo("test_repo")

        # Process the meta info
        meta_manager = MetaInfoManager(self.config)
        meta_manager.save_meta_infos()

        # Verify the meta info files
        self.assertTrue(
            os.path.exists(os.path.join(self.config.meta_path, "aveFuncs"))
        )
        self.assertTrue(
            os.path.exists(os.path.join(self.config.meta_path, "allFuncs"))
        )
        self.assertTrue(
            os.path.exists(os.path.join(self.config.meta_path, "uniqueFuncs"))
        )

        # Verify the weight file
        self.assertTrue(
            os.path.exists(
                os.path.join(self.config.weight_path, "test_repo_weights")
            )
        )

    def test_code_segmentation(self):
        """Test code segmentation"""
        # Prepare the data
        processor = SignatureProcessor(self.config)
        processor.process_single_repo("test_repo")

        meta_manager = MetaInfoManager(self.config)
        meta_manager.save_meta_infos()

        # Run code segmentation
        segmenter = CodeSegmenter(self.config)
        segmenter.segment_code()

        # Verify the segmentation result
        self.assertTrue(
            os.path.exists(
                os.path.join(self.config.final_db_path, "test_repo_sig")
            )
        )

    def test_version_date_extraction(self):
        """Test version date extraction"""
        processor = SignatureProcessor(self.config)
        ver_dates = processor.extract_ver_date("test_repo")

        # Verify the version dates
        self.assertEqual(ver_dates["v1.0"], "2024-01-01")
        self.assertEqual(ver_dates["v1.1"], "2024-02-01")
        self.assertEqual(ver_dates["v2.0"], "2024-03-01")
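
    # extract_ver_date parses lines of the form "2024-01-01 tag: v1.0" from
    # the repo_date fixture written in _create_test_data. Its behavior, as
    # exercised above, amounts to the following sketch (illustrative only):
    @staticmethod
    def _parse_ver_dates(lines):
        ver_dates = {}
        for line in lines:
            date, _, tag = line.strip().partition(' tag: ')
            if tag:
                ver_dates[tag] = date
        return ver_dates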

    def test_error_handling(self):
        """Test error handling"""
        processor = SignatureProcessor(self.config)

        # Processing a nonexistent repository should not raise
        processor.process_single_repo("nonexistent_repo")

        # Verify that no output files were created for it
        self.assertFalse(
            os.path.exists(
                os.path.join(self.config.func_date_path, "nonexistent_repo_funcdate")
            )
        )

    def test_concurrent_processing(self):
        """Test concurrent processing"""
        import threading

        def worker():
            processor = SignatureProcessor(self.config)
            processor.process_single_repo("test_repo")

        # Create several threads
        threads = [threading.Thread(target=worker) for _ in range(4)]

        # Start all threads
        for thread in threads:
            thread.start()

        # Wait for all threads to finish
        for thread in threads:
            thread.join()

        # Verify that the results stayed consistent
        with open(os.path.join(self.config.ver_idx_path, "test_repo_idx")) as f:
            ver_idx = json.load(f)
            self.assertEqual(len(ver_idx), 3)

    def test_memory_efficiency(self):
        """Test memory efficiency"""
        import psutil
        process = psutil.Process()

        # Record the initial memory usage
        initial_memory = process.memory_info().rss

        # Process a large amount of data
        processor = SignatureProcessor(self.config)
        for i in range(10):
            # Create more test data
            repo_name = f"test_repo_{i}"
            repo_dir = os.path.join(self.repo_func_dir, repo_name)
            os.makedirs(repo_dir)

            for j in range(100):
                with open(os.path.join(repo_dir, f"fuzzy_v{j}.hidx"), 'w') as f:
                    f.write("hash\tfunction\tfile\n")
                    for k in range(1000):
                        f.write(f"hash{k}\tfunc{k}\tfile{k}.py\n")

            processor.process_single_repo(repo_name)

        # Record the final memory usage
        final_memory = process.memory_info().rss

        # Verify that memory growth stays within bounds
        memory_growth = (final_memory - initial_memory) / (1024 * 1024)  # MB
        self.assertLess(memory_growth, 1000)  # growth should stay under 1 GB

if __name__ == '__main__':
    unittest.main()

--------------------------------------------------------------------------------
/tests/run_tests.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
"""Test runner

Runs all unit, integration, and security tests.
Supports parallel test execution and test report generation.

Author: byRen2002
Last modified: March 2025
License: MIT
"""

import os
import sys
import unittest
import argparse
import coverage
import xmlrunner
import concurrent.futures
from typing import List, Tuple
from datetime import datetime

def discover_tests(start_dir: str) -> List[unittest.TestSuite]:
    """Discover test cases

    Args:
        start_dir: directory to start from

    Returns:
        A list of test suites
    """
    loader = unittest.TestLoader()
    suites = []

    for root, _, files in os.walk(start_dir):
        if any(f.startswith('test_') and f.endswith('.py') for f in files):
            suite = loader.discover(root, pattern='test_*.py')
            suites.append(suite)

    return suites

def run_test_suite(suite: unittest.TestSuite) -> Tuple[int, int, List[str]]:
    """Run a test suite

    Args:
        suite: the test suite to run

    Returns:
        (passed count, failed count, list of error messages)
    """
    result = unittest.TestResult()
    suite.run(result)

    errors = []
    for test, error in result.errors:
        errors.append(f"ERROR ({test}): {error}")
    for test, failure in result.failures:
        errors.append(f"FAIL ({test}): {failure}")

    passed = result.testsRun - len(result.failures) - len(result.errors)
    failed = len(result.failures) + len(result.errors)
    return passed, failed, errors
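
# main() below submits TestSuite objects to a ProcessPoolExecutor. Suites are
# not reliably picklable, so a more robust variant submits plain directory
# paths and rediscovers the tests inside each worker process. A minimal sketch
# of that alternative (the function name is illustrative):
def run_tests_in_dir(test_dir: str) -> Tuple[int, int, List[str]]:
    """Discover and run the tests under `test_dir` in this process."""
    loader = unittest.TestLoader()
    suite = loader.discover(test_dir, pattern='test_*.py')
    return run_test_suite(suite)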

def main():
    """Main entry point"""
    parser = argparse.ArgumentParser(description='Re-Centris test runner')
    parser.add_argument('--parallel', action='store_true', help='run tests in parallel')
    parser.add_argument('--coverage', action='store_true', help='generate a coverage report')
    parser.add_argument('--xml', action='store_true', help='generate an XML test report')
    parser.add_argument('--html', action='store_true', help='generate an HTML test report')
    args = parser.parse_args()

    # Set up coverage collection
    if args.coverage:
        cov = coverage.Coverage()
        cov.start()

    # Discover tests
    suites = discover_tests('tests')
    if not suites:
        print("No test cases found")
        sys.exit(1)

    total_tests = 0
    passed_tests = 0
    failed_tests = 0
    all_errors = []

    # Run tests
    if args.parallel:
        print("Running tests in parallel...")
        with concurrent.futures.ProcessPoolExecutor() as executor:
            futures = [executor.submit(run_test_suite, suite) for suite in suites]
            for future in concurrent.futures.as_completed(futures):
                passed, failed, errors = future.result()
                passed_tests += passed
                failed_tests += failed
                all_errors.extend(errors)
                total_tests += passed + failed
    else:
        print("Running tests serially...")
        for suite in suites:
            passed, failed, errors = run_test_suite(suite)
            passed_tests += passed
            failed_tests += failed
            all_errors.extend(errors)
            total_tests += passed + failed

    # Generate reports
    if args.xml:
        print("Generating XML report...")
        xml_dir = 'test-reports/xml'
        os.makedirs(xml_dir, exist_ok=True)
        for suite in suites:
            xmlrunner.XMLTestRunner(output=xml_dir).run(suite)

    if args.html:
        print("Generating HTML report...")
        html_dir = 'test-reports/html'
        os.makedirs(html_dir, exist_ok=True)
        with open(os.path.join(html_dir, 'index.html'), 'w') as f:
            f.write(f"""<html>
<head><title>Test Report</title></head>
<body>
<h1>Test Report ({datetime.now().strftime('%Y-%m-%d %H:%M:%S')})</h1>
<p>Total tests: {total_tests}</p>
<p>Passed: {passed_tests}</p>
<p>Failed: {failed_tests}</p>
<h2>Error details:</h2>
{'<br>'.join(all_errors)}
</body>
</html>
""")

    if args.coverage:
        print("Generating coverage report...")
        cov.stop()
        cov.save()

        # Write the reports
        cov_dir = 'test-reports/coverage'
        os.makedirs(cov_dir, exist_ok=True)

        # HTML report
        cov.html_report(directory=os.path.join(cov_dir, 'html'))

        # XML report
        cov.xml_report(outfile=os.path.join(cov_dir, 'coverage.xml'))

    # Print a summary
    print("\nTest result summary:")
    print(f"Total tests: {total_tests}")
    print(f"Passed: {passed_tests}")
    print(f"Failed: {failed_tests}")

    if all_errors:
        print("\nError details:")
        for error in all_errors:
            print(error)

    # Return the exit code
    return 1 if failed_tests > 0 else 0
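
# The HTML report above interpolates raw error strings; tracebacks often
# contain '<' and '>', which can break the markup. A stdlib-only sketch of a
# safer rendering step (illustrative; not currently wired into main()):
import html


def render_error_html(errors: List[str]) -> str:
    """Escape error text and join it with <br> for the HTML report."""
    return '<br>'.join(html.escape(e) for e in errors)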

if __name__ == '__main__':
    sys.exit(main())

--------------------------------------------------------------------------------
/tests/security/test_security.py:
--------------------------------------------------------------------------------
import unittest
import os
import time
import tempfile
import shutil
import json
import subprocess
from unittest.mock import patch
from preprocessor.preprocessor import Preprocessor
from detector.Detector import Detector

class TestSecurity(unittest.TestCase):
    """Security tests"""

    @classmethod
    def setUpClass(cls):
        """Class-level setup"""
        cls.work_dir = tempfile.mkdtemp()
        cls._create_test_environment()

    @classmethod
    def tearDownClass(cls):
        """Class-level teardown"""
        shutil.rmtree(cls.work_dir)

    @classmethod
    def _create_test_environment(cls):
        """Create the test environment"""
        # Create the directory structure
        dirs = [
            "input",
            "preprocessor/result",
            "preprocessor/initialSigs",
            "preprocessor/componentDB",
            "preprocessor/metaInfos",
            "detector/result"
        ]

        for dir_path in dirs:
            os.makedirs(os.path.join(cls.work_dir, dir_path))

    def setUp(self):
        """Per-test setup"""
        self.preprocessor = Preprocessor()
        self.detector = Detector()

        self.preprocessor.config.set_base_path(self.work_dir)
        self.detector.base_path = self.work_dir

    def test_path_traversal(self):
        """Test protection against path traversal attacks"""
        # Relative path traversal attempts
        malicious_paths = [
            "../../../etc/passwd",
            "..\\..\\..\\Windows\\System32\\config\\SAM",
            "%2e%2e%2f%2e%2e%2f%2e%2e%2f",  # URL-encoded ../../../
            "input/project/../../etc/passwd"
        ]

        for path in malicious_paths:
            full_path = os.path.join(self.work_dir, path)
            result = self.detector.process_file(full_path, self.work_dir)
            self.assertEqual(result, ({}, 0, 0, 0))
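
    # A typical defense that makes the traversal cases above return empty
    # results is to resolve the candidate path and require that it stay
    # inside the sandbox root. A minimal sketch of such a check; illustrative,
    # since the detector's actual guard is not shown here:
    @staticmethod
    def _is_within(base_dir, candidate):
        base = os.path.realpath(base_dir)
        target = os.path.realpath(candidate)
        return os.path.commonpath([base, target]) == base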

    def test_file_content_injection(self):
        """Test protection against file content injection"""
        # Create a file with malicious content
        malicious_file = os.path.join(self.work_dir, "input/malicious.cpp")
        with open(malicious_file, 'w') as f:
            f.write("""
#include <cstdlib>

int main() {
    system("rm -rf /"); // dangerous system call
    return 0;
}

__attribute__((constructor))
void init() {
    system("echo 'Malicious code executed'");
}
""")

        # Ensure that processing never executes the code
        with patch('subprocess.run') as mock_run:
            self.detector.process_file(malicious_file, self.work_dir)
            mock_run.assert_not_called()

    def test_memory_limits(self):
        """Test memory limits"""
        # Create a large file
        large_file = os.path.join(self.work_dir, "input/large.cpp")
        with open(large_file, 'w') as f:
            f.write("a" * (100 * 1024 * 1024))  # 100 MB

        try:
            self.detector.process_file(large_file, self.work_dir)
        except MemoryError:
            self.fail("Memory limit handling failed")

    def test_cpu_limits(self):
        """Test CPU limits"""
        # Create a CPU-intensive file
        cpu_intensive_file = os.path.join(self.work_dir, "input/cpu_intensive.cpp")
        with open(cpu_intensive_file, 'w') as f:
            f.write("int main() { while(1); return 0; }")

        start_time = time.time()
        self.detector.process_file(cpu_intensive_file, self.work_dir)
        duration = time.time() - start_time

        self.assertLess(duration, 10)  # should time out within 10 seconds

    def test_file_type_validation(self):
        """Test file type validation"""
        # Create a disguised executable
        fake_cpp = os.path.join(self.work_dir, "input/fake.cpp")
        with open(fake_cpp, 'wb') as f:
            f.write(b"MZ\x90\x00\x03")  # PE file header

        result = self.detector.process_file(fake_cpp, self.work_dir)
        self.assertEqual(result, ({}, 0, 0, 0))

    def test_input_sanitization(self):
        """Test input sanitization"""
        # SQL injection attempt
        malicious_input = "'; DROP TABLE users; --"
        safe_path = os.path.join(self.work_dir, malicious_input)
        result = self.detector.process_file(safe_path, self.work_dir)
        self.assertEqual(result, ({}, 0, 0, 0))

        # Command injection attempt
        malicious_input = "; rm -rf /"
        safe_path = os.path.join(self.work_dir, malicious_input)
        result = self.detector.process_file(safe_path, self.work_dir)
        self.assertEqual(result, ({}, 0, 0, 0))

    def test_file_permissions(self):
        """Test file permissions"""
        # Create a read-only file
        readonly_file = os.path.join(self.work_dir, "input/readonly.cpp")
        with open(readonly_file, 'w') as f:
            f.write("int main() { return 0; }")

        # Set read-only permissions
        os.chmod(readonly_file, 0o444)

        try:
            self.detector.process_file(readonly_file, self.work_dir)
        except PermissionError:
            self.fail("File permission handling failed")

    def test_concurrent_access(self):
        """Test concurrent access safety"""
        import threading

        # Create a test file
        test_file = os.path.join(self.work_dir, "input/concurrent.cpp")
        with open(test_file, 'w') as f:
            f.write("int main() { return 0; }")

        # Access it concurrently
        def process_file():
            self.detector.process_file(test_file, self.work_dir)

        threads = []
        for _ in range(10):
            thread = threading.Thread(target=process_file)
            threads.append(thread)
            thread.start()

        for thread in threads:
            thread.join()

    def test_resource_cleanup(self):
        """Test resource cleanup"""
        import psutil

        # Record the initial number of file descriptors
        process = psutil.Process()
        initial_fds = process.num_fds()

        # Perform the operation repeatedly
        for _ in range(10):
            test_file = os.path.join(self.work_dir, "input/test.cpp")
            with open(test_file, 'w') as f:
                f.write("int main() { return 0; }")

            self.detector.process_file(test_file, self.work_dir)

        # Verify that no file descriptors leaked
        final_fds = process.num_fds()
        self.assertLessEqual(final_fds - initial_fds, 5)

    def test_data_validation(self):
        """Test data validation"""
        # Invalid TLSH digests
        invalid_hashes = [
            "not_a_hash",
            "T1" + "0" * 69,  # too short
            "T1" + "0" * 71,  # too long
            "T1" + "XYZ" + "0" * 67  # invalid characters
        ]

        for hash_val in invalid_hashes:
            result = self.detector._compute_tlsh(hash_val)
            self.assertIsNone(result)
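
    # The invalid digests above pin down the expected TLSH shape: a "T1"
    # version prefix followed by 70 hex characters. A format pre-check along
    # those lines; a sketch only, as the detector's real validation may do
    # more than a shape test:
    @staticmethod
    def _looks_like_tlsh(digest):
        import re
        return bool(re.fullmatch(r'T1[0-9A-Fa-f]{70}', digest))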
("invalid_comp", {}, "test_repo", {}) 220 | ) 221 | self.assertIsNone(result) 222 | 223 | # 测试无效配置 224 | with self.assertRaises(Exception): 225 | detector = Detector("invalid_config.yaml") 226 | 227 | if __name__ == '__main__': 228 | unittest.main() --------------------------------------------------------------------------------