├── src ├── __init__.py └── gitprobe │ ├── __init__.py │ ├── core │ ├── __init__.py │ └── analysis_limits.py │ ├── models │ ├── __init__.py │ ├── analysis.py │ └── core.py │ ├── utils │ ├── __init__.py │ ├── logging_config.py │ ├── security.py │ └── patterns.py │ ├── web │ ├── __init__.py │ └── server.py │ ├── analysis │ ├── __init__.py │ ├── repo_analyzer.py │ ├── cloning.py │ ├── analysis_service.py │ └── call_graph_analyzer.py │ ├── analyzers │ ├── __init__.py │ ├── python.py │ ├── go.py │ ├── c_cpp.py │ └── javascript.py │ ├── __main__.py │ └── cli.py ├── docs └── preview.png ├── tests ├── __init__.py ├── README.md └── test_integration.py ├── requirements.txt ├── gitprobe ├── LICENSE.md ├── DEVELOPMENT.md ├── pyproject.toml ├── .gitignore └── README.md /src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/gitprobe/__init__.py: -------------------------------------------------------------------------------- 1 | # Empty 2 | -------------------------------------------------------------------------------- /src/gitprobe/core/__init__.py: -------------------------------------------------------------------------------- 1 | # Empty 2 | -------------------------------------------------------------------------------- /src/gitprobe/models/__init__.py: -------------------------------------------------------------------------------- 1 | # Empty 2 | -------------------------------------------------------------------------------- /src/gitprobe/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Empty 2 | -------------------------------------------------------------------------------- /src/gitprobe/web/__init__.py: -------------------------------------------------------------------------------- 1 | # Empty 2 | -------------------------------------------------------------------------------- 
def setup_logging():
    """Configure root logging for GitProbe.

    Applies an INFO-level root configuration with a timestamped
    ``name - level - message`` format, writing to stdout.
    """
    config = {
        "level": logging.INFO,
        "format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s",
        "stream": sys.stdout,
    }
    logging.basicConfig(**config)
6 | """ 7 | 8 | import sys 9 | import os 10 | from pathlib import Path 11 | 12 | # Add src directory to Python path 13 | current_dir = Path(__file__).parent 14 | src_dir = current_dir / "src" 15 | if str(src_dir) not in sys.path: 16 | sys.path.insert(0, str(src_dir)) 17 | 18 | if __name__ == "__main__": 19 | from gitprobe.cli import main # type: ignore 20 | main() -------------------------------------------------------------------------------- /src/gitprobe/__main__.py: -------------------------------------------------------------------------------- 1 | """ 2 | GitProbe Package Main Entry Point 3 | 4 | Allows running GitProbe as a module: python -m gitprobe 5 | """ 6 | 7 | import sys 8 | from pathlib import Path 9 | 10 | 11 | def main(): 12 | """Main entry point for running GitProbe server.""" 13 | try: 14 | import uvicorn 15 | 16 | print("🚀 Starting GitProbe Server via package...") 17 | uvicorn.run("gitprobe.web.server:app", host="0.0.0.0", port=8000, reload=True) 18 | except ImportError: 19 | print("❌ uvicorn not installed. 
from pydantic import BaseModel
from typing import List, Dict, Any, Optional
from .core import Function, CallRelationship, Repository


class AnalysisResult(BaseModel):
    """Result of analyzing a repository.

    Aggregates everything a full analysis run produces: repository
    metadata, extracted functions, their call relationships, the file
    tree, and summary/visualization payloads.
    """

    # Repository metadata (url, name, clone_path, analysis_id) — see models.core.
    repository: Repository
    # All functions discovered across the analyzed files.
    functions: List[Function]
    # Caller -> callee edges between the functions above.
    relationships: List[CallRelationship]
    # Nested directory/file structure (cf. RepoAnalyzer.analyze_repository_structure).
    file_tree: Dict[str, Any]
    # Aggregate stats (e.g. total_files, total_size_kb).
    summary: Dict[str, Any]
    # Optional rendering payload; empty dict when not generated.
    visualization: Dict[str, Any] = {}
    # Raw README text when one was found, else None.
    readme_content: Optional[str] = None


class NodeSelection(BaseModel):
    """Selected nodes for partial export"""

    # Node identifiers chosen by the user; empty means nothing selected.
    selected_nodes: List[str] = []
    # Whether edges between the selected nodes are exported too.
    include_relationships: bool = True
    # Optional mapping of node id -> user-provided display name.
    custom_names: Dict[str, str] = {}
from pydantic import BaseModel
from typing import List, Optional, Dict, Any
from datetime import datetime  # NOTE(review): unused in this module — confirm before removing


class Function(BaseModel):
    """A function found in the codebase"""

    # Function name as written in source.
    name: str
    # Source file containing the function — presumably relative to the repo
    # root; confirm against the analyzers that populate it.
    file_path: str
    # Start line of the definition; end line may be unknown for some parsers.
    line_start: int
    line_end: Optional[int] = None
    # Parameter names, when the analyzer extracted them.
    parameters: Optional[List[str]] = None
    docstring: Optional[str] = None
    # True when defined inside a class; class_name then carries the owner.
    is_method: bool = False
    class_name: Optional[str] = None
    # Raw source text of the function, when captured.
    code_snippet: Optional[str] = None
    # Optional user-facing override consumed by get_display_name().
    display_name: Optional[str] = None

    def get_display_name(self) -> str:
        """Get the name to display (custom or original)"""
        return self.display_name or self.name


class CallRelationship(BaseModel):
    """A call relationship between two functions"""

    # Identifier of the calling function.
    caller: str
    # Identifier (or raw name) of the called function.
    callee: str
    # Line where the call occurs, when known.
    call_line: Optional[int] = None
    # Presumably set True once the callee is matched to a known Function —
    # confirm against the call-graph analyzer.
    is_resolved: bool = False


class Repository(BaseModel):
    """Basic repository information"""

    url: str
    name: str
    # Local filesystem path the repository was cloned to.
    clone_path: str
    # Unique identifier for this analysis run.
    analysis_id: str
def assert_safe_path(base_dir: Path, target: Path):
    """Reject *target* if it is a symlink or escapes *base_dir*.

    Raises:
        PermissionError: when target is a symlink, or resolves to a
            location outside base_dir.
    """
    # Block symlinks (file or dir)
    if target.is_symlink():
        raise PermissionError(f"Symlink blocked: {target}")
    # Block paths that escape repo
    if not _inside(base_dir, target):
        raise PermissionError(f"Path escapes repo: {target} -> {target.resolve()}")


def safe_open_text(base_dir: Path, target: Path, encoding="utf-8"):
    """Read *target* as text after validating it stays inside *base_dir*.

    Opens with O_NOFOLLOW where available so a symlink swapped in after
    the check cannot be followed (TOCTOU hardening). Undecodable bytes
    are replaced rather than raising.
    """
    assert_safe_path(base_dir, target)
    flags = os.O_RDONLY
    if hasattr(os, "O_NOFOLLOW"):
        flags |= os.O_NOFOLLOW
    fd = os.open(str(target), flags)
    try:
        f = os.fdopen(fd, "r", encoding=encoding, errors="replace")
    except Exception:
        # fdopen never took ownership of fd; close it ourselves.
        os.close(fd)
        raise
    # Once fdopen succeeds, the file object owns fd. The original code also
    # ran os.close(fd) in a finally block — a double close that, under
    # threads, could close an unrelated descriptor reused for the same
    # number. Closing only via the file object fixes that.
    with f:
        return f.read()
Run GitProbe 20 | 21 | Once installed, you can use GitProbe from anywhere: 22 | 23 | ```bash 24 | # Analyze a repository 25 | gitprobe analyze microsoft/vscode 26 | 27 | # Start the server 28 | gitprobe server 29 | 30 | # Start server with custom settings 31 | gitprobe server --port 8080 --reload 32 | ``` 33 | 34 | ## Alternative Development Setup 35 | 36 | If you prefer not to install the package: 37 | 38 | ```bash 39 | # Set Python path and run directly 40 | PYTHONPATH=src python -m gitprobe.cli analyze user/repo 41 | PYTHONPATH=src python -m gitprobe.web.server 42 | ``` 43 | 44 | ## Project Structure 45 | 46 | ``` 47 | gitprobe/ 48 | ├── src/gitprobe/ # Main package 49 | │ ├── analyzers/ # Language-specific analyzers 50 | │ ├── analysis/ # Business logic & orchestration 51 | │ ├── core/ # Shared utilities 52 | │ ├── models/ # Data models 53 | │ ├── utils/ # Helper functions 54 | │ ├── web/ # FastAPI server 55 | │ └── cli.py # Command-line interface 56 | ├── pyproject.toml # Package configuration 57 | ├── requirements.txt # Dependencies 58 | └── README.md # User documentation 59 | ``` 60 | 61 | ## Development Commands 62 | 63 | ```bash 64 | # Run tests 65 | pytest 66 | 67 | # Format code 68 | black src/ 69 | isort src/ 70 | 71 | # Type checking 72 | mypy src/ 73 | 74 | # Install pre-commit hooks 75 | pre-commit install 76 | ``` 77 | 78 | ## Adding New Languages 79 | 80 | 1. Create analyzer in `src/gitprobe/analyzers/` 81 | 2. Add language limits in `src/gitprobe/core/analysis_limits.py` 82 | 3. Update `src/gitprobe/analysis/call_graph_analyzer.py` 83 | 4. 
Add tests and documentation -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0", "wheel"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "gitprobe" 7 | version = "0.1.0" 8 | description = "Advanced repository analysis tool with multi-language call graph generation" 9 | readme = "README.md" 10 | requires-python = ">=3.8" 11 | license = {text = "MIT"} 12 | authors = [ 13 | {name = "GitProbe Contributors"} 14 | ] 15 | keywords = ["code-analysis", "call-graph", "ast", "repository-analysis"] 16 | classifiers = [ 17 | "Development Status :: 4 - Beta", 18 | "Intended Audience :: Developers", 19 | "License :: OSI Approved :: MIT License", 20 | "Programming Language :: Python :: 3", 21 | "Programming Language :: Python :: 3.8", 22 | "Programming Language :: Python :: 3.9", 23 | "Programming Language :: Python :: 3.10", 24 | "Programming Language :: Python :: 3.11", 25 | "Programming Language :: Python :: 3.12", 26 | "Topic :: Software Development :: Code Generators", 27 | "Topic :: Software Development :: Libraries :: Python Modules", 28 | ] 29 | 30 | dependencies = [ 31 | "fastapi>=0.104.0", 32 | "uvicorn[standard]>=0.24.0", 33 | "pydantic>=2.0.0", 34 | "tree-sitter>=0.20.0,<0.21.0", 35 | "tree-sitter-languages>=1.10.0", 36 | "GitPython>=3.1.0", 37 | ] 38 | 39 | [project.optional-dependencies] 40 | dev = [ 41 | "pytest>=7.0.0", 42 | "pytest-asyncio>=0.21.0", 43 | "black>=23.0.0", 44 | "isort>=5.12.0", 45 | "mypy>=1.5.0", 46 | "pre-commit>=3.4.0", 47 | ] 48 | 49 | [project.scripts] 50 | gitprobe = "gitprobe.cli:main" 51 | gitprobe-server = "gitprobe.web.server:cli_main" 52 | 53 | [project.urls] 54 | Homepage = "https://github.com/yourusername/gitprobe" 55 | Documentation = "https://github.com/yourusername/gitprobe#readme" 56 | Repository = 
"""
GitProbe FastAPI Server

Main web server providing REST API endpoints for repository analysis.
Coordinates between different GitProbe services to provide comprehensive code analysis.
"""

from fastapi import FastAPI, HTTPException
from pydantic import BaseModel, field_validator
from typing import Optional, List

from gitprobe.analysis.analysis_service import AnalysisService
from gitprobe.analysis.cloning import sanitize_github_url

# Application instance served by uvicorn (see cli_main below and gitprobe.cli).
app = FastAPI(
    title="GitProbe API",
    description="Repository analysis API using GitProbe services",
    version="1.0.0",
)


class AnalyzeRequest(BaseModel):
    """Request body shared by the /analyze* endpoints."""

    # Repository to analyze; normalized/validated by the validator below.
    github_url: str
    # Optional filename patterns restricting which files are analyzed.
    include_patterns: Optional[List[str]] = None
    exclude_patterns: Optional[List[str]] = None

    @field_validator("github_url")
    @classmethod
    def sanitize_url(cls, v):
        """Sanitize the URL and reject non-GitHub values."""
        if not v:
            raise ValueError("GitHub URL is required")

        sanitized = sanitize_github_url(v)

        # NOTE(review): substring check — a URL such as
        # "https://evil.example/github.com" would pass; consider validating
        # the parsed hostname instead.
        if "github.com" not in sanitized:
            raise ValueError("Must be a valid GitHub URL")

        return sanitized


class AnalysisResponse(BaseModel):
    """Uniform response envelope: status string plus endpoint payload."""

    status: str
    data: dict
def cli_main():
    """CLI entry point for the ``gitprobe-server`` console script.

    Starts uvicorn on 0.0.0.0:8000 with auto-reload enabled, mirroring
    ``gitprobe.cli.start_server`` and ``gitprobe.__main__``.
    """
    import uvicorn

    # uvicorn requires the application as an import string (not an app
    # object) for reload/workers to take effect; the original passed ``app``
    # directly, which silently disables reloading. The other entry points
    # in this project already use the import-string form.
    uvicorn.run("gitprobe.web.server:app", host="0.0.0.0", port=8000, reload=True)


if __name__ == "__main__":
    cli_main()
--output results.json 65 | gitprobe analyze https://github.com/user/repo --structure-only 66 | gitprobe server --port 8080 67 | """, 68 | ) 69 | 70 | subparsers = parser.add_subparsers(dest="command", help="Available commands") 71 | 72 | # Analyze command 73 | analyze_parser = subparsers.add_parser("analyze", help="Analyze a repository") 74 | analyze_parser.add_argument("url", help="GitHub repository URL or owner/repo") 75 | analyze_parser.add_argument("--output", "-o", help="Output file path") 76 | analyze_parser.add_argument( 77 | "--format", choices=["json", "text"], default="json", help="Output format" 78 | ) 79 | analyze_parser.add_argument("--include", nargs="*", help="File patterns to include") 80 | analyze_parser.add_argument("--exclude", nargs="*", help="File patterns to exclude") 81 | analyze_parser.add_argument( 82 | "--structure-only", action="store_true", help="Analyze structure only (faster)" 83 | ) 84 | 85 | # Server command 86 | server_parser = subparsers.add_parser("server", help="Start the GitProbe server") 87 | server_parser.add_argument("--host", default="0.0.0.0", help="Host to bind to") 88 | server_parser.add_argument("--port", type=int, default=8000, help="Port to bind to") 89 | server_parser.add_argument("--reload", action="store_true", help="Enable auto-reload") 90 | 91 | args = parser.parse_args() 92 | 93 | if not args.command: 94 | parser.print_help() 95 | return 96 | 97 | if args.command == "analyze": 98 | analyze_repo( 99 | url=args.url, 100 | output=args.output, 101 | format=args.format, 102 | include=args.include, 103 | exclude=args.exclude, 104 | structure_only=args.structure_only, 105 | ) 106 | elif args.command == "server": 107 | start_server(host=args.host, port=args.port, reload=args.reload) 108 | 109 | 110 | def start_server(host: str = "0.0.0.0", port: int = 8000, reload: bool = False): 111 | """Start the GitProbe server.""" 112 | try: 113 | import uvicorn 114 | 115 | print(f"🚀 Starting GitProbe server on {host}:{port}") 116 | 
uvicorn.run("gitprobe.web.server:app", host=host, port=port, reload=reload) 117 | except ImportError: 118 | print("❌ uvicorn not installed. Please install with: pip install uvicorn") 119 | sys.exit(1) 120 | 121 | 122 | if __name__ == "__main__": 123 | main() 124 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Python 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | *.so 6 | .Python 7 | build/ 8 | develop-eggs/ 9 | dist/ 10 | downloads/ 11 | eggs/ 12 | .eggs/ 13 | lib/ 14 | lib64/ 15 | parts/ 16 | sdist/ 17 | var/ 18 | wheels/ 19 | *.egg-info/ 20 | .installed.cfg 21 | *.egg 22 | MANIFEST 23 | 24 | # Virtual environments 25 | env/ 26 | venv/ 27 | ENV/ 28 | env.bak/ 29 | venv.bak/ 30 | .venv/ 31 | 32 | # IDE 33 | .vscode/ 34 | .idea/ 35 | *.swp 36 | *.swo 37 | *~ 38 | 39 | # OS 40 | .DS_Store 41 | .DS_Store? 42 | ._* 43 | .Spotlight-V100 44 | .Trashes 45 | ehthumbs.db 46 | Thumbs.db 47 | 48 | # Project specific 49 | *.html 50 | *.svg 51 | *.json 52 | !requirements.txt 53 | !package.json 54 | 55 | # Temporary files 56 | *.tmp 57 | *.temp 58 | temp/ 59 | tmp/ 60 | 61 | # Logs 62 | *.log 63 | logs/ 64 | 65 | # Testing 66 | .coverage 67 | .pytest_cache/ 68 | .tox/ 69 | .nox/ 70 | htmlcov/ 71 | 72 | # Documentation 73 | docs/_build/ 74 | 75 | # Distribution / packaging 76 | .Python 77 | build/ 78 | develop-eggs/ 79 | dist/ 80 | downloads/ 81 | eggs/ 82 | .eggs/ 83 | lib/ 84 | lib64/ 85 | parts/ 86 | sdist/ 87 | var/ 88 | wheels/ 89 | share/python-wheels/ 90 | *.egg-info/ 91 | .installed.cfg 92 | *.egg 93 | MANIFEST 94 | 95 | tmp/* 96 | 97 | # PyInstaller 98 | # Usually these files are written by a python script from a template 99 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
100 | *.manifest 101 | *.spec 102 | 103 | # Installer logs 104 | pip-log.txt 105 | pip-delete-this-directory.txt 106 | 107 | # Unit test / coverage reports 108 | htmlcov/ 109 | .tox/ 110 | .nox/ 111 | .coverage 112 | .coverage.* 113 | .cache 114 | nosetests.xml 115 | coverage.xml 116 | *.cover 117 | *.py,cover 118 | .hypothesis/ 119 | .pytest_cache/ 120 | cover/ 121 | 122 | # Translations 123 | *.mo 124 | *.pot 125 | 126 | # Django stuff: 127 | *.log 128 | local_settings.py 129 | db.sqlite3 130 | db.sqlite3-journal 131 | 132 | # Flask stuff: 133 | instance/ 134 | .webassets-cache 135 | 136 | # Scrapy stuff: 137 | .scrapy 138 | 139 | # PyBuilder 140 | .pybuilder/ 141 | target/ 142 | 143 | # Jupyter Notebook 144 | .ipynb_checkpoints 145 | 146 | # IPython 147 | profile_default/ 148 | ipython_config.py 149 | 150 | # pyenv 151 | # For a library or package, you might want to ignore these files since the code is 152 | # intended to run in multiple environments; otherwise, check them in: 153 | # .python-version 154 | 155 | # pipenv 156 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 157 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 158 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 159 | # install all needed dependencies. 160 | #Pipfile.lock 161 | 162 | # poetry 163 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 164 | # This is especially recommended for binary packages to ensure reproducibility, and is more 165 | # commonly ignored for libraries. 166 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 167 | #poetry.lock 168 | 169 | # pdm 170 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
171 | #pdm.lock 172 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 173 | # in version control. 174 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 175 | .pdm.toml 176 | .pdm-python 177 | .pdm-build/ 178 | 179 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 180 | __pypackages__/ 181 | 182 | # Celery stuff 183 | celerybeat-schedule 184 | celerybeat.pid 185 | 186 | # SageMath parsed files 187 | *.sage.py 188 | 189 | # Environments 190 | .env 191 | .venv 192 | env/ 193 | venv/ 194 | ENV/ 195 | env.bak/ 196 | venv.bak/ 197 | .python-version 198 | 199 | # Spyder project settings 200 | .spyderproject 201 | .spyproject 202 | 203 | # Rope project settings 204 | .ropeproject 205 | 206 | # mkdocs documentation 207 | /site 208 | 209 | # mypy 210 | .mypy_cache/ 211 | .dmypy.json 212 | dmypy.json 213 | 214 | # Pyre type checker 215 | .pyre/ 216 | 217 | # pytype static type analyzer 218 | .pytype/ 219 | 220 | # Cython debug symbols 221 | cython_debug/ 222 | 223 | # PyCharm 224 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 225 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 226 | # and can be added to the global gitignore or merged into this file. For a more nuclear 227 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
228 | #.idea/ 229 | .vscode/settings.json 230 | .DS_Store 231 | 232 | # Project specific 233 | history.txt 234 | cleanup.py 235 | Caddyfile 236 | 237 | # ignore default output directory 238 | tmp/* 239 | 240 | # Gitingest 241 | digest.txt 242 | 243 | .cursor/ -------------------------------------------------------------------------------- /src/gitprobe/analysis/repo_analyzer.py: -------------------------------------------------------------------------------- 1 | """ 2 | Repository Analyzer Module 3 | 4 | This module provides functionality to analyze repository structures and generate 5 | detailed file tree representations with filtering capabilities. 6 | """ 7 | 8 | import os 9 | import fnmatch 10 | import json 11 | from pathlib import Path 12 | from typing import Dict, List, Optional, Union 13 | from gitprobe.utils.patterns import DEFAULT_IGNORE_PATTERNS, DEFAULT_INCLUDE_PATTERNS 14 | 15 | 16 | class RepoAnalyzer: 17 | def __init__( 18 | self, 19 | include_patterns: Optional[List[str]] = None, 20 | exclude_patterns: Optional[List[str]] = None, 21 | ) -> None: 22 | self.include_patterns = ( 23 | include_patterns if include_patterns is not None else DEFAULT_INCLUDE_PATTERNS 24 | ) 25 | self.exclude_patterns = ( 26 | list(DEFAULT_IGNORE_PATTERNS) + exclude_patterns 27 | if exclude_patterns is not None 28 | else list(DEFAULT_IGNORE_PATTERNS) 29 | ) 30 | 31 | def analyze_repository_structure(self, repo_dir: str) -> Dict: 32 | file_tree = self._build_file_tree(repo_dir) 33 | return { 34 | "file_tree": file_tree, 35 | "summary": { 36 | "total_files": self._count_files(file_tree), 37 | "total_size_kb": self._calculate_size(file_tree), 38 | }, 39 | } 40 | 41 | def _build_file_tree(self, repo_dir: str) -> Dict: 42 | def build_tree(path: Path, base_path: Path) -> Optional[Dict]: 43 | relative_path = path.relative_to(base_path) 44 | relative_path_str = str(relative_path) 45 | 46 | # 🚫 Reject symlinks 47 | if path.is_symlink(): 48 | return None 49 | 50 | # 🚫 Reject escaped 
paths (e.g., symlinks pointing outside) 51 | try: 52 | if not path.resolve().is_relative_to(base_path.resolve()): 53 | return None 54 | except AttributeError: 55 | if not str(path.resolve()).startswith(str(base_path.resolve())): 56 | return None 57 | 58 | if self._should_exclude_path(relative_path_str, path.name): 59 | return None 60 | 61 | if path.is_file(): 62 | if not self._should_include_file(relative_path_str, path.name): 63 | return None 64 | 65 | size = path.stat().st_size 66 | return { 67 | "type": "file", 68 | "name": path.name, 69 | "path": relative_path_str, 70 | "extension": path.suffix, 71 | "_size_bytes": size, 72 | } 73 | 74 | elif path.is_dir(): 75 | children = [] 76 | try: 77 | for child in sorted(path.iterdir()): 78 | child_tree = build_tree(child, base_path) 79 | if child_tree is not None: 80 | children.append(child_tree) 81 | except PermissionError: 82 | pass 83 | 84 | if children or str(relative_path) == ".": 85 | return { 86 | "type": "directory", 87 | "name": path.name, 88 | "path": relative_path_str, 89 | "children": children, 90 | } 91 | return None 92 | 93 | # Other types (sockets, devices, etc.) 
94 | return None 95 | 96 | return build_tree(Path(repo_dir), Path(repo_dir)) 97 | 98 | def _should_exclude_path(self, path: str, filename: str) -> bool: 99 | for pattern in self.exclude_patterns: 100 | if fnmatch.fnmatch(path, pattern) or fnmatch.fnmatch(filename, pattern): 101 | return True 102 | if pattern.endswith("/") and path.startswith(pattern.rstrip("/")): 103 | return True 104 | if path.startswith(pattern + "/") or path == pattern: 105 | return True 106 | if pattern in path.split("/"): 107 | return True 108 | return False 109 | 110 | def _should_include_file(self, path: str, filename: str) -> bool: 111 | if not self.include_patterns: 112 | return True 113 | for pattern in self.include_patterns: 114 | if fnmatch.fnmatch(path, pattern) or fnmatch.fnmatch(filename, pattern): 115 | return True 116 | return False 117 | 118 | def _count_files(self, tree: Dict) -> int: 119 | if tree["type"] == "file": 120 | return 1 121 | return sum(self._count_files(child) for child in tree.get("children", [])) 122 | 123 | def _calculate_size(self, tree: Dict) -> float: 124 | if tree["type"] == "file": 125 | return tree.get("_size_bytes", 0) / 1024 126 | return sum(self._calculate_size(child) for child in tree.get("children", [])) 127 | -------------------------------------------------------------------------------- /tests/README.md: -------------------------------------------------------------------------------- 1 | # GitProbe Integration Tests 2 | 3 | Comprehensive integration test suite for GitProbe's tree-sitter language analyzers. Tests real-world repositories to ensure all language parsers are working correctly. 
4 | 5 | ## 🚀 Quick Start 6 | 7 | ```bash 8 | # Install dependencies 9 | pip install requests rich 10 | 11 | # Start GitProbe server (in another terminal) 12 | ./gitprobe server 13 | 14 | # Run all tests 15 | python tests/test_integration.py 16 | 17 | # Run quick subset (1 repo per language) 18 | python tests/test_integration.py --quick 19 | 20 | # Test specific language 21 | python tests/test_integration.py --language python 22 | 23 | # Verbose output with detailed progress 24 | python tests/test_integration.py --verbose 25 | ``` 26 | 27 | ## 📋 Test Coverage 28 | 29 | The integration tests cover **7 languages** with carefully curated real-world repositories: 30 | 31 | ### Supported Languages 32 | - **Python** - 4 repositories (rich, requests, flask, cpython) 33 | - **JavaScript** - 4 repositories (lodash, axios, express, node) 34 | - **TypeScript** - 3 repositories (vscode, TypeScript, angular) 35 | - **Rust** - 3 repositories (clap, ripgrep, rust) 36 | - **Go** - 3 repositories (cobra, hugo, kubernetes) 37 | - **C** - 3 repositories (cJSON, libuv, curl) 38 | - **C++** - 3 repositories (fmt, Catch2, protobuf) 39 | 40 | ### Test Repository Selection Criteria 41 | - **Real-world usage**: Popular, actively maintained projects 42 | - **Diverse complexity**: From small libraries to large frameworks 43 | - **Language features**: Covers different language patterns and idioms 44 | - **Performance testing**: Includes large repositories to test scaling 45 | 46 | ## 🛠️ Usage Examples 47 | 48 | ### Basic Testing 49 | 50 | ```bash 51 | # Test all languages with all repositories (~25 repositories) 52 | python tests/test_integration.py 53 | 54 | # Quick test with 1 repository per language (6 repositories) 55 | python tests/test_integration.py --quick 56 | ``` 57 | 58 | ### Language-Specific Testing 59 | 60 | ```bash 61 | # Test only Python repositories 62 | python tests/test_integration.py --language python 63 | 64 | # Test multiple specific languages 65 | python 
tests/test_integration.py --language python --language rust 66 | 67 | # Test C/C++ analyzers 68 | python tests/test_integration.py --language c --language c++ 69 | ``` 70 | 71 | ### Advanced Options 72 | 73 | ```bash 74 | # Verbose output showing each test result 75 | python tests/test_integration.py --verbose 76 | 77 | # Custom server URL 78 | python tests/test_integration.py --server http://localhost:9000 79 | 80 | # Longer timeout for large repositories 81 | python tests/test_integration.py --timeout 300 82 | 83 | # JSON output for CI/CD integration 84 | python tests/test_integration.py --json > test_results.json 85 | ``` 86 | 87 | ## 📊 Output Formats 88 | 89 | ### Standard Output 90 | Beautiful terminal output with: 91 | - Progress indicators with spinners 92 | - Colored summary table by language 93 | - Success/failure statistics 94 | - Performance metrics (functions found, duration) 95 | - Error details for failed tests 96 | 97 | ### JSON Output 98 | Structured data perfect for CI/CD integration: 99 | ```json 100 | { 101 | "total_tests": 17, 102 | "passed": 17, 103 | "failed": 0, 104 | "success_rate": 100.0, 105 | "overall_success": true, 106 | "duration": 125.3, 107 | "by_language": { 108 | "Python": { 109 | "passed": 3, 110 | "total": 3, 111 | "results": [...] 
112 | } 113 | } 114 | } 115 | ``` 116 | 117 | ## 🔧 Configuration 118 | 119 | ### Environment Requirements 120 | - **GitProbe server**: Must be running on specified URL (default: `http://localhost:8000`) 121 | - **Dependencies**: `requests` and `rich` packages 122 | - **Network access**: Required for cloning public GitHub repositories 123 | - **Disk space**: Temporary clones are created and cleaned up automatically 124 | 125 | ### Timeout Settings 126 | - **Default**: 120 seconds per repository 127 | - **Large repos**: Consider increasing to 300+ seconds for repositories like kubernetes or rust 128 | - **Quick tests**: Usually complete in 30-60 seconds 129 | 130 | ### Server Health Check 131 | The test suite automatically: 132 | 1. Checks if GitProbe server is running 133 | 2. Validates server health endpoint 134 | 3. Provides clear error messages if server is unavailable 135 | 136 | ## 🎯 Test Success Criteria 137 | 138 | A repository test is considered **successful** if: 139 | - ✅ HTTP 200 response from GitProbe API 140 | - ✅ At least 1 function detected in the codebase 141 | - ✅ No error status in the response 142 | - ✅ Analysis completes within timeout period 143 | 144 | ## 🔍 Troubleshooting 145 | 146 | ### Common Issues 147 | 148 | **Server not running:** 149 | ``` 150 | ❌ GitProbe server is not running or unhealthy 151 | Start server with: ./gitprobe server 152 | ``` 153 | *Solution*: Start GitProbe server in another terminal 154 | 155 | **Timeout errors:** 156 | ``` 157 | ❌ rust/rust: Timeout 158 | ``` 159 | *Solution*: Increase timeout with `--timeout 300` for large repositories 160 | 161 | **No functions detected:** 162 | ``` 163 | ❌ python/someproject: No functions detected 164 | ``` 165 | *Possible causes*: 166 | - Repository has no supported files 167 | - Tree-sitter parser failed to initialize 168 | - Repository structure not recognized 169 | 170 | **Network issues:** 171 | ``` 172 | ❌ python/requests: HTTP 500 173 | ``` 174 | *Solution*: Check internet 
connection and GitHub API limits 175 | 176 | ### Debug Mode 177 | 178 | For detailed debugging, combine flags: 179 | ```bash 180 | python tests/test_integration.py --verbose --language python --timeout 300 181 | ``` 182 | 183 | ## 🚦 CI/CD Integration 184 | 185 | Perfect for continuous integration pipelines: 186 | 187 | ```yaml 188 | # GitHub Actions example 189 | - name: Run GitProbe Integration Tests 190 | run: | 191 | ./gitprobe server & 192 | sleep 10 # Wait for server startup 193 | python tests/test_integration.py --quick --json > results.json 194 | 195 | - name: Check Test Results 196 | run: | 197 | if jq -e '.overall_success == false' results.json; then 198 | echo "Tests failed" 199 | exit 1 200 | fi 201 | ``` 202 | 203 | ## 🏗️ Architecture 204 | 205 | ### Test Structure 206 | - **TestResult**: Dataclass for individual repository results 207 | - **GitProbeIntegrationTests**: Main test runner class 208 | - **Progress tracking**: Real-time progress with rich library 209 | - **Error handling**: Comprehensive timeout and exception handling 210 | 211 | ### Repository Management 212 | - Repositories are cloned by GitProbe server 213 | - Temporary directories are automatically cleaned up 214 | - No local storage required for test suite 215 | 216 | ### Extensibility 217 | - Easy to add new test repositories 218 | - Simple language addition process 219 | - Configurable test sets (quick vs. comprehensive) 220 | 221 | ## 📈 Performance Benchmarks 222 | 223 | Typical execution times on modern hardware: 224 | 225 | | Test Set | Repositories | Duration | Use Case | 226 | |----------|-------------|----------|----------| 227 | | Quick | 6 repos | 30-60s | Development, quick validation | 228 | | Full | ~25 repos | 5-15min | CI/CD, comprehensive testing | 229 | | Single Language | 3-4 repos | 1-3min | Language-specific debugging | 230 | 231 | ## 🤝 Contributing 232 | 233 | To add new test repositories: 234 | 235 | 1. 
Add to appropriate language section in `TEST_REPOSITORIES` 236 | 2. Include description for context 237 | 3. Test with `--language ` first 238 | 4. Consider adding to `QUICK_TEST_SET` if it's a good representative 239 | 240 | Example: 241 | ```python 242 | "Python": [ 243 | ("https://github.com/new/repository", "Description of what it tests"), 244 | # ... existing repos 245 | ] 246 | ``` -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # GitProbe 2 | 3 | [![License](https://img.shields.io/badge/license-MIT-blue.svg)](https://github.com/your-org/gitprobe/blob/main/LICENSE) 4 | [![Python 3.8+](https://img.shields.io/badge/python-3.8+-blue.svg)](https://www.python.org/downloads/) 5 | [![FastAPI](https://img.shields.io/badge/FastAPI-0.115+-green.svg)](https://fastapi.tiangolo.com/) 6 | 7 | Turn any GitHub repository into comprehensive code analysis with interactive call graphs and multi-language support. 
8 | 9 | ## 🚀 Features 10 | 11 | - **Multi-language Analysis**: Support for Python, JavaScript, TypeScript, Rust, Go, C, and C++ 12 | - **Tree-sitter Powered**: Advanced syntax parsing with tree-sitter for accurate code analysis 13 | - **Call Graph Generation**: Interactive visualizations showing function relationships 14 | - **Web API**: RESTful API for integration with other tools and frontends 15 | - **Real-time Analysis**: Live progress tracking and results 16 | - **Repository Insights**: File structure, function counts, and relationship mapping 17 | - **LLM-Ready Output**: Structured JSON optimized for AI analysis 18 | 19 | ## 📸 Preview 20 | 21 | ![GitProbe Preview](docs/preview.png) 22 | 23 | *GitProbe's interactive call graph visualization showing function relationships and code structure analysis* 24 | 25 | ## 📚 Requirements 26 | 27 | - Python 3.8+ 28 | - Git (for repository cloning) 29 | - Internet access for GitHub repository analysis 30 | 31 | ## 📦 Installation 32 | 33 | ```bash 34 | # Clone the repository 35 | git clone https://github.com/your-org/gitprobe.git 36 | cd gitprobe 37 | 38 | # Create virtual environment 39 | python -m venv env 40 | source env/bin/activate # On Windows: env\Scripts\activate 41 | 42 | # Install dependencies 43 | pip install -r requirements.txt 44 | ``` 45 | 46 | ## 💡 Command line usage 47 | 48 | ### Start the Web Server 49 | 50 | ```bash 51 | # Start GitProbe server 52 | ./gitprobe server 53 | 54 | # Server will be available at http://localhost:8000 55 | # API documentation at http://localhost:8000/docs 56 | ``` 57 | 58 | ### CLI Analysis (Legacy) 59 | 60 | ```bash 61 | # Analyze a GitHub repository 62 | python -m gitprobe https://github.com/user/repository 63 | 64 | # With custom output directory 65 | python -m gitprobe https://github.com/user/repository --output ./analysis/ 66 | ``` 67 | 68 | ## 🌐 Web API Usage 69 | 70 | ### Analyze Repository 71 | 72 | ```bash 73 | # Start analysis 74 | curl -X POST 
"http://localhost:8000/analyze" \ 75 | -H "Content-Type: application/json" \ 76 | -d '{"github_url": "https://github.com/psf/requests"}' 77 | ``` 78 | 79 | ### Python API Client 80 | 81 | ```python 82 | import requests 83 | 84 | # Analyze repository 85 | response = requests.post("http://localhost:8000/analyze", json={ 86 | "github_url": "https://github.com/psf/requests", 87 | "include_patterns": ["*.py"], 88 | "exclude_patterns": ["*test*", "docs/"] 89 | }) 90 | 91 | result = response.json() 92 | print(f"Found {result['data']['summary']['total_functions']} functions") 93 | print(f"Languages: {result['data']['summary']['languages_analyzed']}") 94 | ``` 95 | 96 | ### Example Response 97 | 98 | ```json 99 | { 100 | "status": "success", 101 | "data": { 102 | "summary": { 103 | "total_functions": 235, 104 | "total_calls": 657, 105 | "languages_analyzed": ["python"], 106 | "files_analyzed": 45 107 | }, 108 | "functions": [...], 109 | "relationships": [...], 110 | "visualization": { 111 | "cytoscape": {...} 112 | } 113 | } 114 | } 115 | ``` 116 | 117 | ## 🧪 Testing 118 | 119 | GitProbe includes a comprehensive integration test suite that validates all language analyzers: 120 | 121 | ```bash 122 | # Install test dependencies 123 | pip install rich 124 | 125 | # Run quick tests (1 repo per language) 126 | python tests/test_integration.py --quick 127 | 128 | # Test all languages comprehensive 129 | python tests/test_integration.py 130 | 131 | # Test specific language 132 | python tests/test_integration.py --language python 133 | 134 | # Verbose output with detailed progress 135 | python tests/test_integration.py --verbose 136 | 137 | # JSON output for CI/CD 138 | python tests/test_integration.py --json > results.json 139 | ``` 140 | 141 | ### Test Coverage 142 | 143 | - **Python**: rich, requests, flask, cpython 144 | - **JavaScript**: lodash, axios, express, node.js 145 | - **TypeScript**: vscode, typescript, angular 146 | - **Rust**: clap, ripgrep, rust compiler 147 | - 
**Go**: cobra, hugo, kubernetes 148 | - **C**: cJSON, libuv, curl 149 | - **C++**: fmt, catch2, protobuf 150 | 151 | ## 🏗️ Architecture 152 | 153 | ``` 154 | gitprobe/ 155 | ├── src/gitprobe/ 156 | │ ├── analysis/ # Core analysis engine 157 | │ │ ├── analysis_service.py 158 | │ │ ├── call_graph_analyzer.py 159 | │ │ └── repo_analyzer.py 160 | │ ├── analyzers/ # Language-specific parsers 161 | │ │ ├── python.py # Python tree-sitter analyzer 162 | │ │ ├── javascript.py # JavaScript/TypeScript analyzer 163 | │ │ ├── rust.py # Rust analyzer 164 | │ │ ├── go.py # Go analyzer 165 | │ │ ├── c_cpp.py # C/C++ analyzer 166 | │ │ └── ... 167 | │ ├── web/ # FastAPI web server 168 | │ │ └── server.py 169 | │ └── models/ # Data models 170 | │ └── ... 171 | ├── tests/ # Integration test suite 172 | │ ├── test_integration.py 173 | │ └── README.md 174 | └── requirements.txt 175 | ``` 176 | 177 | ## 🎯 Language Support 178 | 179 | | Language | Functions | Calls | Classes | Imports | Status | 180 | |------------|-----------|-------|---------|---------|--------| 181 | | Python | ✅ | ✅ | ✅ | ✅ | Stable | 182 | | JavaScript | ✅ | ✅ | ✅ | ✅ | Stable | 183 | | TypeScript | ✅ | ✅ | ✅ | ✅ | Stable | 184 | | Rust | ✅ | ✅ | ✅ | ✅ | Stable | 185 | | Go | ✅ | ✅ | ✅ | ✅ | Stable | 186 | | C | ✅ | ✅ | ❌ | ✅ | Stable | 187 | | C++ | ✅ | ✅ | ✅ | ✅ | Stable | 188 | 189 | ## 🔧 Configuration 190 | 191 | ### Environment Variables 192 | 193 | ```bash 194 | # Optional: Custom server configuration 195 | export GITPROBE_HOST=0.0.0.0 196 | export GITPROBE_PORT=8000 197 | ``` 198 | 199 | ### Analysis Options 200 | 201 | ```python 202 | # Include/exclude patterns 203 | { 204 | "github_url": "https://github.com/user/repo", 205 | "include_patterns": ["*.py", "*.js"], 206 | "exclude_patterns": ["*test*", "node_modules/", "__pycache__/"] 207 | } 208 | ``` 209 | 210 | ## 🤝 Contributing 211 | 212 | ### Running Tests 213 | 214 | ```bash 215 | # Start GitProbe server (in one terminal) 216 | ./gitprobe server 217 | 218 
| # Run integration tests (in another terminal) 219 | python tests/test_integration.py --quick 220 | ``` 221 | 222 | ### Adding New Languages 223 | 224 | 1. Create analyzer in `src/gitprobe/analyzers/` 225 | 2. Add tree-sitter language dependency to `requirements.txt` 226 | 3. Register analyzer in analysis service 227 | 4. Add test repositories to `tests/test_integration.py` 228 | 229 | ### Development Setup 230 | 231 | ```bash 232 | # Install in development mode 233 | pip install -e . 234 | 235 | # Install development dependencies 236 | pip install pytest black isort mypy 237 | 238 | # Run code formatting 239 | black . 240 | isort . 241 | ``` 242 | 243 | ## 🛠️ Stack 244 | 245 | - [Tree-sitter](https://tree-sitter.github.io/) - Syntax parsing and analysis 246 | - [FastAPI](https://fastapi.tiangolo.com/) - Web API framework 247 | - [Pydantic](https://docs.pydantic.dev/) - Data validation and modeling 248 | - [Rich](https://rich.readthedocs.io/) - Beautiful terminal output 249 | - [Cytoscape.js](https://cytoscape.org/) - Graph visualization (frontend) 250 | 251 | ## 🐛 Known Issues 252 | 253 | - Large repositories (>1000 functions) are limited to 900 functions for performance 254 | - Some complex C++ template syntax may not parse correctly 255 | - Private repositories require local cloning 256 | 257 | ## 📄 License 258 | 259 | This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details. 260 | 261 | --- 262 | 263 | **GitProbe** - Comprehensive multi-language code analysis with interactive call graphs. -------------------------------------------------------------------------------- /src/gitprobe/analyzers/python.py: -------------------------------------------------------------------------------- 1 | """ 2 | Python AST Analyzer 3 | 4 | Analyzes Python source code using the Abstract Syntax Tree (AST) to extract 5 | function definitions, method information, and function call relationships. 
6 | """ 7 | 8 | import ast 9 | import logging 10 | from typing import List, Tuple, Optional 11 | from pathlib import Path 12 | 13 | from gitprobe.models.core import Function, CallRelationship 14 | from gitprobe.core.analysis_limits import AnalysisLimits, create_python_limits 15 | 16 | logger = logging.getLogger(__name__) 17 | 18 | 19 | class PythonASTAnalyzer(ast.NodeVisitor): 20 | """ 21 | AST visitor to extract function information from Python code. 22 | 23 | This analyzer traverses Python AST nodes to identify: 24 | - Function and method definitions 25 | - Function parameters and docstrings 26 | - Function call relationships 27 | - Class context for methods 28 | - Code snippets and line numbers 29 | """ 30 | 31 | def __init__(self, file_path: str, content: str, limits: Optional[AnalysisLimits] = None): 32 | """ 33 | Initialize the Python AST analyzer. 34 | 35 | Args: 36 | file_path: Path to the Python file being analyzed 37 | content: Raw content of the Python file 38 | limits: Analysis limits configuration 39 | """ 40 | self.file_path = file_path 41 | self.content = content 42 | self.lines = content.splitlines() 43 | self.functions: List[Function] = [] 44 | self.call_relationships: List[CallRelationship] = [] 45 | self.current_class_name: str | None = None 46 | self.current_function_name: str | None = None 47 | self.limits = limits or create_python_limits() 48 | 49 | def generic_visit(self, node): 50 | """Override generic_visit to continue AST traversal with limit checks.""" 51 | if self.limits.should_stop(): 52 | return 53 | super().generic_visit(node) 54 | 55 | def visit_ClassDef(self, node: ast.ClassDef): 56 | """Visit class definition and track current class context.""" 57 | if self.limits.should_stop(): 58 | return 59 | 60 | if self.limits.increment(): 61 | return 62 | 63 | self.current_class_name = node.name 64 | self.generic_visit(node) 65 | self.current_class_name = None 66 | 67 | def _process_function_node(self, node: ast.FunctionDef | 
ast.AsyncFunctionDef): 68 | """Helper to process both sync and async function definitions.""" 69 | if self.limits.should_stop(): 70 | return 71 | 72 | if self.limits.increment(): 73 | return 74 | 75 | self.current_function_name = node.name 76 | 77 | function_obj = Function( 78 | name=node.name, 79 | file_path=str(self.file_path), 80 | line_start=node.lineno, 81 | line_end=node.end_lineno, 82 | parameters=[arg.arg for arg in node.args.args], 83 | docstring=ast.get_docstring(node), 84 | is_method=self.current_class_name is not None, 85 | class_name=self.current_class_name, 86 | code_snippet="\n".join(self.lines[node.lineno - 1 : node.end_lineno or node.lineno]), 87 | ) 88 | 89 | if self._should_include_function(function_obj): 90 | if self.limits.can_add_function(): 91 | self.functions.append(function_obj) 92 | if self.limits.add_function(): 93 | return 94 | else: 95 | return 96 | 97 | self.generic_visit(node) 98 | self.current_function_name = None 99 | 100 | def _should_include_function(self, func: Function) -> bool: 101 | """Determine if a function should be included in analysis.""" 102 | if func.name.startswith("_test_") or func.name in ["setUp", "tearDown"]: 103 | return False 104 | 105 | return True 106 | 107 | def visit_FunctionDef(self, node: ast.FunctionDef): 108 | """Visit function definition and extract function information.""" 109 | if self.limits.should_stop(): 110 | return 111 | self._process_function_node(node) 112 | 113 | def visit_AsyncFunctionDef(self, node: ast.AsyncFunctionDef): 114 | """Visit async function definition and extract function information.""" 115 | if self.limits.should_stop(): 116 | return 117 | self._process_function_node(node) 118 | 119 | def visit_Call(self, node: ast.Call): 120 | """Visit function call nodes and record relationships.""" 121 | if self.limits.should_stop(): 122 | return 123 | 124 | if self.limits.increment(): 125 | return 126 | 127 | if self.current_function_name: 128 | call_name = self._get_call_name(node.func) 129 
| if call_name: 130 | if self.limits.can_add_relationship(): 131 | relationship = CallRelationship( 132 | caller=f"{self.file_path}:{self.current_function_name}", 133 | callee=call_name, 134 | call_line=node.lineno, 135 | is_resolved=False, 136 | ) 137 | self.call_relationships.append(relationship) 138 | if self.limits.add_relationship(): 139 | return 140 | else: 141 | return 142 | self.generic_visit(node) 143 | 144 | def _get_call_name(self, node) -> str | None: 145 | """ 146 | Extract function name from a call node. 147 | Handles simple names, attributes (obj.method), and filters built-ins. 148 | """ 149 | PYTHON_BUILTINS = { 150 | "print", 151 | "len", 152 | "str", 153 | "int", 154 | "float", 155 | "bool", 156 | "list", 157 | "dict", 158 | "range", 159 | "enumerate", 160 | "zip", 161 | "isinstance", 162 | "hasattr", 163 | "open", 164 | "super", 165 | "__import__", 166 | } 167 | 168 | if isinstance(node, ast.Name): 169 | if node.id in PYTHON_BUILTINS: 170 | return None 171 | return node.id 172 | elif isinstance(node, ast.Attribute): 173 | if isinstance(node.value, ast.Name): 174 | return f"{node.value.id}.{node.attr}" 175 | return node.attr 176 | return None 177 | 178 | def analyze(self): 179 | """Analyze the Python file and extract functions and relationships.""" 180 | if not self.limits.start_new_file(): 181 | logger.info(f"Skipping {self.file_path} - global limits reached") 182 | return 183 | 184 | try: 185 | tree = ast.parse(self.content) 186 | self.visit(tree) 187 | 188 | logger.info( 189 | f"Python analysis complete for {self.file_path}: {len(self.functions)} functions, " 190 | f"{len(self.call_relationships)} relationships, " 191 | f"nodes_processed={self.limits.nodes_processed}" 192 | ) 193 | except SyntaxError as e: 194 | logger.warning(f"⚠️ Could not parse {self.file_path}: {e}") 195 | except Exception as e: 196 | logger.error(f"⚠️ Error analyzing {self.file_path}: {e}", exc_info=True) 197 | 198 | 199 | def analyze_python_file( 200 | file_path: str, 
"""
GitProbe Utility Functions
Repository cloning and cleanup utilities.
"""

import os
import shutil
import tempfile
import subprocess
import stat
import time
from typing import Optional

# Resolved once at import time; None when git is not installed.
GIT_EXECUTABLE_PATH = shutil.which("git")


def sanitize_github_url(github_url: str) -> str:
    """
    Sanitize a GitHub URL into canonical ``<protocol>github.com/<owner>/<repo>`` form.

    Accepts full URLs (with or without protocol / ``www.``), ``owner/repo``
    shorthand, and URLs with extra path components (e.g. ``/tree/main``).
    Inputs that cannot be parsed are returned unchanged.

    Args:
        github_url: Raw GitHub URL or repository path.

    Returns:
        str: Sanitized GitHub URL suitable for cloning.
    """
    url = github_url.strip()

    # Strip the protocol but remember it so http:// inputs round-trip.
    protocol = "https://"
    if url.startswith("https://"):
        url = url[8:]
    elif url.startswith("http://"):
        url = url[7:]
        protocol = "http://"

    if url.startswith("www."):
        url = url[4:]

    if url.startswith("github.com/"):
        url_parts = url.split("/")
        if len(url_parts) >= 3:
            owner = url_parts[1]
            repo = url_parts[2]
        else:
            return github_url
    elif "/" in url and not url.startswith("github.com"):
        # "owner/repo" shorthand.
        url_parts = url.split("/")
        if len(url_parts) >= 2:
            owner = url_parts[0]
            repo = url_parts[1]
        else:
            return github_url
    else:
        return github_url

    if repo.endswith(".git"):
        repo = repo[:-4]

    return f"{protocol}github.com/{owner}/{repo}"


def clone_repository(github_url: str) -> str:
    """
    Clone a GitHub repository to a temporary directory.

    Uses a shallow, blobless clone (``--depth 1 --filter=blob:none``) to keep
    downloads small, with a 5-minute timeout.

    Args:
        github_url: GitHub repository URL (sanitized automatically).

    Returns:
        str: Path to the cloned repository directory.

    Raises:
        RuntimeError: If cloning fails, times out, or git is not found.
    """
    if not GIT_EXECUTABLE_PATH:
        raise RuntimeError(
            "Git executable not found. Please install Git and ensure it is in the system's PATH."
        )

    sanitized_url = sanitize_github_url(github_url)

    temp_dir = tempfile.mkdtemp(prefix="gitprobe_")

    try:
        if os.name == "nt":
            # Best effort: long paths are common inside cloned repos on Windows.
            try:
                subprocess.run(
                    [
                        GIT_EXECUTABLE_PATH,
                        "config",
                        "--global",
                        "core.longpaths",
                        "true",
                    ],
                    capture_output=True,
                    text=True,
                )
            except Exception:
                # Narrowed from a bare `except:`; failure here is non-fatal.
                pass

        subprocess.run(
            [
                GIT_EXECUTABLE_PATH,
                "clone",
                "--depth",
                "1",
                "--filter=blob:none",
                sanitized_url,
                temp_dir,
            ],
            check=True,
            capture_output=True,
            text=True,
            timeout=300,
        )

        if os.name == "nt":
            # Best-effort sparse checkout to dodge pathological paths on Windows.
            # NOTE(review): the two hard-coded exclusion patterns below are
            # opaque magic values — confirm they are intentional.
            try:
                subprocess.run(
                    [
                        GIT_EXECUTABLE_PATH,
                        "-C",
                        temp_dir,
                        "config",
                        "core.sparseCheckout",
                        "true",
                    ],
                    capture_output=True,
                    text=True,
                )

                sparse_checkout_path = os.path.join(temp_dir, ".git", "info", "sparse-checkout")
                os.makedirs(os.path.dirname(sparse_checkout_path), exist_ok=True)
                with open(sparse_checkout_path, "w") as f:
                    f.write("*\n")
                    f.write("!**/tests/**/CvnF9nAXfESwhrtdkjGhX2wAkKHzwr8N2rjExPK8eZYS/**\n")
                    f.write(
                        "!**/0x0000000000000000000000000000000000000000000000000000000000000002/**\n"
                    )

                subprocess.run(
                    [
                        GIT_EXECUTABLE_PATH,
                        "-C",
                        temp_dir,
                        "read-tree",
                        "-m",
                        "-u",
                        "HEAD",
                    ],
                    capture_output=True,
                    text=True,
                )
            except Exception:
                # Narrowed from a bare `except:`; sparse checkout is optional.
                pass
        return temp_dir
    except subprocess.TimeoutExpired:
        if os.path.exists(temp_dir):
            cleanup_repository_safe(temp_dir)
        raise RuntimeError(
            "Repository cloning timed out after 5 minutes. The repository may be too large or network is slow."
        )
    except subprocess.CalledProcessError as e:
        if os.path.exists(temp_dir):
            cleanup_repository_safe(temp_dir)
        raise RuntimeError(f"Failed to clone repository: {e.stderr}")
    except FileNotFoundError:
        if os.path.exists(temp_dir):
            cleanup_repository_safe(temp_dir)
        raise RuntimeError(
            f"Git executable not found at '{GIT_EXECUTABLE_PATH}'. "
            "Please ensure Git is installed and the path is correct."
        )


def cleanup_repository_safe(repo_dir: str) -> bool:
    """
    Windows-safe removal of a cloned repository directory.

    Handles read-only files (common under ``.git`` on Windows) and retries
    once after force-clearing permissions if the first removal fails.

    Args:
        repo_dir: Path to the repository directory to remove.

    Returns:
        bool: True if the directory was removed, False otherwise.
    """

    def handle_remove_readonly(func, path, exc):
        """rmtree error handler: clear the read-only bit and retry the op."""
        if os.path.exists(path):
            os.chmod(path, stat.S_IWRITE)
            func(path)

    try:
        if os.path.exists(repo_dir):
            if os.name == "nt":
                shutil.rmtree(repo_dir, onerror=handle_remove_readonly)
            else:
                shutil.rmtree(repo_dir)
            return True
        return False
    except PermissionError:
        # Retry once after clearing read-only bits on everything in the tree.
        try:
            time.sleep(1)
            if os.path.exists(repo_dir):
                for root, dirs, files in os.walk(repo_dir):
                    for dirname in dirs:
                        os.chmod(os.path.join(root, dirname), stat.S_IWRITE)
                    for filename in files:
                        file_path = os.path.join(root, filename)
                        if os.path.exists(file_path):
                            os.chmod(file_path, stat.S_IWRITE)
                shutil.rmtree(repo_dir)
            return True
        except Exception as retry_e:
            print(f"⚠️ Warning: Failed to cleanup {repo_dir} after retry: {str(retry_e)}")
            return False
    except Exception as e:
        print(f"⚠️ Warning: Failed to cleanup {repo_dir}: {str(e)}")
        return False


def cleanup_repository(repo_dir: str) -> bool:
    """
    Remove a cloned repository directory (backward-compatible wrapper).

    Args:
        repo_dir: Path to the repository directory to remove.

    Returns:
        bool: True if cleanup successful, False otherwise.
    """
    return cleanup_repository_safe(repo_dir)
def parse_github_url(github_url: str) -> dict:
    """
    Parse a GitHub URL into owner / repository name components.

    Args:
        github_url: GitHub repository URL.

    Returns:
        dict: Keys ``owner``, ``name``, ``full_name``, ``url``; components
        fall back to ``"unknown"`` when the URL cannot be split.
    """
    parts = github_url.rstrip("/").split("/")
    if len(parts) >= 2:
        owner = parts[-2]
        name = parts[-1]
        # Strip only a trailing ".git" suffix (bug fix: str.replace removed
        # every ".git" occurrence, mangling names like "x.git-tools").
        if name.endswith(".git"):
            name = name[:-4]
        return {
            "owner": owner,
            "name": name,
            "full_name": f"{owner}/{name}",
            "url": github_url,
        }
    return {
        "owner": "unknown",
        "name": "unknown",
        "full_name": "unknown",
        "url": github_url,
    }


# --- Shared Analysis Limits --------------------------------------------------
# Common analysis limits and performance controls used across all language
# analyzers, ensuring consistent behavior and resource management.

import time
import logging
from typing import Optional

logger = logging.getLogger(__name__)


class GlobalLimitTracker:
    """
    Global limit tracker shared across ALL language analyzers.

    Enforces hard caps on the total number of functions and relationships
    collected across the entire analysis run.
    """

    def __init__(self, max_total_functions: int = 5000, max_total_relationships: int = 8000):
        # Hard caps for the whole analysis run.
        self.max_total_functions = max_total_functions
        self.max_total_relationships = max_total_relationships
        # Running totals accumulated by every analyzer instance.
        self.total_functions = 0
        self.total_relationships = 0
        # Latched once either cap is hit; never cleared for this tracker.
        self.global_limit_reached = False

    def can_add_function(self) -> bool:
        """Return True if another function fits under the global cap."""
        if self.global_limit_reached:
            return False
        return self.total_functions < self.max_total_functions

    def can_add_relationship(self) -> bool:
        """Return True if another relationship fits under the global cap."""
        if self.global_limit_reached:
            return False
        return self.total_relationships < self.max_total_relationships

    def add_function(self) -> bool:
        """Count one function; return True once the global limit is reached."""
        if self.global_limit_reached:
            return True

        self.total_functions += 1
        if self.total_functions >= self.max_total_functions:
            logger.warning(f"Global function limit reached: {self.max_total_functions}")
            self.global_limit_reached = True
            return True
        return False

    def add_relationship(self) -> bool:
        """Count one relationship; return True once the global limit is reached."""
        if self.global_limit_reached:
            return True

        self.total_relationships += 1
        if self.total_relationships >= self.max_total_relationships:
            logger.warning(f"Global relationship limit reached: {self.max_total_relationships}")
            self.global_limit_reached = True
            return True
        return False

    def should_stop(self) -> bool:
        """Return True when analysis should stop due to global limits."""
        return self.global_limit_reached


# Process-wide singleton; created lazily by get_global_tracker().
_global_tracker = None


def get_global_tracker() -> GlobalLimitTracker:
    """Return the process-wide GlobalLimitTracker, creating it on first use."""
    global _global_tracker
    if _global_tracker is None:
        _global_tracker = GlobalLimitTracker()
    return _global_tracker
def reset_global_tracker():
    """Reset the global tracker (for testing or new analysis runs)."""
    global _global_tracker
    _global_tracker = GlobalLimitTracker()


class AnalysisLimits:
    """
    Unified analysis limits for all language analyzers.

    Provides consistent resource management and performance controls across
    Python, JavaScript, TypeScript, Go, Rust, and C/C++ analyzers.

    Combines per-file caps (node count, wall-clock time) with global caps
    (file count, total time, plus the shared function/relationship budget)
    so analysis samples broadly across a codebase instead of exhaustively
    walking a few large files, and stays fast enough for real-time LLM
    interactions.
    """

    def __init__(
        self,
        max_nodes_per_file: int = 3000,
        max_time_per_file: float = 15.0,
        max_files_analyzed: int = 999999,
        max_total_time: float = 180.0,
        language: str = "unknown",
    ):
        # Per-file budgets.
        self.max_nodes_per_file = max_nodes_per_file
        self.max_time_per_file = max_time_per_file
        # Global budgets for the whole run of this analyzer.
        self.max_files_analyzed = max_files_analyzed
        self.max_total_time = max_total_time
        self.language = language

        # Per-file progress; reset by start_new_file().
        self.nodes_processed = 0
        self.start_time: Optional[float] = None
        self.limit_reached = False

        # Cross-file progress.
        self.files_analyzed = 0
        self.global_start_time: Optional[float] = None
        self.global_limit_reached = False

        # Function/relationship budget shared by ALL language analyzers.
        self.global_tracker = get_global_tracker()

    def start_new_file(self) -> bool:
        """
        Begin analysis of a new file.

        Resets the per-file counters and returns True when analysis may
        proceed; returns False (latching the relevant flag) when a global
        limit forbids further work.
        """
        if self.global_tracker.should_stop():
            logger.info(f"Skipping {self.language} file - global analysis limits reached")
            return False

        now = time.time()
        if self.global_start_time is None:
            self.global_start_time = now

        if self.files_analyzed >= self.max_files_analyzed:
            logger.info(
                f"Skipping {self.language} file - reached global file limit: {self.max_files_analyzed}"
            )
            self.global_limit_reached = True
            return False

        if now - self.global_start_time >= self.max_total_time:
            logger.info(
                f"Skipping {self.language} file - reached global time limit: {self.max_total_time}s"
            )
            self.global_limit_reached = True
            return False

        self.nodes_processed = 0
        self.start_time = now
        self.limit_reached = False
        self.files_analyzed += 1
        return True

    def increment(self) -> bool:
        """
        Count one processed AST node and re-check every limit.

        Returns True as soon as any per-file or global limit is exceeded,
        signalling the caller to stop analyzing.
        """
        if self.limit_reached or self.global_limit_reached:
            return True

        self.nodes_processed += 1
        now = time.time()

        if self.start_time is not None and now - self.start_time >= self.max_time_per_file:
            logger.debug(
                f"{self.language} analysis hit per-file time limit: {self.max_time_per_file}s"
            )
            self.limit_reached = True
            return True

        if self.nodes_processed >= self.max_nodes_per_file:
            logger.debug(
                f"{self.language} analysis hit per-file node limit: {self.max_nodes_per_file} nodes"
            )
            self.limit_reached = True
            return True

        if self.files_analyzed >= self.max_files_analyzed:
            logger.warning(
                f"{self.language} analysis hit global file limit: {self.max_files_analyzed} files"
            )
            self.global_limit_reached = True
            return True

        if (
            self.global_start_time is not None
            and now - self.global_start_time >= self.max_total_time
        ):
            logger.warning(
                f"{self.language} analysis hit global time limit: {self.max_total_time}s"
            )
            self.global_limit_reached = True
            return True

        return False

    def should_stop(self) -> bool:
        """True when any per-file, global, or shared-tracker limit was hit."""
        return self.limit_reached or self.global_limit_reached or self.global_tracker.should_stop()

    def can_add_function(self) -> bool:
        """True when the shared budget allows recording another function."""
        return not self.should_stop() and self.global_tracker.can_add_function()

    def can_add_relationship(self) -> bool:
        """True when the shared budget allows recording another relationship."""
        return not self.should_stop() and self.global_tracker.can_add_relationship()

    def add_function(self) -> bool:
        """Record a function against the shared budget; True if now exhausted."""
        return self.global_tracker.add_function()

    def add_relationship(self) -> bool:
        """Record a relationship against the shared budget; True if now exhausted."""
        return self.global_tracker.add_relationship()

    def get_stats(self) -> dict:
        """Return a snapshot of per-file and global progress counters."""
        now = time.time()
        global_elapsed = now - self.global_start_time if self.global_start_time else 0.0
        file_elapsed = now - self.start_time if self.start_time else 0.0

        return {
            "language": self.language,
            "files_analyzed": self.files_analyzed,
            "max_files": self.max_files_analyzed,
            "global_time_elapsed": round(global_elapsed, 2),
            "max_global_time": self.max_total_time,
            "current_file_nodes": self.nodes_processed,
            "max_nodes_per_file": self.max_nodes_per_file,
            "current_file_time": round(file_elapsed, 2),
            "max_time_per_file": self.max_time_per_file,
            "limit_reached": self.limit_reached,
            "global_limit_reached": self.global_limit_reached,
        }

    def __str__(self) -> str:
        """Compact representation for log lines."""
        return (
            f"AnalysisLimits({self.language}: "
            f"{self.max_nodes_per_file} nodes/file, "
            f"{self.max_time_per_file}s/file, "
            f"{self.max_total_time}s total)"
        )


def create_python_limits() -> AnalysisLimits:
    """Create analysis limits optimized for Python files."""
    return AnalysisLimits(
        max_nodes_per_file=300, max_time_per_file=5.0, max_total_time=60.0, language="python"
    )


def create_javascript_limits() -> AnalysisLimits:
    """Create analysis limits optimized for JavaScript/TypeScript files."""
    return AnalysisLimits(
        max_nodes_per_file=250, max_time_per_file=3.0, max_total_time=45.0, language="javascript"
    )
def create_go_limits() -> AnalysisLimits:
    """Create analysis limits optimized for Go files."""
    return AnalysisLimits(
        max_nodes_per_file=200, max_time_per_file=3.0, max_total_time=30.0, language="go"
    )


def create_rust_limits() -> AnalysisLimits:
    """Create analysis limits optimized for Rust files."""
    return AnalysisLimits(
        max_nodes_per_file=200, max_time_per_file=4.0, max_total_time=30.0, language="rust"
    )


def create_c_cpp_limits() -> AnalysisLimits:
    """Create analysis limits optimized for C/C++ files."""
    return AnalysisLimits(
        max_nodes_per_file=200, max_time_per_file=4.0, max_total_time=30.0, language="c_cpp"
    )
class AnalysisService:
    """
    Centralized analysis service supporting multiple programming languages.

    Orchestrates the complete analysis workflow:
    1. Repository cloning and validation
    2. File structure analysis with filtering
    3. Multi-language AST parsing and call graph generation
    4. Result consolidation and cleanup

    Supports Python, JavaScript/TypeScript, C/C++, Go, and Rust; structured
    so additional languages can be added.
    """

    def __init__(self):
        """Initialize the analysis service with language-specific analyzers."""
        self.call_graph_analyzer = CallGraphAnalyzer()
        # Clone directories created by this service; removed in
        # _cleanup_repository / cleanup_all.
        self._temp_directories = []

    def analyze_repository_full(
        self,
        github_url: str,
        include_patterns: Optional[List[str]] = None,
        exclude_patterns: Optional[List[str]] = None,
    ) -> AnalysisResult:
        """
        Perform complete repository analysis including call graph generation.

        Args:
            github_url: GitHub repository URL to analyze
            include_patterns: File patterns to include (e.g., ['*.py', '*.js'])
            exclude_patterns: Additional patterns to exclude

        Returns:
            AnalysisResult: Complete analysis with functions, relationships, and visualization

        Raises:
            ValueError: If GitHub URL is invalid
            RuntimeError: If analysis fails
        """
        temp_dir = None
        try:
            logger.info(f"Starting full analysis of {github_url}")

            temp_dir = self._clone_repository(github_url)
            repo_info = self._parse_repository_info(github_url)

            logger.info("Analyzing repository file structure...")
            structure_result = self._analyze_structure(temp_dir, include_patterns, exclude_patterns)
            logger.info(f"Found {structure_result['summary']['total_files']} files to analyze.")

            logger.info("Starting call graph analysis...")
            call_graph_result = self._analyze_call_graph(structure_result["file_tree"], temp_dir)
            logger.info(
                f"Call graph analysis complete. Found {call_graph_result['call_graph']['total_functions']} functions."
            )

            readme_content = self._read_readme_file(temp_dir)

            analysis_result = AnalysisResult(
                repository=Repository(
                    url=repo_info["url"],
                    name=repo_info["name"],
                    clone_path=temp_dir,
                    analysis_id=f"{repo_info['owner']}-{repo_info['name']}",
                ),
                functions=call_graph_result["functions"],
                relationships=call_graph_result["relationships"],
                file_tree=structure_result["file_tree"],
                summary={
                    **structure_result["summary"],
                    **call_graph_result["call_graph"],
                    "analysis_type": "full",
                    "languages_analyzed": call_graph_result["call_graph"]["languages_found"],
                },
                visualization=call_graph_result["visualization"],
                readme_content=readme_content,
            )

            logger.info(f"Cleaning up temporary repository directory: {temp_dir}")
            self._cleanup_repository(temp_dir)

            logger.info(
                f"Analysis completed: {analysis_result.summary['total_functions']} functions found"
            )
            return analysis_result

        except Exception as e:
            logger.error(f"Analysis failed: {str(e)}", exc_info=True)
            # temp_dir is always bound (initialized to None above), so the old
            # `"temp_dir" in locals()` membership test was always true and
            # Path(temp_dir) raised TypeError on None when cloning itself
            # failed, masking the original error. Check the value instead.
            if temp_dir and Path(temp_dir).exists():
                self._cleanup_repository(temp_dir)
            # Chain the cause so the triggering exception is preserved
            # (consistent with analyze_repository_structure_only).
            raise RuntimeError(f"Repository analysis failed: {str(e)}") from e

    def analyze_repository_structure_only(
        self,
        github_url: str,
        include_patterns: Optional[List[str]] = None,
        exclude_patterns: Optional[List[str]] = None,
    ) -> Dict[str, Any]:
        """
        Perform lightweight structure-only analysis without call graph generation.

        Args:
            github_url: GitHub repository URL to analyze
            include_patterns: File patterns to include
            exclude_patterns: Additional patterns to exclude

        Returns:
            Dict: Repository structure with file tree and summary statistics

        Raises:
            RuntimeError: If cloning or structure analysis fails
        """
        temp_dir = None
        try:
            logger.info(f"Starting structure analysis of {github_url}")

            temp_dir = self._clone_repository(github_url)
            repo_info = self._parse_repository_info(github_url)

            structure_result = self._analyze_structure(temp_dir, include_patterns, exclude_patterns)

            result = {
                "repository": repo_info,
                "file_tree": structure_result["file_tree"],
                "file_summary": {
                    **structure_result["summary"],
                    "analysis_type": "structure_only",
                },
            }

            self._cleanup_repository(temp_dir)

            logger.info(
                f"Structure analysis completed: {result['file_summary']['total_files']} files found"
            )
            return result

        except Exception as e:
            if temp_dir:
                self._cleanup_repository(temp_dir)
            logger.error(f"Structure analysis failed for {github_url}: {str(e)}")
            raise RuntimeError(f"Structure analysis failed: {str(e)}") from e

    def _clone_repository(self, github_url: str) -> str:
        """Clone repository and return temp dir path (tracked for cleanup)."""
        logger.info(f"Cloning {github_url}...")
        temp_dir = clone_repository(github_url)
        logger.info(f"Repository cloned to {temp_dir}")
        self._temp_directories.append(temp_dir)
        return temp_dir

    def _parse_repository_info(self, github_url: str) -> Dict[str, str]:
        """Parse GitHub URL and extract repository metadata."""
        return parse_github_url(github_url)

    def _analyze_structure(
        self,
        repo_dir: str,
        include_patterns: Optional[List[str]],
        exclude_patterns: Optional[List[str]],
    ) -> Dict[str, Any]:
        """Analyze repository file structure with filtering."""
        logger.info(
            f"Initializing RepoAnalyzer with include: {include_patterns}, exclude: {exclude_patterns}"
        )
        repo_analyzer = RepoAnalyzer(include_patterns, exclude_patterns)
        return repo_analyzer.analyze_repository_structure(repo_dir)

    def _read_readme_file(self, repo_dir: str) -> Optional[str]:
        """
        Find and read the README file from the repository root.

        Returns the README text, or None when no README exists or the first
        match cannot be read safely. Reads go through assert_safe_path /
        safe_open_text to guard against unsafe paths.
        """
        base = Path(repo_dir)
        possible_readme_names = ["README.md", "README", "readme.md", "README.txt"]
        for name in possible_readme_names:
            p = base / name
            if p.exists():
                try:
                    assert_safe_path(base, p)
                    logger.info(f"Found README file at {p}")
                    return safe_open_text(base, p, encoding="utf-8")
                except Exception as e:
                    logger.warning(f"Skipping unsafe/ unreadable README at {p}: {e}")
                    return None
        logger.info("No README file found in repository root.")
        return None

    def _analyze_call_graph(self, file_tree: Dict[str, Any], repo_dir: str) -> Dict[str, Any]:
        """
        Perform multi-language call graph analysis.

        Extracts code files from the file tree, filters them to supported
        languages, and delegates to the call graph analyzer; annotates the
        result with the supported-language list and unsupported-file count.
        """
        logger.info("Extracting code files from file tree...")
        code_files = self.call_graph_analyzer.extract_code_files(file_tree)

        logger.info(f"Found {len(code_files)} total code files. Filtering for supported languages.")
        supported_files = self._filter_supported_languages(code_files)
        logger.info(f"Analyzing {len(supported_files)} supported files.")

        result = self.call_graph_analyzer.analyze_code_files(supported_files, repo_dir)

        result["call_graph"]["supported_languages"] = self._get_supported_languages()
        result["call_graph"]["unsupported_files"] = len(code_files) - len(supported_files)

        return result

    def _filter_supported_languages(self, code_files: List[Dict]) -> List[Dict]:
        """Filter code files to only include supported languages."""
        supported_languages = {
            "python",
            "javascript",
            "typescript",
            "c",
            "cpp",
            "go",
            "rust",
        }

        return [
            file_info
            for file_info in code_files
            if file_info.get("language") in supported_languages
        ]

    def _get_supported_languages(self) -> List[str]:
        """Get list of currently supported languages for analysis."""
        return ["python", "javascript", "typescript", "c", "cpp", "go", "rust"]

    def _cleanup_repository(self, temp_dir: str):
        """Clean up cloned repository and stop tracking it."""
        logger.info(f"Attempting to clean up {temp_dir}")
        cleanup_repository(temp_dir)
        if temp_dir in self._temp_directories:
            self._temp_directories.remove(temp_dir)

    def cleanup_all(self):
        """Clean up all tracked temporary directories."""
        for temp_dir in self._temp_directories[:]:
            self._cleanup_repository(temp_dir)

    def __del__(self):
        """Ensure cleanup on service destruction."""
        self.cleanup_all()


def analyze_repository(
    github_url: str, include_patterns=None, exclude_patterns=None
) -> tuple[AnalysisResult, None]:
    """
    Backward compatibility function.

    Returns:
        tuple: (AnalysisResult, None) - None instead of temp_dir since cleanup is handled internally
    """
    service = AnalysisService()
    result = service.analyze_repository_full(github_url, include_patterns, exclude_patterns)
    return result, None


def analyze_repository_structure_only(
    github_url: str, include_patterns=None, exclude_patterns=None
) -> tuple[Dict, None]:
    """
    Backward compatibility function.

    Returns:
        tuple: (structure_result, None) - None instead of temp_dir since cleanup is handled internally
    """
    service = AnalysisService()
    result = service.analyze_repository_structure_only(
        github_url, include_patterns, exclude_patterns
    )
    return result, None
@dataclass
class TestResult:
    """Result of a single repository test."""

    repo_name: str  # last path segment of the repo URL, e.g. "requests"
    language: str  # language group the repo was tested under
    success: bool  # True when functions were found and no error occurred
    functions: int  # total functions reported by the analysis
    calls: int  # total calls reported by the analysis
    error: Optional[str] = None  # error description when the test failed
    duration: float = 0.0  # wall-clock seconds the request took


class GitProbeIntegrationTests:
    """Main integration test runner for GitProbe analyzers."""

    # Curated test repositories by language
    TEST_REPOSITORIES = {
        "Python": [
            ("https://github.com/Textualize/rich", "Modern terminal formatting"),
            ("https://github.com/psf/requests", "HTTP library for humans"),
            ("https://github.com/pallets/flask", "Lightweight web framework"),
            ("https://github.com/python/cpython", "Python interpreter (large)"),
        ],
        "JavaScript": [
            ("https://github.com/lodash/lodash", "Modern utility library"),
            ("https://github.com/axios/axios", "Promise-based HTTP client"),
            ("https://github.com/expressjs/express", "Fast web framework"),
            ("https://github.com/nodejs/node", "Node.js runtime (large)"),
        ],
        "TypeScript": [
            ("https://github.com/microsoft/vscode", "Code editor (large)"),
            ("https://github.com/microsoft/TypeScript", "TypeScript compiler"),
            ("https://github.com/angular/angular", "Angular framework (large)"),
        ],
        "Rust": [
            ("https://github.com/clap-rs/clap", "Command line parser"),
            ("https://github.com/BurntSushi/ripgrep", "Fast grep alternative"),
            ("https://github.com/rust-lang/rust", "Rust compiler (very large)"),
        ],
        "Go": [
            ("https://github.com/spf13/cobra", "CLI library"),
            ("https://github.com/gohugoio/hugo", "Static site generator"),
            ("https://github.com/kubernetes/kubernetes", "Container orchestration (very large)"),
        ],
        "C": [
            ("https://github.com/DaveGamble/cJSON", "JSON parser in C"),
            ("https://github.com/libuv/libuv", "Cross-platform async I/O"),
            ("https://github.com/curl/curl", "Data transfer library"),
        ],
        "C++": [
            ("https://github.com/fmtlib/fmt", "Modern formatting library"),
            ("https://github.com/catchorg/Catch2", "Modern test framework"),
            ("https://github.com/protocolbuffers/protobuf", "Protocol buffers"),
        ]
    }

    # Quick subset for fast testing
    QUICK_TEST_SET = {
        "Python": [("https://github.com/psf/requests", "HTTP library")],
        "JavaScript": [("https://github.com/axios/axios", "HTTP client")],
        "Rust": [("https://github.com/clap-rs/clap", "CLI parser")],
        "Go": [("https://github.com/spf13/cobra", "CLI library")],
        "C": [("https://github.com/DaveGamble/cJSON", "JSON parser")],
        "C++": [("https://github.com/fmtlib/fmt", "Formatting library")],
    }

    def __init__(self, server_url: str = "http://localhost:8000", timeout: int = 120):
        """Initialize test runner.

        Args:
            server_url: Base URL of a running GitProbe server.
            timeout: Per-repository request timeout in seconds.
        """
        self.server_url = server_url
        self.timeout = timeout
        self.console = Console()
        self.results: List[TestResult] = []

    def check_server_health(self) -> bool:
        """Check if GitProbe server is running and healthy."""
        try:
            response = requests.get(f"{self.server_url}/health", timeout=5)
            return response.status_code == 200
        except requests.RequestException:
            return False

    def test_repository(self, repo_url: str, language: str, description: str = "") -> TestResult:
        """Test analysis of a single repository.

        POSTs the repo URL to the server's /analyze endpoint and converts
        the response (or failure) into a TestResult; never raises.
        """
        # NOTE(review): `description` is currently unused here — kept for
        # call-site symmetry with the (url, description) repo tuples.
        repo_name = repo_url.split('/')[-1]
        start_time = time.time()

        try:
            response = requests.post(
                f"{self.server_url}/analyze",
                json={"github_url": repo_url},
                timeout=self.timeout
            )

            duration = time.time() - start_time

            if response.status_code == 200:
                data = response.json()
                summary = data.get("data", {}).get("summary", {})

                functions = summary.get("total_functions", 0)
                calls = summary.get("total_calls", 0)

                # Consider success if we found functions and no errors
                has_errors = "error" in data.get("status", "").lower()
                success = functions > 0 and not has_errors

                return TestResult(
                    repo_name=repo_name,
                    language=language,
                    success=success,
                    functions=functions,
                    calls=calls,
                    duration=duration
                )
            else:
                return TestResult(
                    repo_name=repo_name,
                    language=language,
                    success=False,
                    functions=0,
                    calls=0,
                    error=f"HTTP {response.status_code}",
                    duration=duration
                )

        except requests.exceptions.Timeout:
            return TestResult(
                repo_name=repo_name,
                language=language,
                success=False,
                functions=0,
                calls=0,
                error="Timeout",
                duration=self.timeout
            )
        except Exception as e:
            return TestResult(
                repo_name=repo_name,
                language=language,
                success=False,
                functions=0,
                calls=0,
                error=str(e),
                duration=time.time() - start_time
            )

    def run_tests(self, languages: Optional[List[str]] = None, quick: bool = False, verbose: bool = False) -> Dict:
        """Run integration tests and return detailed results.

        Args:
            languages: Restrict to these language groups (case-insensitive).
            quick: Use the 1-repo-per-language QUICK_TEST_SET.
            verbose: Print per-repository pass/fail details as tests run.
        """

        # Check server health first
        if not self.check_server_health():
            self.console.print("❌ [red]GitProbe server is not running or unhealthy[/red]")
            self.console.print("   Start server with: [cyan]./gitprobe server[/cyan]")
            return {"error": "Server not available"}

        # Select test set
        test_set = self.QUICK_TEST_SET if quick else self.TEST_REPOSITORIES

        # Filter by languages if specified
        if languages:
            test_set = {lang: repos for lang, repos in test_set.items()
                        if lang.lower() in [l.lower() for l in languages]}

        if not test_set:
            self.console.print("❌ [red]No tests to run with current filters[/red]")
            return {"error": "No tests selected"}

        # Display test plan
        total_tests = sum(len(repos) for repos in test_set.values())
        self.console.print(Panel(
            f"🧪 [bold blue]GitProbe Integration Test Suite[/bold blue]\n\n"
            f"Testing {len(test_set)} languages, {total_tests} repositories\n"
            f"Server: {self.server_url}\n"
            f"Timeout: {self.timeout}s per repository",
            title="Test Configuration"
        ))

        # Run tests with progress tracking
        with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            console=self.console
        ) as progress:

            for language, repos in test_set.items():
                lang_task = progress.add_task(f"Testing {language}...", total=len(repos))

                for repo_url, description in repos:
                    repo_name = repo_url.split('/')[-1]
                    progress.update(lang_task, description=f"Testing {language}: {repo_name}")

                    result = self.test_repository(repo_url, language, description)
                    self.results.append(result)

                    if verbose:
                        status = "✅" if result.success else "❌"
                        details = f"({result.functions} functions, {result.calls} calls, {result.duration:.1f}s)"
                        if result.error:
                            details = f"Error: {result.error}"
                        self.console.print(f"  {status} {repo_name}: {details}")

                    progress.advance(lang_task)

        return self._generate_report()

    def _generate_report(self) -> Dict:
        """Generate comprehensive test report.

        Prints a per-language summary table plus failure details to the
        console, and returns the same data as a JSON-serializable dict.
        """
        # Calculate statistics
        total_tests = len(self.results)
        passed_tests = sum(1 for r in self.results if r.success)
        failed_tests = total_tests - passed_tests

        # Group by language
        by_language = {}
        for result in self.results:
            if result.language not in by_language:
                by_language[result.language] = []
            by_language[result.language].append(result)

        # Create summary table
        table = Table(title="📊 Test Results Summary")
        table.add_column("Language", style="cyan", no_wrap=True)
        table.add_column("Passed", style="green", justify="center")
        table.add_column("Failed", style="red", justify="center")
        table.add_column("Success Rate", justify="center")
        table.add_column("Avg Functions", justify="right")
        table.add_column("Total Duration", justify="right")

        overall_success = True
        total_duration = 0

        for language, results in by_language.items():
            passed = sum(1 for r in results if r.success)
            total = len(results)
            failed = total - passed
            success_rate = (passed / total * 100) if total > 0 else 0
            # max(passed, 1) guards against division by zero when all failed
            avg_functions = sum(r.functions for r in results if r.success) / max(passed, 1)
            lang_duration = sum(r.duration for r in results)
            total_duration += lang_duration

            if failed > 0:
                overall_success = False

            status_style = "green" if failed == 0 else "yellow" if passed > 0 else "red"
            table.add_row(
                f"[{status_style}]{language}[/{status_style}]",
                str(passed),
                str(failed),
                f"{success_rate:.0f}%",
                f"{avg_functions:.0f}" if passed > 0 else "0",
                f"{lang_duration:.1f}s"
            )

        self.console.print("\n")
        self.console.print(table)

        # Overall status
        if overall_success:
            self.console.print("\n🎉 [bold green]All analyzers working perfectly![/bold green]")
        elif passed_tests > 0:
            self.console.print(f"\n⚠️  [yellow]Partial success: {passed_tests}/{total_tests} tests passed[/yellow]")
        else:
            self.console.print(f"\n❌ [red]All tests failed - check GitProbe server[/red]")

        # Show failures if any
        failures = [r for r in self.results if not r.success]
        if failures:
            self.console.print(f"\n[red]Failed Tests ({len(failures)}):[/red]")
            for failure in failures:
                error_msg = failure.error or "No functions detected"
                self.console.print(f"  ❌ {failure.language}/{failure.repo_name}: {error_msg}")

        return {
            "total_tests": total_tests,
            "passed": passed_tests,
            "failed": failed_tests,
            "success_rate": (passed_tests / total_tests * 100) if total_tests > 0 else 0,
            "overall_success": overall_success,
            "duration": total_duration,
            "by_language": {
                lang: {
                    "passed": sum(1 for r in results if r.success),
                    "total": len(results),
                    "results": [
                        {
                            "repo": r.repo_name,
                            "success": r.success,
                            "functions": r.functions,
                            "calls": r.calls,
                            "error": r.error,
                            "duration": r.duration
                        }
                        for r in results
                    ]
                }
                for lang, results in by_language.items()
            }
        }


def main():
    """CLI entry point."""
    parser = argparse.ArgumentParser(
        description="GitProbe Integration Test Suite",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python tests/test_integration.py                              # Run all tests
  python tests/test_integration.py --quick                      # Quick test subset
  python tests/test_integration.py --language python            # Test Python only
  python tests/test_integration.py --language python --language rust  # Multiple languages
  python tests/test_integration.py --verbose                    # Detailed output
  python tests/test_integration.py --server http://localhost:9000  # Custom server
        """
    )

    parser.add_argument(
        "--language",
        action="append",
        help="Test specific language(s) only (can be used multiple times)"
    )
    parser.add_argument(
        "--quick",
        action="store_true",
        help="Run quick test subset (1 repo per language)"
    )
    parser.add_argument(
        "--verbose",
        action="store_true",
        help="Show detailed test progress"
    )
    parser.add_argument(
        "--server",
        default="http://localhost:8000",
        help="GitProbe server URL (default: http://localhost:8000)"
    )
    parser.add_argument(
        "--timeout",
        type=int,
        default=120,
        help="Request timeout in seconds (default: 120)"
    )
    parser.add_argument(
        "--json",
        action="store_true",
        help="Output results as JSON"
    )

    args = parser.parse_args()

    # Run tests
    runner = GitProbeIntegrationTests(server_url=args.server, timeout=args.timeout)
    report = runner.run_tests(
        languages=args.language,
        quick=args.quick,
        verbose=args.verbose
    )

    # Output results
    if args.json:
        print(json.dumps(report, indent=2))

    # Exit with error code if tests failed
    if "error" in report:
        sys.exit(1)
    elif not report.get("overall_success", False):
        sys.exit(1)
    else:
        sys.exit(0)


if __name__ == "__main__":
    main()
3 | """ 4 | 5 | import logging 6 | from typing import List, Set, Optional 7 | from pathlib import Path 8 | 9 | from tree_sitter import Parser, Language 10 | import tree_sitter_go 11 | 12 | from gitprobe.models.core import Function, CallRelationship 13 | from gitprobe.core.analysis_limits import AnalysisLimits, create_go_limits 14 | 15 | logger = logging.getLogger(__name__) 16 | 17 | 18 | class TreeSitterGoAnalyzer: 19 | """Go analyzer using tree-sitter for proper AST parsing.""" 20 | 21 | def __init__(self, file_path: str, content: str, limits: Optional[AnalysisLimits] = None): 22 | self.file_path = Path(file_path) 23 | self.content = content 24 | self.functions: List[Function] = [] 25 | self.call_relationships: List[CallRelationship] = [] 26 | self.limits = limits or create_go_limits() 27 | 28 | try: 29 | language_capsule = tree_sitter_go.language() 30 | self.go_language = Language(language_capsule) 31 | self.parser = Parser(self.go_language) 32 | logger.debug(f"Go parser initialized with language object: {type(self.go_language)}") 33 | 34 | test_code = 'package main\nfunc main() { println("test") }' 35 | test_tree = self.parser.parse(bytes(test_code, "utf8")) 36 | if test_tree is None or test_tree.root_node is None: 37 | raise RuntimeError("Parser setup test failed for Go") 38 | logger.debug(f"Go parser test successful - root node type: {test_tree.root_node.type}") 39 | 40 | except Exception as e: 41 | logger.error(f"Failed to initialize Go parser: {e}") 42 | self.parser = None 43 | self.go_language = None 44 | 45 | logger.info(f"TreeSitterGoAnalyzer initialized for {file_path} with limits: {self.limits}") 46 | 47 | def analyze(self) -> None: 48 | """Analyze the Go content and extract functions and call relationships.""" 49 | if not self.limits.start_new_file(): 50 | logger.info(f"Skipping {self.file_path} - global limits reached") 51 | return 52 | 53 | if self.parser is None: 54 | logger.warning(f"Skipping {self.file_path} - parser initialization failed") 55 | 
return 56 | 57 | try: 58 | tree = self.parser.parse(bytes(self.content, "utf8")) 59 | root_node = tree.root_node 60 | 61 | logger.info(f"Parsed AST with root node type: {root_node.type}") 62 | 63 | self._extract_functions(root_node) 64 | 65 | if not self.limits.should_stop(): 66 | self._extract_call_relationships(root_node) 67 | 68 | logger.info( 69 | f"Analysis complete: {len(self.functions)} functions, {len(self.call_relationships)} relationships, {self.limits.nodes_processed} nodes processed" 70 | ) 71 | 72 | except Exception as e: 73 | logger.error(f"Error analyzing Go file {self.file_path}: {e}", exc_info=True) 74 | 75 | def _extract_functions(self, node) -> None: 76 | """Extract all function definitions from the AST.""" 77 | self._traverse_for_functions(node) 78 | self.functions.sort(key=lambda f: f.line_start) 79 | 80 | def _traverse_for_functions(self, node) -> None: 81 | """Recursively traverse AST nodes to find functions.""" 82 | if self.limits.should_stop(): 83 | return 84 | 85 | if node.type == "function_declaration": 86 | func = self._extract_function_declaration(node) 87 | if func and self._should_include_function(func): 88 | if self.limits.can_add_function(): 89 | self.functions.append(func) 90 | if self.limits.add_function(): 91 | return 92 | else: 93 | return 94 | 95 | elif node.type == "method_declaration": 96 | func = self._extract_method_declaration(node) 97 | if func and self._should_include_function(func): 98 | if self.limits.can_add_function(): 99 | self.functions.append(func) 100 | if self.limits.add_function(): 101 | return 102 | else: 103 | return 104 | 105 | elif node.type == "func_literal": 106 | func = self._extract_func_literal(node) 107 | if func and self._should_include_function(func): 108 | if self.limits.can_add_function(): 109 | self.functions.append(func) 110 | if self.limits.add_function(): 111 | return 112 | else: 113 | return 114 | 115 | for child in node.children: 116 | self._traverse_for_functions(child) 117 | if 
self.limits.should_stop(): 118 | break 119 | 120 | def _extract_function_declaration(self, node) -> Optional[Function]: 121 | """Extract regular function declaration: func name() {}""" 122 | try: 123 | name_node = self._find_child_by_type(node, "identifier") 124 | if not name_node: 125 | return None 126 | 127 | func_name = self._get_node_text(name_node) 128 | line_start = node.start_point[0] + 1 129 | line_end = node.end_point[0] + 1 130 | parameters = self._extract_parameters(node) 131 | code_snippet = self._get_node_text(node) 132 | 133 | return Function( 134 | name=func_name, 135 | file_path=str(self.file_path), 136 | line_start=line_start, 137 | line_end=line_end, 138 | parameters=parameters, 139 | docstring=self._extract_docstring(node), 140 | is_method=False, 141 | class_name=None, 142 | code_snippet=code_snippet, 143 | ) 144 | except Exception as e: 145 | logger.warning(f"Error extracting function declaration: {e}") 146 | return None 147 | 148 | def _extract_method_declaration(self, node) -> Optional[Function]: 149 | """Extract method declaration: func (receiver) methodName() {}""" 150 | try: 151 | name_node = self._find_child_by_type(node, "identifier") 152 | if not name_node: 153 | return None 154 | 155 | func_name = self._get_node_text(name_node) 156 | line_start = node.start_point[0] + 1 157 | line_end = node.end_point[0] + 1 158 | parameters = self._extract_parameters(node) 159 | code_snippet = self._get_node_text(node) 160 | receiver_type = self._extract_receiver_type(node) 161 | 162 | return Function( 163 | name=func_name, 164 | file_path=str(self.file_path), 165 | line_start=line_start, 166 | line_end=line_end, 167 | parameters=parameters, 168 | docstring=self._extract_docstring(node), 169 | is_method=True, 170 | class_name=receiver_type, 171 | code_snippet=code_snippet, 172 | ) 173 | except Exception as e: 174 | logger.warning(f"Error extracting method declaration: {e}") 175 | return None 176 | 177 | def _extract_func_literal(self, node) -> 
    def _extract_func_literal(self, node) -> Optional[Function]:
        """Extract an anonymous function/closure: func() {}.

        Go closures have no name, so a stable synthetic one is derived
        from the starting line number.
        """
        try:
            # tree-sitter points are 0-based (row, col); convert to 1-based lines.
            line_start = node.start_point[0] + 1
            line_end = node.end_point[0] + 1
            parameters = self._extract_parameters(node)
            code_snippet = self._get_node_text(node)

            func_name = f"anonymous_func_line_{line_start}"

            return Function(
                name=func_name,
                file_path=str(self.file_path),
                line_start=line_start,
                line_end=line_end,
                parameters=parameters,
                docstring=None,
                is_method=False,
                class_name=None,
                code_snippet=code_snippet,
            )
        except Exception as e:
            logger.warning(f"Error extracting func literal: {e}")
            return None

    def _should_include_function(self, func: Function) -> bool:
        """Determine if a function should be included in the analysis.

        Filters out Go entry points and trivially short bodies to keep the
        resulting call graph focused.
        """
        # Deliberately exclude Go's special entry points from the graph.
        excluded_names = {
            "init",
            "main",
        }

        if func.name.lower() in excluded_names:
            logger.debug(f"Skipping excluded function: {func.name}")
            return False

        # Bodies spanning fewer than 2 lines are usually trivial; skip them.
        if func.line_end - func.line_start < 2:
            logger.debug(f"Skipping short function: {func.name}")
            return False

        # Closures need a bit more substance than named functions to be kept.
        if func.name.startswith("anonymous_func") and func.line_end - func.line_start < 3:
            logger.debug(f"Skipping simple anonymous function: {func.name}")
            return False

        return True

    def _extract_parameters(self, node) -> List[str]:
        """Extract parameter names from a function node.

        Only the first identifier of each parameter declaration is taken;
        variadic parameters are prefixed with "...".
        """
        parameters = []
        params_node = self._find_child_by_type(node, "parameter_list")
        if params_node:
            for child in params_node.children:
                if child.type == "parameter_declaration":
                    param_name = self._find_child_by_type(child, "identifier")
                    if param_name:
                        parameters.append(self._get_node_text(param_name))
                elif child.type == "variadic_parameter_declaration":
                    param_name = self._find_child_by_type(child, "identifier")
                    if param_name:
                        parameters.append(f"...{self._get_node_text(param_name)}")
        return parameters

    def _extract_receiver_type(self, node) -> Optional[str]:
        """Extract the receiver type from a method declaration.

        For `func (r *T) M()`, the first parameter_list child of the
        method_declaration is the receiver; its type node ("T" or "*T")
        is returned as text.
        """
        receiver_node = self._find_child_by_type(node, "parameter_list")
        if receiver_node and receiver_node.children:
            first_param = receiver_node.children[0] if receiver_node.children else None
            if first_param and first_param.type == "parameter_declaration":
                type_nodes = [
                    child
                    for child in first_param.children
                    if child.type in ["type_identifier", "pointer_type"]
                ]
                if type_nodes:
                    return self._get_node_text(type_nodes[0])
        return None

    def _extract_docstring(self, node) -> Optional[str]:
        """Extract a Go doc comment from the function's preceding sibling.

        NOTE(review): only the single immediately-preceding comment node is
        read; a multi-line `//` doc block (separate comment nodes) would
        yield just its last line — confirm against the grammar if that
        matters.
        """
        if node.prev_sibling and node.prev_sibling.type == "comment":
            comment_text = self._get_node_text(node.prev_sibling)
            lines = comment_text.split("\n")
            cleaned_lines = []
            for line in lines:
                line = line.strip()
                if line.startswith("//"):
                    cleaned_lines.append(line[2:].strip())
                elif line.startswith("/*") and line.endswith("*/"):
                    cleaned_lines.append(line[2:-2].strip())
            return "\n".join(cleaned_lines) if cleaned_lines else None
        return None

    def _extract_call_relationships(self, node) -> None:
        """Extract function call relationships from the AST.

        Builds a line -> Function lookup so each call site can be attributed
        to the enclosing (already extracted) function, then walks the tree
        for call expressions.
        """
        # Map every line covered by a known function back to that function.
        func_ranges = {}
        for func in self.functions:
            for line in range(func.line_start, func.line_end + 1):
                func_ranges[line] = func

        self._traverse_for_calls(node, func_ranges)
self.limits.can_add_relationship(): 287 | self.call_relationships.append(call_info) 288 | if self.limits.add_relationship(): 289 | return 290 | else: 291 | return 292 | 293 | for child in node.children: 294 | self._traverse_for_calls(child, func_ranges) 295 | if self.limits.should_stop(): 296 | break 297 | 298 | def _extract_call_from_node(self, node, func_ranges: dict) -> Optional[CallRelationship]: 299 | """Extract call relationship from a call_expression node.""" 300 | try: 301 | call_line = node.start_point[0] + 1 302 | caller_func = func_ranges.get(call_line) 303 | if not caller_func: 304 | return None 305 | 306 | callee_name = self._extract_callee_name(node) 307 | if not callee_name or self._is_builtin_function(callee_name): 308 | return None 309 | 310 | caller_id = f"{self.file_path}:{caller_func.name}" 311 | return CallRelationship( 312 | caller=caller_id, 313 | callee=callee_name, 314 | call_line=call_line, 315 | is_resolved=False, 316 | ) 317 | except Exception as e: 318 | logger.warning(f"Error extracting call relationship: {e}") 319 | return None 320 | 321 | def _extract_callee_name(self, call_node) -> Optional[str]: 322 | """Extract the name of the called function.""" 323 | if call_node.children: 324 | callee_node = call_node.children[0] 325 | 326 | if callee_node.type == "identifier": 327 | return self._get_node_text(callee_node) 328 | elif callee_node.type == "selector_expression": 329 | field_node = self._find_child_by_type(callee_node, "field_identifier") 330 | if field_node: 331 | return self._get_node_text(field_node) 332 | elif callee_node.type == "qualified_type": 333 | name_node = self._find_child_by_type(callee_node, "type_identifier") 334 | if name_node: 335 | return self._get_node_text(name_node) 336 | return None 337 | 338 | def _is_builtin_function(self, name: str) -> bool: 339 | """Check if function name is a Go built-in.""" 340 | builtins = { 341 | "append", 342 | "cap", 343 | "close", 344 | "complex", 345 | "copy", 346 | "delete", 347 
| "imag", 348 | "len", 349 | "make", 350 | "new", 351 | "panic", 352 | "print", 353 | "println", 354 | "real", 355 | "recover", 356 | "fmt", 357 | "log", 358 | "os", 359 | "io", 360 | "strings", 361 | "strconv", 362 | "time", 363 | "context", 364 | "errors", 365 | "sync", 366 | "http", 367 | "json", 368 | "encoding", 369 | "reflect", 370 | "sort", 371 | "math", 372 | "rand", 373 | "crypto", 374 | "hash", 375 | "net", 376 | "url", 377 | "path", 378 | "filepath", 379 | "buffer", 380 | "bytes", 381 | "regexp", 382 | "template", 383 | "html", 384 | "xml", 385 | "sql", 386 | "runtime", 387 | "unsafe", 388 | "atomic", 389 | "testing", 390 | "flag", 391 | "tar", 392 | "zip", 393 | "gzip", 394 | "base64", 395 | "hex", 396 | "pprof", 397 | "debug", 398 | "trace", 399 | "plugin", 400 | } 401 | return name in builtins 402 | 403 | # Helper methods 404 | def _find_child_by_type(self, node, node_type: str): 405 | """Find first child node of specified type.""" 406 | for child in node.children: 407 | if child.type == node_type: 408 | return child 409 | return None 410 | 411 | def _find_children_by_type(self, node, node_type: str): 412 | """Find all child nodes of specified type.""" 413 | return [child for child in node.children if child.type == node_type] 414 | 415 | def _get_node_text(self, node) -> str: 416 | """Get the text content of a node.""" 417 | start_byte = node.start_byte 418 | end_byte = node.end_byte 419 | return self.content.encode("utf8")[start_byte:end_byte].decode("utf8") 420 | 421 | 422 | # Integration functions 423 | def analyze_go_file_treesitter( 424 | file_path: str, content: str, limits: Optional[AnalysisLimits] = None 425 | ) -> tuple[List[Function], List[CallRelationship]]: 426 | """Analyze a Go file using tree-sitter.""" 427 | try: 428 | logger.info(f"Tree-sitter Go analysis for {file_path}") 429 | analyzer = TreeSitterGoAnalyzer(file_path, content, limits) 430 | analyzer.analyze() 431 | logger.info( 432 | f"Found {len(analyzer.functions)} functions, 
{len(analyzer.call_relationships)} calls, {analyzer.limits.nodes_processed} nodes processed" 433 | ) 434 | return analyzer.functions, analyzer.call_relationships 435 | except Exception as e: 436 | logger.error(f"Error in tree-sitter Go analysis for {file_path}: {e}", exc_info=True) 437 | return [], [] 438 | -------------------------------------------------------------------------------- /src/gitprobe/utils/patterns.py: -------------------------------------------------------------------------------- 1 | """ 2 | Code analysis patterns for different programming languages. 3 | 4 | This module contains patterns used to identify entry points, high-connectivity files, 5 | and function definitions across multiple programming languages. 6 | """ 7 | 8 | from typing import List, Dict 9 | 10 | DEFAULT_IGNORE_PATTERNS = { 11 | ".github", 12 | ".vscode", 13 | ".git", 14 | ".gitignore", 15 | ".gitmodules", 16 | ".gitignore", 17 | # Python 18 | "*.pyc", 19 | "*.pyo", 20 | "*.pyd", 21 | "__pycache__", 22 | ".pytest_cache", 23 | ".coverage", 24 | ".tox", 25 | ".nox", 26 | ".mypy_cache", 27 | ".ruff_cache", 28 | ".hypothesis", 29 | "poetry.lock", 30 | "Pipfile.lock", 31 | # JavaScript/FileSystemNode 32 | "node_modules", 33 | "bower_components", 34 | "package-lock.json", 35 | "yarn.lock", 36 | ".npm", 37 | ".yarn", 38 | ".pnpm-store", 39 | "bun.lock", 40 | "bun.lockb", 41 | # Java 42 | "*.class", 43 | "*.jar", 44 | "*.war", 45 | "*.ear", 46 | "*.nar", 47 | ".gradle/", 48 | "build/", 49 | ".settings/", 50 | ".classpath", 51 | "gradle-app.setting", 52 | "*.gradle", 53 | # IDEs and editors / Java 54 | ".project", 55 | # C/C++ 56 | "*.o", 57 | "*.obj", 58 | "*.dll", 59 | "*.dylib", 60 | "*.exe", 61 | "*.lib", 62 | "*.out", 63 | "*.a", 64 | "*.pdb", 65 | # Swift/Xcode 66 | ".build/", 67 | "*.xcodeproj/", 68 | "*.xcworkspace/", 69 | "*.pbxuser", 70 | "*.mode1v3", 71 | "*.mode2v3", 72 | "*.perspectivev3", 73 | "*.xcuserstate", 74 | "xcuserdata/", 75 | ".swiftpm/", 76 | # Ruby 77 | "*.gem", 
# Glob-style filename patterns GitProbe will consider for analysis.
DEFAULT_INCLUDE_PATTERNS = [
    "*.py", "*.js", "*.ts", "*.jsx", "*.tsx",
    "*.java", "*.cpp", "*.c", "*.h", "*.cs",
    "*.go", "*.rs", "*.php", "*.rb",
    "*.swift", "*.kt", "*.scala", "*.clj", "*.hs", "*.ml",
    "*.html", "*.css", "*.scss", "*.sass",
    "*.json", "*.yaml", "*.yml", "*.xml",
    "*.md", "*.txt", "*.toml", "*.cfg", "*.ini",
]

# Maps a file extension to the language label used by the analyzers.
CODE_EXTENSIONS = {
    ".py": "python",
    ".js": "javascript",
    ".ts": "typescript",
    ".jsx": "javascript",
    ".tsx": "typescript",
    ".java": "java",
    ".cpp": "cpp",
    ".cc": "cpp",
    ".cxx": "cpp",
    ".c++": "cpp",
    ".c": "c",
    ".h": "c",  # NOTE: bare .h headers are classified as C, not C++
    ".hpp": "cpp",
    ".hxx": "cpp",
    ".h++": "cpp",
    ".rs": "rust",
    ".go": "go",
    ".php": "php",
    ".rb": "ruby",
    ".swift": "swift",
    ".kt": "kotlin",
    ".scala": "scala",
    ".cs": "csharp",
}

# Entry point file patterns for all supported languages (exact filenames,
# matched case-insensitively by is_entry_point_file).
ENTRY_POINT_PATTERNS = {
    # Python
    "main.py", "app.py", "server.py", "__main__.py", "run.py", "start.py",
    "manage.py", "wsgi.py", "asgi.py", "gunicorn.py",  # Django/Flask patterns
    # JavaScript/TypeScript
    "index.js", "app.js", "server.js", "main.js",
    "index.ts", "app.ts", "server.ts", "main.ts",
    "start.js", "start.ts", "bootstrap.js", "bootstrap.ts",
    "entry.js", "entry.ts",
    # Go
    "main.go", "cmd.go", "server.go", "app.go", "root.go", "start.go",
    # Rust
    "main.rs", "lib.rs", "server.rs", "app.rs", "start.rs", "bin.rs",
    # C/C++
    "main.c", "main.cpp", "main.cc", "main.cxx",
    "app.c", "app.cpp", "start.c", "start.cpp", "entry.c", "entry.cpp",
}

# Additional entry point path patterns (for when filename patterns fail);
# matched as substrings of the lowercased path.
ENTRY_POINT_PATH_PATTERNS = [
    "cmd/main", "cmd/root", "cmd/server",  # Go command patterns
    "src/main", "src/app", "src/server",  # Common src patterns
    "bin/main", "bin/app", "bin/server",  # Binary patterns
    "app/main", "app/server", "app/start",  # App directory patterns
    "scripts/start", "scripts/run",  # Script patterns
]

# Flexible entry point name patterns (partial matches on the filename).
ENTRY_POINT_NAME_PATTERNS = [
    "main", "app", "server", "start", "run", "entry", "bootstrap",
    "init", "cmd", "cli", "daemon", "service", "worker", "launcher",
]

# High connectivity file patterns (files likely to have many function calls);
# matched as substrings of the lowercased filename or path.
HIGH_CONNECTIVITY_PATTERNS = {
    # General patterns
    "router", "controller", "service", "handler", "middleware",
    "api", "core", "engine", "manager", "processor", "client",
    # Language-specific patterns
    "mod", "module",  # Rust modules
    "pkg", "package",  # Go packages
    "lib", "util", "utils", "helper", "helpers",
    # Framework patterns
    "express", "fastapi", "gin", "actix", "rocket",  # Web frameworks
    "db", "database", "model", "entity", "repo", "repository",
    # Additional patterns
    "config", "settings", "constants", "types", "interfaces",
    # Generic library patterns (added for broader coverage)
    "console", "text", "style", "render", "display", "format",
    "parse", "parser", "convert", "transform", "process",
    "table", "tree", "list", "grid", "layout", "widget",
    "color", "theme", "visual", "graphic", "draw", "paint",
    "file", "io", "stream", "buffer", "cache", "store",
    "base", "common", "shared", "global", "main", "index",
}

# Source directory patterns across all languages (substring matches).
SOURCE_DIRECTORY_PATTERNS = [
    "src/", "lib/", "core/", "pkg/",  # General
    "cmd/", "internal/",  # Go specific
    "crates/", "modules/",  # Rust specific
    "include/", "source/",  # C/C++ specific
    "components/", "services/", "utils/",  # Framework patterns
]

# Function definition patterns for quick file scanning; "{name}" is filled
# in by the caller before matching.
FUNCTION_DEFINITION_PATTERNS = {
    "python": ["def {name}"],
    "javascript": ["function {name}", "const {name}", "export {name}"],
    "typescript": ["function {name}", "const {name}", "export {name}"],
    "go": ["func {name}"],
    "rust": ["fn {name}", "pub fn {name}"],
    "c": ["void {name}", "int {name}", "{name}("],
    "cpp": ["void {name}", "int {name}", "{name}("],
    "general": ["{name}("],  # Fallback pattern
}

# Critical function name patterns (lowercased comparison).
CRITICAL_FUNCTION_NAMES = {"main", "index", "app", "server", "start", "init", "run", "new"}

# Export/public function patterns for critical function detection
# (substring matches against the lowercased code snippet).
EXPORT_PATTERNS = [
    # JavaScript/TypeScript exports
    "export default", "module.exports =", "exports.",
    # Rust public functions
    "pub fn main", "pub fn new", "pub fn",
    # Go exported functions (capitalized)
    "func main", "func new",
    # C/C++ main functions
    "int main", "void main", "public static void main",
    # Python special methods
    'if __name__ == "__main__"',
]
def get_function_patterns_for_language(language: str) -> list:
    """
    Get function definition patterns for a specific language.

    Args:
        language: Programming language name

    Returns:
        List of function definition patterns for the language, falling back
        to the generic "{name}(" pattern for unknown languages.
    """
    key = language.lower()
    return FUNCTION_DEFINITION_PATTERNS.get(key, FUNCTION_DEFINITION_PATTERNS["general"])


def is_entry_point_file(filename: str) -> bool:
    """
    Check if a filename matches entry point patterns.

    Args:
        filename: Name of the file to check

    Returns:
        True if the file is likely an entry point
    """
    lowered = filename.lower()

    # Exact, known entry-point filenames win immediately.
    if lowered in ENTRY_POINT_PATTERNS:
        return True

    # Otherwise require a recognized code extension plus a fuzzy name match.
    code_extensions = (".py", ".js", ".ts", ".go", ".rs", ".c", ".cpp")
    if not any(ext in lowered for ext in code_extensions):
        return False
    return any(token in lowered for token in ENTRY_POINT_NAME_PATTERNS)


def is_entry_point_path(filepath: str) -> bool:
    """
    Check if a file path matches entry point path patterns.

    Args:
        filepath: Full path of the file to check

    Returns:
        True if the path suggests an entry point
    """
    lowered = filepath.lower()
    return any(pattern in lowered for pattern in ENTRY_POINT_PATH_PATTERNS)


def has_high_connectivity_potential(filename: str, filepath: str) -> bool:
    """
    Check if a file has high connectivity potential based on name and path.

    Args:
        filename: Name of the file
        filepath: Full path of the file

    Returns:
        True if the file likely has high connectivity
    """
    lowered_name = filename.lower()
    lowered_path = filepath.lower()

    if any(token in lowered_name for token in HIGH_CONNECTIVITY_PATTERNS):
        return True
    if any(token in lowered_path for token in HIGH_CONNECTIVITY_PATTERNS):
        return True
    # Living under a conventional source directory also counts.
    return any(prefix in lowered_path for prefix in SOURCE_DIRECTORY_PATTERNS)


def is_critical_function(func_name: str, code_snippet: str = None) -> bool:
    """
    Check if a function is critical based on name and code patterns.

    Args:
        func_name: Name of the function
        code_snippet: Optional code snippet to analyze

    Returns:
        True if the function is considered critical
    """
    if func_name.lower() in CRITICAL_FUNCTION_NAMES:
        return True

    if not code_snippet:
        return False
    # Exported/public definitions are treated as critical too.
    lowered_snippet = code_snippet.lower()
    return any(pattern in lowered_snippet for pattern in EXPORT_PATTERNS)
def find_fallback_entry_points(code_files: List[Dict], max_files: int = 5) -> List[Dict]:
    """
    Find fallback entry points when standard patterns don't match.

    Args:
        code_files: List of all code files
        max_files: Maximum number of fallback files to return

    Returns:
        List of files that could serve as entry points
    """
    candidates: List[Dict] = []

    for file_info in code_files:
        lowered_name = file_info["name"].lower()
        lowered_path = file_info["path"].lower()

        # Any main-like filename qualifies directly.
        if any(token in lowered_name for token in ("main", "app", "server", "start", "index")):
            candidates.append(file_info)
        # Otherwise fall back to entry-point path heuristics.
        elif is_entry_point_path(lowered_path):
            candidates.append(file_info)

    # Last resort: anything in the repo root or one level down.
    if not candidates:
        candidates = [fi for fi in code_files if fi["path"].count("/") <= 1]

    def _priority(file_info: Dict) -> int:
        """Lower score = more likely entry point (used as sort key)."""
        path = file_info["path"].lower()
        name = file_info["name"].lower()

        score = -path.count("/")  # prefer files closer to the root
        if any(token in name for token in ("main", "app", "index")):
            score -= 10  # prefer common entry point names
        if any(ext in name for ext in (".py", ".js", ".go", ".rs")):
            score -= 5  # prefer certain extensions
        return score

    return sorted(candidates, key=_priority)[:max_files]


def find_fallback_connectivity_files(code_files: List[Dict], max_files: int = 10) -> List[Dict]:
    """
    Find fallback high-connectivity files when standard patterns don't match.

    Args:
        code_files: List of all code files
        max_files: Maximum number of fallback files to return

    Returns:
        List of files that likely have good connectivity
    """
    picked: List[Dict] = []

    # Pass 1: anything under a conventional source directory.
    for file_info in code_files:
        lowered_path = file_info["path"].lower()
        if any(prefix in lowered_path for prefix in ("src/", "lib/", "app/", "pkg/", "core/")):
            picked.append(file_info)

    # Pass 2: top up with ordinary source files, skipping test files.
    if len(picked) < max_files:
        source_extensions = (".py", ".js", ".ts", ".go", ".rs", ".c", ".cpp")
        test_markers = ("test", "spec", "_test")
        for file_info in code_files:
            if file_info in picked:
                continue
            lowered_name = file_info["name"].lower()
            if not any(ext in lowered_name for ext in source_extensions):
                continue
            if any(marker in lowered_name for marker in test_markers):
                continue
            picked.append(file_info)

    return picked[:max_files]
7 | """ 8 | 9 | import logging 10 | from typing import List, Tuple, Dict, Any, Optional, Set 11 | from pathlib import Path 12 | 13 | from tree_sitter import Parser, Language 14 | import tree_sitter_c 15 | import tree_sitter_cpp 16 | 17 | from gitprobe.models.core import Function, CallRelationship 18 | from gitprobe.core.analysis_limits import AnalysisLimits, create_c_cpp_limits 19 | 20 | logger = logging.getLogger(__name__) 21 | 22 | 23 | class TreeSitterCAnalyzer: 24 | """C/C++ analyzer using tree-sitter for proper AST parsing.""" 25 | 26 | def __init__( 27 | self, 28 | file_path: str, 29 | content: str, 30 | language: str = "c", 31 | limits: Optional[AnalysisLimits] = None, 32 | ): 33 | self.file_path = str(file_path) 34 | self.content = content 35 | self.language = language.lower() 36 | self.lines = content.splitlines() 37 | self.functions: List[Function] = [] 38 | self.call_relationships: List[CallRelationship] = [] 39 | self.limits = limits or create_c_cpp_limits() 40 | 41 | is_cpp = ( 42 | self.language == "cpp" 43 | or self.language == "c++" 44 | or Path(file_path).suffix.lower() 45 | in [".cpp", ".cc", ".cxx", ".c++", ".hpp", ".hxx", ".h++"] 46 | ) 47 | 48 | try: 49 | if is_cpp: 50 | language_capsule = tree_sitter_cpp.language() 51 | self.language_obj = Language(language_capsule) 52 | self.parser = Parser(self.language_obj) 53 | logger.debug( 54 | f"C++ parser initialized with language object: {type(self.language_obj)}" 55 | ) 56 | else: 57 | language_capsule = tree_sitter_c.language() 58 | self.language_obj = Language(language_capsule) 59 | self.parser = Parser(self.language_obj) 60 | logger.debug( 61 | f"C parser initialized with language object: {type(self.language_obj)}" 62 | ) 63 | 64 | test_code = "int main() { return 0; }" if not is_cpp else "int main() { return 0; }" 65 | test_tree = self.parser.parse(bytes(test_code, "utf8")) 66 | if test_tree is None or test_tree.root_node is None: 67 | raise RuntimeError(f"Parser setup test failed for 
{self.language.upper()}") 68 | logger.debug(f"Parser test successful - root node type: {test_tree.root_node.type}") 69 | 70 | except Exception as e: 71 | logger.error(f"Failed to initialize {self.language.upper()} parser: {e}") 72 | self.parser = None 73 | self.language_obj = None 74 | 75 | logger.info( 76 | f"TreeSitterCAnalyzer initialized for {file_path} ({self.language.upper()}) with limits: {self.limits}" 77 | ) 78 | 79 | def analyze(self) -> None: 80 | """Analyze C/C++ code using tree-sitter.""" 81 | if not self.limits.start_new_file(): 82 | logger.info(f"Skipping {self.file_path} - global limits reached") 83 | return 84 | 85 | if self.parser is None: 86 | logger.warning(f"Skipping {self.file_path} - parser initialization failed") 87 | return 88 | 89 | try: 90 | logger.debug( 91 | f"Attempting to parse {len(self.content)} bytes of {self.language.upper()} code" 92 | ) 93 | logger.debug(f"Parser language object: {self.language_obj}") 94 | 95 | tree = self.parser.parse(bytes(self.content, "utf8")) 96 | 97 | if tree is None: 98 | raise ValueError("Parser returned None tree") 99 | 100 | root_node = tree.root_node 101 | if root_node is None: 102 | raise ValueError("Tree has no root node") 103 | 104 | logger.info(f"Parsed AST with root node type: {root_node.type}") 105 | 106 | if root_node.has_error: 107 | logger.warning(f"Parse tree contains errors for {self.file_path}") 108 | 109 | self._extract_functions(root_node) 110 | 111 | if not self.limits.should_stop(): 112 | self._extract_calls(root_node) 113 | 114 | logger.info( 115 | f"Tree-sitter {self.language.upper()} analysis complete: " 116 | f"{len(self.functions)} functions, {len(self.call_relationships)} calls, " 117 | f"{self.limits.nodes_processed} nodes processed" 118 | ) 119 | 120 | except Exception as e: 121 | logger.error( 122 | f"Tree-sitter {self.language.upper()} analysis failed for {self.file_path}: {e}", 123 | exc_info=True, 124 | ) 125 | 126 | def _extract_functions(self, node): 127 | """Extract 
function definitions from the AST.""" 128 | if self.limits.should_stop(): 129 | return 130 | 131 | if node.type == "function_definition": 132 | func = self._create_function_from_node(node) 133 | if func: 134 | if self.limits.can_add_function(): 135 | self.functions.append(func) 136 | if self.limits.add_function(): 137 | return 138 | else: 139 | return 140 | elif node.type == "function_declarator": 141 | func = self._create_function_from_declarator(node) 142 | if func: 143 | if self.limits.can_add_function(): 144 | self.functions.append(func) 145 | if self.limits.add_function(): 146 | return 147 | else: 148 | return 149 | elif self.language in ["cpp", "c++"] and node.type in [ 150 | "method_definition", 151 | "constructor_definition", 152 | "destructor_definition", 153 | ]: 154 | func = self._create_method_from_node(node) 155 | if func: 156 | if self.limits.can_add_function(): 157 | self.functions.append(func) 158 | if self.limits.add_function(): 159 | return 160 | else: 161 | return 162 | 163 | for child in node.children: 164 | self._extract_functions(child) 165 | if self.limits.should_stop(): 166 | break 167 | 168 | def _create_function_from_node(self, node) -> Optional[Function]: 169 | """Create a Function object from a function_definition node.""" 170 | try: 171 | declarator = self._find_child_by_type(node, "function_declarator") 172 | if not declarator: 173 | return None 174 | 175 | identifier = self._find_child_by_type(declarator, "identifier") 176 | if not identifier: 177 | return None 178 | 179 | func_name = self._get_node_text(identifier) 180 | 181 | line_start = node.start_point[0] + 1 182 | line_end = node.end_point[0] + 1 183 | 184 | params = self._extract_parameters(declarator) 185 | 186 | code_snippet = self._get_node_text(node) 187 | 188 | is_method = self._is_method(node) 189 | class_name = self._get_class_name(node) if is_method else None 190 | 191 | return Function( 192 | name=func_name, 193 | file_path=self.file_path, 194 | line_start=line_start, 
195 | line_end=line_end, 196 | parameters=params, 197 | code_snippet=code_snippet, 198 | is_method=is_method, 199 | class_name=class_name, 200 | docstring=None, 201 | ) 202 | 203 | except Exception as e: 204 | logger.warning(f"Failed to create function from node: {e}") 205 | return None 206 | 207 | def _create_function_from_declarator(self, node) -> Optional[Function]: 208 | """Create a Function object from a function_declarator node (for declarations).""" 209 | try: 210 | identifier = self._find_child_by_type(node, "identifier") 211 | if not identifier: 212 | return None 213 | 214 | func_name = self._get_node_text(identifier) 215 | 216 | line_start = node.start_point[0] + 1 217 | line_end = node.end_point[0] + 1 218 | 219 | params = self._extract_parameters(node) 220 | 221 | code_snippet = ( 222 | self._get_node_text(node.parent) if node.parent else self._get_node_text(node) 223 | ) 224 | 225 | return Function( 226 | name=func_name, 227 | file_path=self.file_path, 228 | line_start=line_start, 229 | line_end=line_end, 230 | parameters=params, 231 | code_snippet=code_snippet, 232 | is_method=False, 233 | class_name=None, 234 | docstring=None, 235 | ) 236 | 237 | except Exception as e: 238 | logger.warning(f"Failed to create function from declarator: {e}") 239 | return None 240 | 241 | def _create_method_from_node(self, node) -> Optional[Function]: 242 | """Create a Function object from a method_definition node.""" 243 | try: 244 | declarator = self._find_child_by_type(node, "function_declarator") 245 | if not declarator: 246 | return None 247 | 248 | identifier = self._find_child_by_type(declarator, "identifier") 249 | if not identifier: 250 | if node.type == "destructor_definition": 251 | destructor_name = self._find_child_by_type(node, "destructor_name") 252 | if destructor_name: 253 | identifier = self._find_child_by_type(destructor_name, "identifier") 254 | 255 | if not identifier: 256 | return None 257 | 258 | func_name = self._get_node_text(identifier) 259 | 
260 | line_start = node.start_point[0] + 1 261 | line_end = node.end_point[0] + 1 262 | 263 | params = self._extract_parameters(declarator) 264 | 265 | code_snippet = self._get_node_text(node) 266 | 267 | class_name = self._get_class_name(node) 268 | 269 | return Function( 270 | name=func_name, 271 | file_path=self.file_path, 272 | line_start=line_start, 273 | line_end=line_end, 274 | parameters=params, 275 | code_snippet=code_snippet, 276 | is_method=True, 277 | class_name=class_name, 278 | docstring=None, 279 | ) 280 | 281 | except Exception as e: 282 | logger.warning(f"Failed to create method from node: {e}") 283 | return None 284 | 285 | def _extract_parameters(self, declarator_node) -> List[str]: 286 | """Extract parameter names from function declarator.""" 287 | params = [] 288 | 289 | param_list = self._find_child_by_type(declarator_node, "parameter_list") 290 | if param_list: 291 | for child in param_list.children: 292 | if child.type == "parameter_declaration": 293 | param_name = self._extract_parameter_name(child) 294 | if param_name: 295 | params.append(param_name) 296 | 297 | return params 298 | 299 | def _extract_parameter_name(self, param_node) -> Optional[str]: 300 | """Extract parameter name from parameter_declaration node.""" 301 | for child in param_node.children: 302 | if child.type == "identifier": 303 | return self._get_node_text(child) 304 | elif child.type in ["pointer_declarator", "array_declarator"]: 305 | identifier = self._find_child_by_type(child, "identifier") 306 | if identifier: 307 | return self._get_node_text(identifier) 308 | return None 309 | 310 | def _extract_calls(self, node): 311 | """Extract function calls from the AST.""" 312 | if self.limits.should_stop(): 313 | return 314 | 315 | if node.type == "call_expression": 316 | self._process_call_expression(node) 317 | if self.limits.increment(): 318 | return 319 | 320 | for child in node.children: 321 | self._extract_calls(child) 322 | if self.limits.should_stop(): 323 | break 
324 | 325 | def _process_call_expression(self, node): 326 | """Process a call_expression node to extract call relationships.""" 327 | try: 328 | function_node = node.children[0] if node.children else None 329 | if not function_node: 330 | return 331 | 332 | callee_name = None 333 | 334 | if function_node.type == "identifier": 335 | callee_name = self._get_node_text(function_node) 336 | elif function_node.type == "field_expression": 337 | field = self._find_child_by_type(function_node, "field_identifier") 338 | if field: 339 | callee_name = self._get_node_text(field) 340 | elif function_node.type == "scoped_identifier": 341 | identifier = self._find_child_by_type(function_node, "identifier") 342 | if identifier: 343 | callee_name = self._get_node_text(identifier) 344 | 345 | if callee_name and not self._is_builtin_function(callee_name): 346 | containing_func = self._find_containing_function(node.start_point[0] + 1) 347 | if containing_func and containing_func.name != callee_name: 348 | call_line = node.start_point[0] + 1 349 | 350 | relationship = CallRelationship( 351 | caller=f"{self.file_path}:{containing_func.name}", 352 | callee=callee_name, 353 | call_line=call_line, 354 | is_resolved=False, 355 | ) 356 | if self.limits.can_add_relationship(): 357 | self.call_relationships.append(relationship) 358 | self.limits.add_relationship() 359 | 360 | except Exception as e: 361 | logger.warning(f"Failed to process call expression: {e}") 362 | 363 | def _find_containing_function(self, line_number: int) -> Optional[Function]: 364 | """Find the function that contains the given line number.""" 365 | for func in self.functions: 366 | if func.line_start is not None and func.line_end is not None: 367 | if func.line_start <= line_number <= func.line_end: 368 | return func 369 | return None 370 | 371 | def _is_method(self, node) -> bool: 372 | """Check if the function is a method (inside a class/struct).""" 373 | parent = node.parent 374 | while parent: 375 | if parent.type in 
["class_specifier", "struct_specifier"]: 376 | return True 377 | parent = parent.parent 378 | return False 379 | 380 | def _get_class_name(self, node) -> Optional[str]: 381 | """Get the class name containing this method.""" 382 | parent = node.parent 383 | while parent: 384 | if parent.type in ["class_specifier", "struct_specifier"]: 385 | for child in parent.children: 386 | if child.type == "type_identifier": 387 | return self._get_node_text(child) 388 | parent = parent.parent 389 | return None 390 | 391 | def _is_builtin_function(self, name: str) -> bool: 392 | """Check if function name is a C/C++ built-in.""" 393 | builtins = { 394 | "printf", 395 | "scanf", 396 | "malloc", 397 | "free", 398 | "calloc", 399 | "realloc", 400 | "strlen", 401 | "strcpy", 402 | "strcmp", 403 | "strcat", 404 | "memcpy", 405 | "memset", 406 | "exit", 407 | "abort", 408 | "assert", 409 | "sizeof", 410 | } 411 | return name in builtins 412 | 413 | def _find_child_by_type(self, node, target_type: str): 414 | """Find the first child node of the specified type.""" 415 | for child in node.children: 416 | if child.type == target_type: 417 | return child 418 | return None 419 | 420 | def _get_node_text(self, node) -> str: 421 | """Get the text content of a node.""" 422 | return self.content[node.start_byte : node.end_byte] 423 | 424 | 425 | def analyze_c_file_treesitter( 426 | file_path: str, content: str, limits: Optional[AnalysisLimits] = None 427 | ) -> Tuple[List[Function], List[CallRelationship]]: 428 | """ 429 | Analyze a C file using Tree-sitter. 
430 | 431 | Args: 432 | file_path: Path to the C file 433 | content: Content of the C file 434 | limits: Analysis limits 435 | 436 | Returns: 437 | Tuple of (functions, call_relationships) 438 | """ 439 | try: 440 | logger.info(f"Tree-sitter C analysis for {file_path}") 441 | if limits is None: 442 | limits = create_c_cpp_limits() 443 | analyzer = TreeSitterCAnalyzer(file_path, content, language="c", limits=limits) 444 | analyzer.analyze() 445 | logger.info( 446 | f"Found {len(analyzer.functions)} functions, {len(analyzer.call_relationships)} calls, {analyzer.limits.nodes_processed} nodes processed" 447 | ) 448 | return analyzer.functions, analyzer.call_relationships 449 | except Exception as e: 450 | logger.error(f"Error in tree-sitter C analysis for {file_path}: {e}", exc_info=True) 451 | return [], [] 452 | 453 | 454 | def analyze_cpp_file_treesitter( 455 | file_path: str, content: str, limits: Optional[AnalysisLimits] = None 456 | ) -> Tuple[List[Function], List[CallRelationship]]: 457 | """ 458 | Analyze a C++ file using Tree-sitter. 
459 | 460 | Args: 461 | file_path: Path to the C++ file 462 | content: Content of the C++ file 463 | limits: Analysis limits 464 | 465 | Returns: 466 | Tuple of (functions, call_relationships) 467 | """ 468 | try: 469 | logger.info(f"Tree-sitter C++ analysis for {file_path}") 470 | if limits is None: 471 | limits = create_c_cpp_limits() 472 | analyzer = TreeSitterCAnalyzer(file_path, content, language="cpp", limits=limits) 473 | analyzer.analyze() 474 | logger.info( 475 | f"Found {len(analyzer.functions)} functions, {len(analyzer.call_relationships)} calls, {analyzer.limits.nodes_processed} nodes processed" 476 | ) 477 | return analyzer.functions, analyzer.call_relationships 478 | except Exception as e: 479 | logger.error(f"Error in tree-sitter C++ analysis for {file_path}: {e}", exc_info=True) 480 | return [], [] 481 | 482 | 483 | def analyze_c_file( 484 | file_path: str, content: str, limits: Optional[AnalysisLimits] = None 485 | ) -> Tuple[List[Function], List[CallRelationship]]: 486 | """Main entry point for C file analysis.""" 487 | return analyze_c_file_treesitter(file_path, content, limits) 488 | 489 | 490 | def analyze_cpp_file( 491 | file_path: str, content: str, limits: Optional[AnalysisLimits] = None 492 | ) -> Tuple[List[Function], List[CallRelationship]]: 493 | """Main entry point for C++ file analysis.""" 494 | return analyze_cpp_file_treesitter(file_path, content, limits) 495 | -------------------------------------------------------------------------------- /src/gitprobe/analyzers/javascript.py: -------------------------------------------------------------------------------- 1 | """ 2 | Advanced JavaScript/TypeScript analyzer using Tree-sitter for accurate AST parsing. 3 | 4 | This module provides proper AST-based analysis for JavaScript and TypeScript files, 5 | replacing the regex-based approach with a more accurate tree-sitter implementation. 
6 | """ 7 | 8 | import logging 9 | from typing import List, Set, Optional 10 | from pathlib import Path 11 | 12 | from tree_sitter import Parser, Language 13 | import tree_sitter_javascript 14 | import tree_sitter_typescript 15 | 16 | from gitprobe.models.core import Function, CallRelationship 17 | from gitprobe.core.analysis_limits import AnalysisLimits, create_javascript_limits 18 | 19 | logger = logging.getLogger(__name__) 20 | 21 | 22 | class TreeSitterJSAnalyzer: 23 | """JavaScript analyzer using tree-sitter for proper AST parsing.""" 24 | 25 | def __init__(self, file_path: str, content: str, limits: Optional[AnalysisLimits] = None): 26 | self.file_path = Path(file_path) 27 | self.content = content 28 | self.functions: List[Function] = [] 29 | self.call_relationships: List[CallRelationship] = [] 30 | self.limits = limits or create_javascript_limits() 31 | 32 | try: 33 | language_capsule = tree_sitter_javascript.language() 34 | self.js_language = Language(language_capsule) 35 | self.parser = Parser(self.js_language) 36 | logger.debug( 37 | f"JavaScript parser initialized with language object: {type(self.js_language)}" 38 | ) 39 | 40 | test_code = "function test() { console.log('test'); }" 41 | test_tree = self.parser.parse(bytes(test_code, "utf8")) 42 | if test_tree is None or test_tree.root_node is None: 43 | raise RuntimeError("Parser setup test failed for JavaScript") 44 | logger.debug( 45 | f"JavaScript parser test successful - root node type: {test_tree.root_node.type}" 46 | ) 47 | 48 | except Exception as e: 49 | logger.error(f"Failed to initialize JavaScript parser: {e}") 50 | self.parser = None 51 | self.js_language = None 52 | 53 | logger.info(f"TreeSitterJSAnalyzer initialized for {file_path} with limits: {self.limits}") 54 | 55 | def analyze(self) -> None: 56 | """Analyze the JavaScript content and extract functions and call relationships.""" 57 | if not self.limits.start_new_file(): 58 | logger.info(f"Skipping {self.file_path} - global limits 
reached") 59 | return 60 | 61 | if self.parser is None: 62 | logger.warning(f"Skipping {self.file_path} - parser initialization failed") 63 | return 64 | 65 | try: 66 | tree = self.parser.parse(bytes(self.content, "utf8")) 67 | root_node = tree.root_node 68 | 69 | logger.info(f"Parsed AST with root node type: {root_node.type}") 70 | 71 | self._extract_functions(root_node) 72 | 73 | if not self.limits.should_stop(): 74 | self._extract_call_relationships(root_node) 75 | 76 | logger.info( 77 | f"Analysis complete: {len(self.functions)} functions, {len(self.call_relationships)} relationships, {self.limits.nodes_processed} nodes processed" 78 | ) 79 | 80 | except Exception as e: 81 | logger.error(f"Error analyzing JavaScript file {self.file_path}: {e}", exc_info=True) 82 | 83 | def _extract_functions(self, node) -> None: 84 | """Extract all function definitions from the AST.""" 85 | self._traverse_for_functions(node) 86 | self.functions.sort(key=lambda f: f.line_start) 87 | 88 | def _traverse_for_functions(self, node) -> None: 89 | """Recursively traverse AST nodes to find functions.""" 90 | 91 | if node.type == "function_declaration": 92 | func = self._extract_function_declaration(node) 93 | if func and self._should_include_function(func): 94 | if self.limits.can_add_function(): 95 | self.functions.append(func) 96 | if self.limits.add_function(): 97 | return 98 | else: 99 | return 100 | 101 | elif node.type == "export_statement": 102 | func = self._extract_exported_function(node) 103 | if func and self._should_include_function(func): 104 | if self.limits.can_add_function(): 105 | self.functions.append(func) 106 | if self.limits.add_function(): 107 | return 108 | else: 109 | return 110 | 111 | elif node.type == "lexical_declaration": 112 | func = self._extract_arrow_function_from_declaration(node) 113 | if func and self._should_include_function(func): 114 | if self.limits.can_add_function(): 115 | self.functions.append(func) 116 | if self.limits.add_function(): 117 | 
return 118 | else: 119 | return 120 | 121 | elif node.type == "method_definition": 122 | func = self._extract_method_definition(node) 123 | if func and self._should_include_function(func): 124 | if self.limits.can_add_function(): 125 | self.functions.append(func) 126 | if self.limits.add_function(): 127 | return 128 | else: 129 | return 130 | 131 | elif node.type == "pair": 132 | func = self._extract_object_method(node) 133 | if func and self._should_include_function(func): 134 | if self.limits.can_add_function(): 135 | self.functions.append(func) 136 | if self.limits.add_function(): 137 | return 138 | else: 139 | return 140 | 141 | elif node.type == "assignment_expression": 142 | func = self._extract_assignment_function(node) 143 | if func and self._should_include_function(func): 144 | if self.limits.can_add_function(): 145 | self.functions.append(func) 146 | if self.limits.add_function(): 147 | return 148 | else: 149 | return 150 | 151 | for child in node.children: 152 | if self.limits.should_stop(): 153 | break 154 | self._traverse_for_functions(child) 155 | 156 | def _extract_function_declaration(self, node) -> Optional[Function]: 157 | """Extract regular function declaration: function name() {}""" 158 | try: 159 | name_node = self._find_child_by_type(node, "identifier") 160 | if not name_node: 161 | return None 162 | 163 | func_name = self._get_node_text(name_node) 164 | line_start = node.start_point[0] + 1 165 | line_end = node.end_point[0] + 1 166 | parameters = self._extract_parameters(node) 167 | code_snippet = self._get_node_text(node) 168 | 169 | return Function( 170 | name=func_name, 171 | file_path=str(self.file_path), 172 | line_start=line_start, 173 | line_end=line_end, 174 | parameters=parameters, 175 | docstring=None, 176 | is_method=False, 177 | class_name=None, 178 | code_snippet=code_snippet, 179 | ) 180 | except Exception as e: 181 | logger.warning(f"Error extracting function declaration: {e}") 182 | return None 183 | 184 | def 
_extract_exported_function(self, node) -> Optional[Function]: 185 | """Extract export function or export default function""" 186 | try: 187 | func_decl = self._find_child_by_type(node, "function_declaration") 188 | if func_decl: 189 | func = self._extract_function_declaration(func_decl) 190 | if func: 191 | export_text = self._get_node_text(node) 192 | if "export default" in export_text and "function (" in export_text: 193 | func.name = "default" 194 | return func 195 | except Exception as e: 196 | logger.warning(f"Error extracting exported function: {e}") 197 | return None 198 | 199 | def _extract_arrow_function_from_declaration(self, node) -> Optional[Function]: 200 | """Extract arrow function or function expression from const/let/var declarations""" 201 | try: 202 | for child in node.children: 203 | if child.type == "variable_declarator": 204 | name_node = self._find_child_by_type(child, "identifier") 205 | func_node = self._find_child_by_type( 206 | child, "arrow_function" 207 | ) or self._find_child_by_type(child, "function_expression") 208 | 209 | if name_node and func_node: 210 | func_name = self._get_node_text(name_node) 211 | line_start = func_node.start_point[0] + 1 212 | line_end = func_node.end_point[0] + 1 213 | parameters = self._extract_parameters(func_node) 214 | code_snippet = self._get_node_text(child) 215 | 216 | return Function( 217 | name=func_name, 218 | file_path=str(self.file_path), 219 | line_start=line_start, 220 | line_end=line_end, 221 | parameters=parameters, 222 | docstring=None, 223 | is_method=False, 224 | class_name=None, 225 | code_snippet=code_snippet, 226 | ) 227 | except Exception as e: 228 | logger.warning(f"Error extracting function from declaration: {e}") 229 | return None 230 | 231 | def _extract_method_definition(self, node) -> Optional[Function]: 232 | """Extract class method definition""" 233 | try: 234 | property_name = self._find_child_by_type(node, "property_identifier") 235 | if not property_name: 236 | return None 
237 | 238 | func_name = self._get_node_text(property_name) 239 | line_start = node.start_point[0] + 1 240 | line_end = node.end_point[0] + 1 241 | parameters = self._extract_parameters(node) 242 | code_snippet = self._get_node_text(node) 243 | class_name = self._find_containing_class_name(node) 244 | 245 | return Function( 246 | name=func_name, 247 | file_path=str(self.file_path), 248 | line_start=line_start, 249 | line_end=line_end, 250 | parameters=parameters, 251 | docstring=None, 252 | is_method=True, 253 | class_name=class_name, 254 | code_snippet=code_snippet, 255 | ) 256 | except Exception as e: 257 | logger.warning(f"Error extracting method definition: {e}") 258 | return None 259 | 260 | def _should_include_function(self, func: Function) -> bool: 261 | """Determine if a function should be included in the analysis.""" 262 | excluded_names = { 263 | "constructor", 264 | } 265 | 266 | if func.name.lower() in excluded_names: 267 | logger.debug(f"Skipping excluded function: {func.name}") 268 | return False 269 | 270 | return True 271 | 272 | def _extract_parameters(self, node) -> List[str]: 273 | """Extract parameter names from a function node.""" 274 | parameters = [] 275 | params_node = self._find_child_by_type(node, "formal_parameters") 276 | if params_node: 277 | for child in params_node.children: 278 | if child.type == "identifier": 279 | parameters.append(self._get_node_text(child)) 280 | return parameters 281 | 282 | def _extract_call_relationships(self, node) -> None: 283 | """Extract function call relationships from the AST.""" 284 | func_ranges = {} 285 | for func in self.functions: 286 | for line in range(func.line_start, func.line_end + 1): 287 | func_ranges[line] = func 288 | 289 | self._traverse_for_calls(node, func_ranges) 290 | 291 | def _traverse_for_calls(self, node, func_ranges: dict) -> None: 292 | """Recursively find function calls.""" 293 | 294 | if node.type == "call_expression": 295 | call_info = self._extract_call_from_node(node, 
func_ranges) 296 | if call_info: 297 | if self.limits.can_add_relationship(): 298 | self.call_relationships.append(call_info) 299 | if self.limits.add_relationship(): 300 | return 301 | else: 302 | return 303 | 304 | for child in node.children: 305 | if self.limits.should_stop(): 306 | break 307 | self._traverse_for_calls(child, func_ranges) 308 | 309 | def _extract_call_from_node(self, node, func_ranges: dict) -> Optional[CallRelationship]: 310 | """Extract call relationship from a call_expression node.""" 311 | try: 312 | call_line = node.start_point[0] + 1 313 | caller_func = func_ranges.get(call_line) 314 | 315 | if not caller_func: 316 | return None 317 | 318 | callee_name = self._extract_callee_name(node) 319 | if not callee_name or self._is_builtin_function(callee_name): 320 | return None 321 | 322 | caller_id = f"{self.file_path}:{caller_func.name}" 323 | return CallRelationship( 324 | caller=caller_id, 325 | callee=callee_name, 326 | call_line=call_line, 327 | is_resolved=False, 328 | ) 329 | except Exception as e: 330 | logger.warning(f"Error extracting call relationship: {e}") 331 | return None 332 | 333 | def _extract_callee_name(self, call_node) -> Optional[str]: 334 | """Extract the name of the called function.""" 335 | if call_node.children: 336 | callee_node = call_node.children[0] 337 | 338 | if callee_node.type == "identifier": 339 | return self._get_node_text(callee_node) 340 | elif callee_node.type == "member_expression": 341 | property_node = self._find_child_by_type(callee_node, "property_identifier") 342 | if property_node: 343 | return self._get_node_text(property_node) 344 | return None 345 | 346 | def _is_builtin_function(self, name: str) -> bool: 347 | """Check if function name is a JavaScript built-in.""" 348 | builtins = { 349 | "setTimeout", 350 | "setInterval", 351 | "clearTimeout", 352 | "clearInterval", 353 | "parseInt", 354 | "parseFloat", 355 | "isNaN", 356 | "isFinite", 357 | "encodeURIComponent", 358 | "decodeURIComponent", 359 
| "eval", 360 | "require", 361 | } 362 | return name in builtins 363 | 364 | def _find_child_by_type(self, node, node_type: str): 365 | """Find first child node of specified type.""" 366 | for child in node.children: 367 | if child.type == node_type: 368 | return child 369 | return None 370 | 371 | def _get_node_text(self, node) -> str: 372 | """Get the text content of a node.""" 373 | start_byte = node.start_byte 374 | end_byte = node.end_byte 375 | return self.content.encode("utf8")[start_byte:end_byte].decode("utf8") 376 | 377 | def _find_containing_class_name(self, method_node) -> Optional[str]: 378 | """Find the name of the class containing a method.""" 379 | current = method_node.parent 380 | while current: 381 | if current.type == "class_declaration": 382 | name_node = self._find_child_by_type(current, "identifier") 383 | if name_node: 384 | return self._get_node_text(name_node) 385 | current = current.parent 386 | return None 387 | 388 | def _extract_object_method(self, node) -> Optional[Function]: 389 | """Extract method from object literal: { method() {} } or { method: function() {} }""" 390 | try: 391 | key_node = None 392 | value_node = None 393 | 394 | for child in node.children: 395 | if child.type in ["property_identifier", "identifier"]: 396 | key_node = child 397 | elif child.type in ["function_expression", "arrow_function"]: 398 | value_node = child 399 | elif child.type == "function_signature": 400 | value_node = node 401 | 402 | if key_node and value_node: 403 | func_name = self._get_node_text(key_node) 404 | line_start = value_node.start_point[0] + 1 405 | line_end = value_node.end_point[0] + 1 406 | 407 | if value_node == node: 408 | parameters = self._extract_parameters(node) 409 | else: 410 | parameters = self._extract_parameters(value_node) 411 | 412 | code_snippet = self._get_node_text(node) 413 | 414 | return Function( 415 | name=func_name, 416 | file_path=str(self.file_path), 417 | line_start=line_start, 418 | line_end=line_end, 419 | 
parameters=parameters, 420 | docstring=None, 421 | is_method=False, 422 | class_name=None, 423 | code_snippet=code_snippet, 424 | ) 425 | except Exception as e: 426 | logger.warning(f"Error extracting object method: {e}") 427 | return None 428 | 429 | def _extract_assignment_function(self, node) -> Optional[Function]: 430 | """Extract function from assignment: obj.method = function() {}""" 431 | try: 432 | left_node = None 433 | right_node = None 434 | 435 | for child in node.children: 436 | if child.type in ["member_expression", "identifier"]: 437 | left_node = child 438 | elif child.type in ["function_expression", "arrow_function"]: 439 | right_node = child 440 | 441 | if left_node and right_node: 442 | func_name = self._extract_assignment_name(left_node) 443 | if func_name: 444 | line_start = right_node.start_point[0] + 1 445 | line_end = right_node.end_point[0] + 1 446 | parameters = self._extract_parameters(right_node) 447 | code_snippet = self._get_node_text(node) 448 | 449 | return Function( 450 | name=func_name, 451 | file_path=str(self.file_path), 452 | line_start=line_start, 453 | line_end=line_end, 454 | parameters=parameters, 455 | docstring=None, 456 | is_method=False, 457 | class_name=None, 458 | code_snippet=code_snippet, 459 | ) 460 | except Exception as e: 461 | logger.warning(f"Error extracting assignment function: {e}") 462 | return None 463 | 464 | def _extract_assignment_name(self, node) -> Optional[str]: 465 | """Extract function name from assignment left side.""" 466 | if node.type == "identifier": 467 | return self._get_node_text(node) 468 | elif node.type == "member_expression": 469 | property_node = self._find_child_by_type(node, "property_identifier") 470 | if property_node: 471 | return self._get_node_text(property_node) 472 | return None 473 | 474 | 475 | class TreeSitterTSAnalyzer(TreeSitterJSAnalyzer): 476 | """TypeScript analyzer using tree-sitter.""" 477 | 478 | def __init__(self, file_path: str, content: str, limits: 
# Integration functions
def analyze_javascript_file_treesitter(
    file_path: str, content: str, limits: Optional[AnalysisLimits] = None
) -> tuple[List[Function], List[CallRelationship]]:
    """Analyze a JavaScript file using tree-sitter."""
    try:
        logger.info(f"Tree-sitter JS analysis for {file_path}")
        active_limits = create_javascript_limits() if limits is None else limits
        analyzer = TreeSitterJSAnalyzer(file_path, content, active_limits)
        analyzer.analyze()
        logger.info(
            f"Found {len(analyzer.functions)} functions, {len(analyzer.call_relationships)} calls, {active_limits.nodes_processed} nodes processed"
        )
        return analyzer.functions, analyzer.call_relationships
    except Exception as e:
        # Analysis is best-effort per file: report and return an empty result.
        logger.error(f"Error in tree-sitter JS analysis for {file_path}: {e}", exc_info=True)
        return [], []
tree-sitter JS analysis for {file_path}: {e}", exc_info=True) 526 | return [], [] 527 | 528 | 529 | def analyze_typescript_file_treesitter( 530 | file_path: str, content: str, limits: Optional[AnalysisLimits] = None 531 | ) -> tuple[List[Function], List[CallRelationship]]: 532 | """Analyze a TypeScript file using tree-sitter.""" 533 | try: 534 | logger.info(f"Tree-sitter TS analysis for {file_path}") 535 | if limits is None: 536 | limits = create_javascript_limits() 537 | analyzer = TreeSitterTSAnalyzer(file_path, content, limits) 538 | analyzer.analyze() 539 | logger.info( 540 | f"Found {len(analyzer.functions)} functions, {len(analyzer.call_relationships)} calls, {limits.nodes_processed} nodes processed" 541 | ) 542 | return analyzer.functions, analyzer.call_relationships 543 | except Exception as e: 544 | logger.error(f"Error in tree-sitter TS analysis for {file_path}: {e}", exc_info=True) 545 | return [], [] 546 | -------------------------------------------------------------------------------- /src/gitprobe/analysis/call_graph_analyzer.py: -------------------------------------------------------------------------------- 1 | """ 2 | Call Graph Analyzer 3 | 4 | Central orchestrator for multi-language call graph analysis. 5 | Coordinates language-specific analyzers to build comprehensive call graphs 6 | across different programming languages in a repository. 7 | """ 8 | 9 | from pathlib import Path 10 | from typing import Dict, List 11 | import logging 12 | from gitprobe.models.core import Function, CallRelationship 13 | from gitprobe.utils.patterns import CODE_EXTENSIONS 14 | from gitprobe.utils.security import safe_open_text 15 | 16 | logger = logging.getLogger(__name__) 17 | 18 | 19 | class CallGraphAnalyzer: 20 | """ 21 | Multi-language call graph analyzer. 22 | 23 | This analyzer orchestrates language-specific AST analyzers to build 24 | comprehensive call graphs across different programming languages. 
25 | 26 | Supported languages: 27 | - Python (fully supported with AST parsing) 28 | - JavaScript (tree-sitter AST parsing - high accuracy, supports exports/imports) 29 | - TypeScript (tree-sitter AST parsing - high accuracy, supports exports/imports) 30 | - C (fully supported with AST parsing) 31 | - C++ (fully supported with AST parsing) 32 | - Go (fully supported with tree-sitter AST parsing) 33 | - Rust (fully supported with tree-sitter AST parsing) 34 | 35 | Key improvements: 36 | - JavaScript/TypeScript now use tree-sitter for 99%+ accuracy 37 | - Properly handles export/import statements, arrow functions, class methods 38 | - Automatically filters out constructors and other non-useful functions 39 | - Better call relationship detection 40 | """ 41 | 42 | def __init__(self): 43 | """Initialize the call graph analyzer.""" 44 | self.functions: Dict[str, Function] = {} 45 | self.call_relationships: List[CallRelationship] = [] 46 | self.c_cpp_global_counter = None 47 | self.js_global_limits = None 48 | logger.info("CallGraphAnalyzer initialized.") 49 | 50 | def analyze_code_files(self, code_files: List[Dict], base_dir: str) -> Dict: 51 | """ 52 | Relationship-maximizing analysis: Analyze all files to build complete call graph, 53 | then return the most connected 800-1000 nodes for optimal frontend rendering. 54 | 55 | This approach: 56 | 1. Analyzes all code files (within limits) 57 | 2. Extracts all functions and relationships 58 | 3. Builds complete call graph 59 | 4. Ranks nodes by connectivity (degree centrality) 60 | 5. 
Returns top 800-1000 most connected nodes 61 | """ 62 | logger.info(f"Starting relationship-maximizing analysis of {len(code_files)} files") 63 | 64 | self.functions = {} 65 | self.call_relationships = [] 66 | 67 | from gitprobe.core.analysis_limits import reset_global_tracker 68 | 69 | reset_global_tracker() 70 | 71 | from gitprobe.core.analysis_limits import ( 72 | create_python_limits, 73 | create_javascript_limits, 74 | create_go_limits, 75 | create_rust_limits, 76 | create_c_cpp_limits, 77 | ) 78 | 79 | self.limits = { 80 | "python": create_python_limits(), 81 | "javascript": create_javascript_limits(), 82 | "typescript": create_javascript_limits(), 83 | "go": create_go_limits(), 84 | "rust": create_rust_limits(), 85 | "c": create_c_cpp_limits(), 86 | "cpp": create_c_cpp_limits(), 87 | } 88 | 89 | logger.info("Analyzing all code files to maximize relationships") 90 | files_analyzed = 0 91 | for file_info in code_files: 92 | from gitprobe.core.analysis_limits import get_global_tracker 93 | 94 | global_tracker = get_global_tracker() 95 | if global_tracker.should_stop(): 96 | logger.info(f"Global limits reached after {files_analyzed} files") 97 | break 98 | 99 | logger.debug(f"Analyzing: {file_info['path']}") 100 | self._analyze_code_file(base_dir, file_info) 101 | files_analyzed += 1 102 | 103 | if files_analyzed % 20 == 0: 104 | logger.info( 105 | f"Progress: {files_analyzed} files, {len(self.functions)} functions, {len(self.call_relationships)} relationships" 106 | ) 107 | 108 | logger.info( 109 | f"Analysis complete: {files_analyzed} files analyzed, {len(self.functions)} functions, {len(self.call_relationships)} relationships" 110 | ) 111 | 112 | logger.info("Resolving call relationships") 113 | self._resolve_call_relationships() 114 | self._deduplicate_relationships() 115 | 116 | logger.info("Selecting most connected nodes for frontend") 117 | self._select_most_connected_nodes(target_count=900) 118 | 119 | logger.info("Generating visualization data") 120 | 
viz_data = self._generate_visualization_data() 121 | 122 | return { 123 | "call_graph": { 124 | "total_functions": len(self.functions), 125 | "total_calls": len(self.call_relationships), 126 | "languages_found": list(set(f.get("language") for f in code_files)), 127 | "files_analyzed": files_analyzed, 128 | "analysis_approach": "relationship_maximizing", 129 | }, 130 | "functions": [func.dict() for func in self.functions.values()], 131 | "relationships": [rel.dict() for rel in self.call_relationships], 132 | "visualization": viz_data, 133 | } 134 | 135 | def extract_code_files(self, file_tree: Dict) -> List[Dict]: 136 | """ 137 | Extract code files from file tree structure. 138 | 139 | Filters files based on supported extensions and excludes test/config files. 140 | 141 | Args: 142 | file_tree: Nested dictionary representing file structure 143 | 144 | Returns: 145 | List of code file information dictionaries 146 | """ 147 | code_files = [] 148 | 149 | def traverse(tree): 150 | if tree["type"] == "file": 151 | ext = tree.get("extension", "").lower() 152 | if ext in CODE_EXTENSIONS: 153 | name = tree["name"].lower() 154 | if not any(skip in name for skip in ["test", "spec", "config", "setup"]): 155 | code_files.append( 156 | { 157 | "path": tree["path"], 158 | "name": tree["name"], 159 | "extension": ext, 160 | "language": CODE_EXTENSIONS[ext], 161 | } 162 | ) 163 | elif tree["type"] == "directory" and tree.get("children"): 164 | for child in tree["children"]: 165 | traverse(child) 166 | 167 | traverse(file_tree) 168 | return code_files 169 | 170 | def _analyze_code_file(self, repo_dir: str, file_info: Dict): 171 | """ 172 | Analyze a single code file based on its language. 173 | 174 | Routes to appropriate language-specific analyzer. 
175 | 176 | Args: 177 | repo_dir: Repository directory path 178 | file_info: File information dictionary 179 | """ 180 | # file_path = Path(repo_dir) / file_info["path"] 181 | 182 | # logger.debug(f"Reading content of {file_path}") 183 | # try: 184 | # with open(file_path, "r", encoding="utf-8", errors="ignore") as f: 185 | # content = f.read() 186 | base = Path(repo_dir) 187 | file_path = base / file_info["path"] 188 | logger.debug(f"Reading content of {file_path}") 189 | try: 190 | content = safe_open_text(base, file_path) 191 | language = file_info["language"] 192 | logger.info(f"Analyzing {language} file: {file_path}") 193 | if language == "python": 194 | self._analyze_python_file(file_path, content) 195 | elif language == "javascript": 196 | self._analyze_javascript_file(file_path, content) 197 | elif language == "typescript": 198 | self._analyze_typescript_file(file_path, content) 199 | elif language == "c": 200 | self._analyze_c_file(file_path, content) 201 | elif language == "cpp": 202 | self._analyze_cpp_file(file_path, content) 203 | elif language == "go": 204 | self._analyze_go_file(file_path, content) 205 | elif language == "rust": 206 | self._analyze_rust_file(file_path, content) 207 | else: 208 | logger.warning( 209 | f"Unsupported language for call graph analysis: {language} for file {file_path}" 210 | ) 211 | 212 | except Exception as e: 213 | logger.error(f"⚠️ Error analyzing {file_path}: {str(e)}") 214 | 215 | def _analyze_python_file(self, file_path: str, content: str): 216 | """ 217 | Analyze Python file using Python AST analyzer. 
218 | 219 | Args: 220 | file_path: Relative path to the Python file 221 | content: File content string 222 | """ 223 | from gitprobe.analyzers.python import analyze_python_file 224 | 225 | try: 226 | functions, relationships = analyze_python_file( 227 | file_path, content, self.limits["python"] 228 | ) 229 | logger.info( 230 | f"Found {len(functions)} functions and {len(relationships)} relationships in {file_path}" 231 | ) 232 | 233 | for func in functions: 234 | func_id = f"{file_path}:{func.name}" 235 | self.functions[func_id] = func 236 | 237 | self.call_relationships.extend(relationships) 238 | except Exception as e: 239 | logger.error(f"Failed to analyze Python file {file_path}: {e}", exc_info=True) 240 | 241 | def _analyze_javascript_file(self, file_path: str, content: str): 242 | """ 243 | Analyze JavaScript file using tree-sitter based AST analyzer with global limits. 244 | 245 | Args: 246 | file_path: Relative path to the JavaScript file 247 | content: File content string 248 | """ 249 | try: 250 | logger.info(f"Starting tree-sitter JavaScript analysis for {file_path}") 251 | 252 | from gitprobe.analyzers.javascript import analyze_javascript_file_treesitter 253 | 254 | functions, relationships = analyze_javascript_file_treesitter( 255 | file_path, content, self.limits["javascript"] 256 | ) 257 | 258 | logger.info( 259 | f"Tree-sitter JavaScript analysis completed for {file_path}: {len(functions)} functions, {len(relationships)} relationships" 260 | ) 261 | 262 | for func in functions: 263 | func_id = f"{file_path}:{func.name}" 264 | self.functions[func_id] = func 265 | 266 | self.call_relationships.extend(relationships) 267 | 268 | except Exception as e: 269 | logger.error(f"Failed to analyze JavaScript file {file_path}: {e}", exc_info=True) 270 | 271 | def _analyze_typescript_file(self, file_path: str, content: str): 272 | """ 273 | Analyze TypeScript file using tree-sitter based AST analyzer with global limits. 
274 | 275 | Args: 276 | file_path: Relative path to the TypeScript file 277 | content: File content string 278 | """ 279 | try: 280 | logger.info(f"Starting tree-sitter TypeScript analysis for {file_path}") 281 | 282 | from gitprobe.analyzers.javascript import analyze_typescript_file_treesitter 283 | 284 | functions, relationships = analyze_typescript_file_treesitter( 285 | file_path, content, self.limits["typescript"] 286 | ) 287 | 288 | logger.info( 289 | f"Tree-sitter TypeScript analysis completed for {file_path}: {len(functions)} functions, {len(relationships)} relationships" 290 | ) 291 | 292 | for func in functions: 293 | func_id = f"{file_path}:{func.name}" 294 | self.functions[func_id] = func 295 | 296 | self.call_relationships.extend(relationships) 297 | 298 | except Exception as e: 299 | logger.error(f"Failed to analyze TypeScript file {file_path}: {e}", exc_info=True) 300 | 301 | def _analyze_c_file(self, file_path: str, content: str): 302 | """ 303 | Analyze C file using tree-sitter based analyzer. 304 | 305 | Args: 306 | file_path: Relative path to the C file 307 | content: File content string 308 | """ 309 | from gitprobe.analyzers.c_cpp import analyze_c_file_treesitter 310 | 311 | functions, relationships = analyze_c_file_treesitter(file_path, content, self.limits["c"]) 312 | 313 | for func in functions: 314 | func_id = f"{file_path}:{func.name}" 315 | self.functions[func_id] = func 316 | 317 | self.call_relationships.extend(relationships) 318 | 319 | def _analyze_cpp_file(self, file_path: str, content: str): 320 | """ 321 | Analyze C++ file using tree-sitter based analyzer. 
322 | 323 | Args: 324 | file_path: Relative path to the C++ file 325 | content: File content string 326 | """ 327 | from gitprobe.analyzers.c_cpp import analyze_cpp_file_treesitter 328 | 329 | functions, relationships = analyze_cpp_file_treesitter( 330 | file_path, content, self.limits["cpp"] 331 | ) 332 | 333 | for func in functions: 334 | func_id = f"{file_path}:{func.name}" 335 | self.functions[func_id] = func 336 | 337 | self.call_relationships.extend(relationships) 338 | 339 | def _analyze_go_file(self, file_path: str, content: str): 340 | """ 341 | Analyze Go file using Go AST analyzer. 342 | 343 | Args: 344 | file_path: Relative path to the Go file 345 | content: File content string 346 | """ 347 | from gitprobe.analyzers.go import analyze_go_file_treesitter 348 | 349 | try: 350 | functions, relationships = analyze_go_file_treesitter( 351 | file_path, content, self.limits["go"] 352 | ) 353 | logger.info( 354 | f"Found {len(functions)} functions and {len(relationships)} relationships in {file_path}" 355 | ) 356 | 357 | for func in functions: 358 | func_id = f"{file_path}:{func.name}" 359 | self.functions[func_id] = func 360 | 361 | self.call_relationships.extend(relationships) 362 | except Exception as e: 363 | logger.error(f"Failed to analyze Go file {file_path}: {e}", exc_info=True) 364 | 365 | def _analyze_rust_file(self, file_path: str, content: str): 366 | """ 367 | Analyze Rust file using Rust AST analyzer. 
368 | 369 | Args: 370 | file_path: Relative path to the Rust file 371 | content: File content string 372 | """ 373 | from gitprobe.analyzers.rust import analyze_rust_file_treesitter 374 | 375 | try: 376 | functions, relationships = analyze_rust_file_treesitter( 377 | file_path, content, self.limits["rust"] 378 | ) 379 | logger.info( 380 | f"Found {len(functions)} functions and {len(relationships)} relationships in {file_path}" 381 | ) 382 | 383 | for func in functions: 384 | func_id = f"{file_path}:{func.name}" 385 | self.functions[func_id] = func 386 | 387 | self.call_relationships.extend(relationships) 388 | except Exception as e: 389 | logger.error(f"Failed to analyze Rust file {file_path}: {e}", exc_info=True) 390 | 391 | def _resolve_call_relationships(self): 392 | """ 393 | Resolve function call relationships across all languages. 394 | 395 | Attempts to match function calls to actual function definitions, 396 | handling cross-language calls where possible. 397 | """ 398 | logger.info("Building function lookup table for resolving relationships.") 399 | func_lookup = {} 400 | for func_id, func_info in self.functions.items(): 401 | func_lookup[func_info.name] = func_id 402 | 403 | resolved_count = 0 404 | for relationship in self.call_relationships: 405 | callee_name = relationship.callee 406 | 407 | if callee_name in func_lookup: 408 | relationship.callee = func_lookup[callee_name] 409 | relationship.is_resolved = True 410 | resolved_count += 1 411 | elif "." in callee_name: 412 | method_name = callee_name.split(".")[-1] 413 | if method_name in func_lookup: 414 | relationship.callee = func_lookup[method_name] 415 | relationship.is_resolved = True 416 | 417 | logger.info(f"Resolved {resolved_count}/{len(self.call_relationships)} call relationships.") 418 | 419 | def _deduplicate_relationships(self): 420 | """ 421 | Deduplicate call relationships based on caller-callee pairs. 422 | 423 | Removes duplicate relationships while preserving the first occurrence. 
424 | This helps eliminate noise from multiple calls to the same function. 425 | """ 426 | seen = set() 427 | unique_relationships = [] 428 | 429 | for rel in self.call_relationships: 430 | key = (rel.caller, rel.callee) 431 | if key not in seen: 432 | seen.add(key) 433 | unique_relationships.append(rel) 434 | 435 | logger.debug( 436 | f"Removed {len(self.call_relationships) - len(unique_relationships)} duplicate relationships." 437 | ) 438 | self.call_relationships = unique_relationships 439 | 440 | def _generate_visualization_data(self) -> Dict: 441 | """ 442 | Generate visualization data for graph rendering. 443 | 444 | Creates Cytoscape.js compatible graph data with nodes and edges. 445 | 446 | Returns: 447 | Dict: Visualization data with cytoscape elements and summary 448 | """ 449 | logger.info("Generating Cytoscape-compatible visualization data.") 450 | cytoscape_elements = [] 451 | 452 | logger.debug(f"Adding {len(self.functions)} function nodes.") 453 | for func_id, func_info in self.functions.items(): 454 | node_classes = [] 455 | if func_info.is_method: 456 | node_classes.append("node-method") 457 | else: 458 | node_classes.append("node-function") 459 | 460 | file_ext = Path(func_info.file_path).suffix.lower() 461 | if file_ext == ".py": 462 | node_classes.append("lang-python") 463 | elif file_ext == ".js": 464 | node_classes.append("lang-javascript") 465 | elif file_ext == ".ts": 466 | node_classes.append("lang-typescript") 467 | elif file_ext in [".c", ".h"]: 468 | node_classes.append("lang-c") 469 | elif file_ext in [".cpp", ".cc", ".cxx", ".hpp", ".hxx"]: 470 | node_classes.append("lang-cpp") 471 | 472 | cytoscape_elements.append( 473 | { 474 | "data": { 475 | "id": func_id, 476 | "label": func_info.name, 477 | "file": func_info.file_path, 478 | "type": "method" if func_info.is_method else "function", 479 | "language": CODE_EXTENSIONS.get(file_ext, "unknown"), 480 | }, 481 | "classes": " ".join(node_classes), 482 | } 483 | ) 484 | 485 | resolved_rels 
= [r for r in self.call_relationships if r.is_resolved] 486 | logger.debug(f"Adding {len(resolved_rels)} relationship edges.") 487 | for rel in resolved_rels: 488 | cytoscape_elements.append( 489 | { 490 | "data": { 491 | "id": f"{rel.caller}->{rel.callee}", 492 | "source": rel.caller, 493 | "target": rel.callee, 494 | "line": rel.call_line, 495 | }, 496 | "classes": "edge-call", 497 | } 498 | ) 499 | 500 | summary = { 501 | "total_nodes": len(self.functions), 502 | "total_edges": len(resolved_rels), 503 | "unresolved_calls": len(self.call_relationships) - len(resolved_rels), 504 | } 505 | logger.info(f"Visualization data generated: {summary}") 506 | 507 | return { 508 | "cytoscape": {"elements": cytoscape_elements}, 509 | "summary": summary, 510 | } 511 | 512 | def generate_llm_format(self) -> Dict: 513 | """Generate clean format optimized for LLM consumption.""" 514 | return { 515 | "functions": [ 516 | { 517 | "name": func.name, 518 | "file": Path(func.file_path).name, 519 | "purpose": (func.docstring.split("\n")[0] if func.docstring else None), 520 | "parameters": func.parameters, 521 | "is_recursive": func.name 522 | in [ 523 | rel.callee 524 | for rel in self.call_relationships 525 | if rel.caller.endswith(func.name) 526 | ], 527 | } 528 | for func in self.functions.values() 529 | ], 530 | "relationships": { 531 | func.name: { 532 | "calls": [ 533 | rel.callee.split(":")[-1] 534 | for rel in self.call_relationships 535 | if rel.caller.endswith(func.name) and rel.is_resolved 536 | ], 537 | "called_by": [ 538 | rel.caller.split(":")[-1] 539 | for rel in self.call_relationships 540 | if rel.callee.endswith(func.name) and rel.is_resolved 541 | ], 542 | } 543 | for func in self.functions.values() 544 | }, 545 | } 546 | 547 | def _select_most_connected_nodes(self, target_count: int): 548 | """ 549 | Select the most connected nodes from the call graph. 
550 | 551 | Args: 552 | target_count: The number of nodes to select 553 | """ 554 | if len(self.functions) <= target_count: 555 | logger.info( 556 | f"Have {len(self.functions)} functions, target is {target_count} - keeping all" 557 | ) 558 | return 559 | 560 | if not self.call_relationships: 561 | logger.warning("No call relationships found - keeping all functions by name") 562 | func_ids = list(self.functions.keys())[:target_count] 563 | self.functions = {fid: func for fid, func in self.functions.items() if fid in func_ids} 564 | return 565 | 566 | graph = {} 567 | for rel in self.call_relationships: 568 | if rel.caller in self.functions: 569 | if rel.caller not in graph: 570 | graph[rel.caller] = set() 571 | if rel.callee in self.functions: 572 | if rel.callee not in graph: 573 | graph[rel.callee] = set() 574 | 575 | if rel.caller in graph and rel.callee in graph: 576 | graph[rel.caller].add(rel.callee) 577 | graph[rel.callee].add(rel.caller) 578 | 579 | degree_centrality = {} 580 | for func_id in self.functions.keys(): 581 | degree_centrality[func_id] = len(graph.get(func_id, set())) 582 | 583 | sorted_func_ids = sorted(degree_centrality, key=degree_centrality.get, reverse=True) 584 | 585 | selected_func_ids = sorted_func_ids[:target_count] 586 | 587 | original_func_count = len(self.functions) 588 | self.functions = { 589 | fid: func for fid, func in self.functions.items() if fid in selected_func_ids 590 | } 591 | 592 | original_rel_count = len(self.call_relationships) 593 | self.call_relationships = [ 594 | rel 595 | for rel in self.call_relationships 596 | if rel.caller in selected_func_ids and rel.callee in selected_func_ids 597 | ] 598 | 599 | logger.info( 600 | f"Node selection: {original_func_count} -> {len(self.functions)} functions, " 601 | f"{original_rel_count} -> {len(self.call_relationships)} relationships" 602 | ) 603 | logger.info(f"Kept {len(selected_func_ids)} most connected nodes (target: {target_count})") 604 | 
--------------------------------------------------------------------------------