├── src ├── __init__.py └── gitprobe │ ├── __init__.py │ ├── core │ ├── __init__.py │ └── analysis_limits.py │ ├── models │ ├── __init__.py │ ├── analysis.py │ └── core.py │ ├── utils │ ├── __init__.py │ ├── logging_config.py │ ├── security.py │ └── patterns.py │ ├── web │ ├── __init__.py │ └── server.py │ ├── analysis │ ├── __init__.py │ ├── repo_analyzer.py │ ├── cloning.py │ ├── analysis_service.py │ └── call_graph_analyzer.py │ ├── analyzers │ ├── __init__.py │ ├── python.py │ ├── go.py │ ├── c_cpp.py │ └── javascript.py │ ├── __main__.py │ └── cli.py ├── docs └── preview.png ├── tests ├── __init__.py ├── README.md └── test_integration.py ├── requirements.txt ├── gitprobe ├── LICENSE.md ├── DEVELOPMENT.md ├── pyproject.toml ├── .gitignore └── README.md /src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/gitprobe/__init__.py: -------------------------------------------------------------------------------- 1 | # Empty 2 | -------------------------------------------------------------------------------- /src/gitprobe/core/__init__.py: -------------------------------------------------------------------------------- 1 | # Empty 2 | -------------------------------------------------------------------------------- /src/gitprobe/models/__init__.py: -------------------------------------------------------------------------------- 1 | # Empty 2 | -------------------------------------------------------------------------------- /src/gitprobe/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Empty 2 | -------------------------------------------------------------------------------- /src/gitprobe/web/__init__.py: -------------------------------------------------------------------------------- 1 | # Empty 2 | -------------------------------------------------------------------------------- 
def setup_logging():
    """Configure root logging for GitProbe.

    Applies an INFO-level root configuration with a timestamped
    ``name - level - message`` format, writing to stdout.
    """
    config = {
        "level": logging.INFO,
        "format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s",
        "stream": sys.stdout,
    }
    logging.basicConfig(**config)
6 | """ 7 | 8 | import sys 9 | import os 10 | from pathlib import Path 11 | 12 | # Add src directory to Python path 13 | current_dir = Path(__file__).parent 14 | src_dir = current_dir / "src" 15 | if str(src_dir) not in sys.path: 16 | sys.path.insert(0, str(src_dir)) 17 | 18 | if __name__ == "__main__": 19 | from gitprobe.cli import main # type: ignore 20 | main() -------------------------------------------------------------------------------- /src/gitprobe/__main__.py: -------------------------------------------------------------------------------- 1 | """ 2 | GitProbe Package Main Entry Point 3 | 4 | Allows running GitProbe as a module: python -m gitprobe 5 | """ 6 | 7 | import sys 8 | from pathlib import Path 9 | 10 | 11 | def main(): 12 | """Main entry point for running GitProbe server.""" 13 | try: 14 | import uvicorn 15 | 16 | print("🚀 Starting GitProbe Server via package...") 17 | uvicorn.run("gitprobe.web.server:app", host="0.0.0.0", port=8000, reload=True) 18 | except ImportError: 19 | print("❌ uvicorn not installed. 
from pydantic import BaseModel
from typing import List, Dict, Any, Optional
from .core import Function, CallRelationship, Repository


class AnalysisResult(BaseModel):
    """Result of analyzing a repository.

    Aggregates everything a full analysis run produces: repository
    metadata, extracted functions, their call relationships, the file
    tree, and summary/visualization payloads.
    """

    # Repository metadata (url, name, clone_path, analysis_id) — see models.core.
    repository: Repository
    # All functions discovered across the analyzed files.
    functions: List[Function]
    # Caller -> callee edges between the functions above.
    relationships: List[CallRelationship]
    # Nested directory/file structure (cf. RepoAnalyzer.analyze_repository_structure).
    file_tree: Dict[str, Any]
    # Aggregate stats (e.g. total_files, total_size_kb).
    summary: Dict[str, Any]
    # Optional rendering payload; empty dict when not generated.
    visualization: Dict[str, Any] = {}
    # Raw README text when one was found, else None.
    readme_content: Optional[str] = None


class NodeSelection(BaseModel):
    """Selected nodes for partial export"""

    # Node identifiers chosen by the user; empty means nothing selected.
    selected_nodes: List[str] = []
    # Whether edges between the selected nodes are exported too.
    include_relationships: bool = True
    # Optional mapping of node id -> user-provided display name.
    custom_names: Dict[str, str] = {}
from pydantic import BaseModel
from typing import List, Optional, Dict, Any
from datetime import datetime  # NOTE(review): unused in this module — confirm before removing


class Function(BaseModel):
    """A function found in the codebase"""

    # Function name as written in source.
    name: str
    # Source file containing the function — presumably relative to the repo
    # root; confirm against the analyzers that populate it.
    file_path: str
    # Start line of the definition; end line may be unknown for some parsers.
    line_start: int
    line_end: Optional[int] = None
    # Parameter names, when the analyzer extracted them.
    parameters: Optional[List[str]] = None
    docstring: Optional[str] = None
    # True when defined inside a class; class_name then carries the owner.
    is_method: bool = False
    class_name: Optional[str] = None
    # Raw source text of the function, when captured.
    code_snippet: Optional[str] = None
    # Optional user-facing override consumed by get_display_name().
    display_name: Optional[str] = None

    def get_display_name(self) -> str:
        """Get the name to display (custom or original)"""
        return self.display_name or self.name


class CallRelationship(BaseModel):
    """A call relationship between two functions"""

    # Identifier of the calling function.
    caller: str
    # Identifier (or raw name) of the called function.
    callee: str
    # Line where the call occurs, when known.
    call_line: Optional[int] = None
    # Presumably set True once the callee is matched to a known Function —
    # confirm against the call-graph analyzer.
    is_resolved: bool = False


class Repository(BaseModel):
    """Basic repository information"""

    url: str
    name: str
    # Local filesystem path the repository was cloned to.
    clone_path: str
    # Unique identifier for this analysis run.
    analysis_id: str
def assert_safe_path(base_dir: Path, target: Path):
    """Reject *target* if it is a symlink or escapes *base_dir*.

    Raises:
        PermissionError: when target is a symlink, or resolves to a
            location outside base_dir.
    """
    # Block symlinks (file or dir)
    if target.is_symlink():
        raise PermissionError(f"Symlink blocked: {target}")
    # Block paths that escape repo
    if not _inside(base_dir, target):
        raise PermissionError(f"Path escapes repo: {target} -> {target.resolve()}")


def safe_open_text(base_dir: Path, target: Path, encoding="utf-8"):
    """Read *target* as text after validating it stays inside *base_dir*.

    Opens with O_NOFOLLOW where available so a symlink swapped in after
    the check cannot be followed (TOCTOU hardening). Undecodable bytes
    are replaced rather than raising.
    """
    assert_safe_path(base_dir, target)
    flags = os.O_RDONLY
    if hasattr(os, "O_NOFOLLOW"):
        flags |= os.O_NOFOLLOW
    fd = os.open(str(target), flags)
    try:
        f = os.fdopen(fd, "r", encoding=encoding, errors="replace")
    except Exception:
        # fdopen never took ownership of fd; close it ourselves.
        os.close(fd)
        raise
    # Once fdopen succeeds, the file object owns fd. The original code also
    # ran os.close(fd) in a finally block — a double close that, under
    # threads, could close an unrelated descriptor reused for the same
    # number. Closing only via the file object fixes that.
    with f:
        return f.read()
Run GitProbe 20 | 21 | Once installed, you can use GitProbe from anywhere: 22 | 23 | ```bash 24 | # Analyze a repository 25 | gitprobe analyze microsoft/vscode 26 | 27 | # Start the server 28 | gitprobe server 29 | 30 | # Start server with custom settings 31 | gitprobe server --port 8080 --reload 32 | ``` 33 | 34 | ## Alternative Development Setup 35 | 36 | If you prefer not to install the package: 37 | 38 | ```bash 39 | # Set Python path and run directly 40 | PYTHONPATH=src python -m gitprobe.cli analyze user/repo 41 | PYTHONPATH=src python -m gitprobe.web.server 42 | ``` 43 | 44 | ## Project Structure 45 | 46 | ``` 47 | gitprobe/ 48 | ├── src/gitprobe/ # Main package 49 | │ ├── analyzers/ # Language-specific analyzers 50 | │ ├── analysis/ # Business logic & orchestration 51 | │ ├── core/ # Shared utilities 52 | │ ├── models/ # Data models 53 | │ ├── utils/ # Helper functions 54 | │ ├── web/ # FastAPI server 55 | │ └── cli.py # Command-line interface 56 | ├── pyproject.toml # Package configuration 57 | ├── requirements.txt # Dependencies 58 | └── README.md # User documentation 59 | ``` 60 | 61 | ## Development Commands 62 | 63 | ```bash 64 | # Run tests 65 | pytest 66 | 67 | # Format code 68 | black src/ 69 | isort src/ 70 | 71 | # Type checking 72 | mypy src/ 73 | 74 | # Install pre-commit hooks 75 | pre-commit install 76 | ``` 77 | 78 | ## Adding New Languages 79 | 80 | 1. Create analyzer in `src/gitprobe/analyzers/` 81 | 2. Add language limits in `src/gitprobe/core/analysis_limits.py` 82 | 3. Update `src/gitprobe/analysis/call_graph_analyzer.py` 83 | 4. 
Add tests and documentation -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0", "wheel"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "gitprobe" 7 | version = "0.1.0" 8 | description = "Advanced repository analysis tool with multi-language call graph generation" 9 | readme = "README.md" 10 | requires-python = ">=3.8" 11 | license = {text = "MIT"} 12 | authors = [ 13 | {name = "GitProbe Contributors"} 14 | ] 15 | keywords = ["code-analysis", "call-graph", "ast", "repository-analysis"] 16 | classifiers = [ 17 | "Development Status :: 4 - Beta", 18 | "Intended Audience :: Developers", 19 | "License :: OSI Approved :: MIT License", 20 | "Programming Language :: Python :: 3", 21 | "Programming Language :: Python :: 3.8", 22 | "Programming Language :: Python :: 3.9", 23 | "Programming Language :: Python :: 3.10", 24 | "Programming Language :: Python :: 3.11", 25 | "Programming Language :: Python :: 3.12", 26 | "Topic :: Software Development :: Code Generators", 27 | "Topic :: Software Development :: Libraries :: Python Modules", 28 | ] 29 | 30 | dependencies = [ 31 | "fastapi>=0.104.0", 32 | "uvicorn[standard]>=0.24.0", 33 | "pydantic>=2.0.0", 34 | "tree-sitter>=0.20.0,<0.21.0", 35 | "tree-sitter-languages>=1.10.0", 36 | "GitPython>=3.1.0", 37 | ] 38 | 39 | [project.optional-dependencies] 40 | dev = [ 41 | "pytest>=7.0.0", 42 | "pytest-asyncio>=0.21.0", 43 | "black>=23.0.0", 44 | "isort>=5.12.0", 45 | "mypy>=1.5.0", 46 | "pre-commit>=3.4.0", 47 | ] 48 | 49 | [project.scripts] 50 | gitprobe = "gitprobe.cli:main" 51 | gitprobe-server = "gitprobe.web.server:cli_main" 52 | 53 | [project.urls] 54 | Homepage = "https://github.com/yourusername/gitprobe" 55 | Documentation = "https://github.com/yourusername/gitprobe#readme" 56 | Repository = 
"""
GitProbe FastAPI Server

Main web server providing REST API endpoints for repository analysis.
Coordinates between different GitProbe services to provide comprehensive code analysis.
"""

from fastapi import FastAPI, HTTPException
from pydantic import BaseModel, field_validator
from typing import Optional, List

from gitprobe.analysis.analysis_service import AnalysisService
from gitprobe.analysis.cloning import sanitize_github_url

# Application instance served by uvicorn (see cli_main below and gitprobe.cli).
app = FastAPI(
    title="GitProbe API",
    description="Repository analysis API using GitProbe services",
    version="1.0.0",
)


class AnalyzeRequest(BaseModel):
    """Request body shared by the /analyze* endpoints."""

    # Repository to analyze; normalized/validated by the validator below.
    github_url: str
    # Optional filename patterns restricting which files are analyzed.
    include_patterns: Optional[List[str]] = None
    exclude_patterns: Optional[List[str]] = None

    @field_validator("github_url")
    @classmethod
    def sanitize_url(cls, v):
        """Sanitize the URL and reject non-GitHub values."""
        if not v:
            raise ValueError("GitHub URL is required")

        sanitized = sanitize_github_url(v)

        # NOTE(review): substring check — a URL such as
        # "https://evil.example/github.com" would pass; consider validating
        # the parsed hostname instead.
        if "github.com" not in sanitized:
            raise ValueError("Must be a valid GitHub URL")

        return sanitized


class AnalysisResponse(BaseModel):
    """Uniform response envelope: status string plus endpoint payload."""

    status: str
    data: dict
def cli_main():
    """CLI entry point for the ``gitprobe-server`` console script.

    Starts uvicorn on 0.0.0.0:8000 with auto-reload enabled, mirroring
    ``gitprobe.cli.start_server`` and ``gitprobe.__main__``.
    """
    import uvicorn

    # uvicorn requires the application as an import string (not an app
    # object) for reload/workers to take effect; the original passed ``app``
    # directly, which silently disables reloading. The other entry points
    # in this project already use the import-string form.
    uvicorn.run("gitprobe.web.server:app", host="0.0.0.0", port=8000, reload=True)


if __name__ == "__main__":
    cli_main()
--output results.json 65 | gitprobe analyze https://github.com/user/repo --structure-only 66 | gitprobe server --port 8080 67 | """, 68 | ) 69 | 70 | subparsers = parser.add_subparsers(dest="command", help="Available commands") 71 | 72 | # Analyze command 73 | analyze_parser = subparsers.add_parser("analyze", help="Analyze a repository") 74 | analyze_parser.add_argument("url", help="GitHub repository URL or owner/repo") 75 | analyze_parser.add_argument("--output", "-o", help="Output file path") 76 | analyze_parser.add_argument( 77 | "--format", choices=["json", "text"], default="json", help="Output format" 78 | ) 79 | analyze_parser.add_argument("--include", nargs="*", help="File patterns to include") 80 | analyze_parser.add_argument("--exclude", nargs="*", help="File patterns to exclude") 81 | analyze_parser.add_argument( 82 | "--structure-only", action="store_true", help="Analyze structure only (faster)" 83 | ) 84 | 85 | # Server command 86 | server_parser = subparsers.add_parser("server", help="Start the GitProbe server") 87 | server_parser.add_argument("--host", default="0.0.0.0", help="Host to bind to") 88 | server_parser.add_argument("--port", type=int, default=8000, help="Port to bind to") 89 | server_parser.add_argument("--reload", action="store_true", help="Enable auto-reload") 90 | 91 | args = parser.parse_args() 92 | 93 | if not args.command: 94 | parser.print_help() 95 | return 96 | 97 | if args.command == "analyze": 98 | analyze_repo( 99 | url=args.url, 100 | output=args.output, 101 | format=args.format, 102 | include=args.include, 103 | exclude=args.exclude, 104 | structure_only=args.structure_only, 105 | ) 106 | elif args.command == "server": 107 | start_server(host=args.host, port=args.port, reload=args.reload) 108 | 109 | 110 | def start_server(host: str = "0.0.0.0", port: int = 8000, reload: bool = False): 111 | """Start the GitProbe server.""" 112 | try: 113 | import uvicorn 114 | 115 | print(f"🚀 Starting GitProbe server on {host}:{port}") 116 | 
uvicorn.run("gitprobe.web.server:app", host=host, port=port, reload=reload) 117 | except ImportError: 118 | print("❌ uvicorn not installed. Please install with: pip install uvicorn") 119 | sys.exit(1) 120 | 121 | 122 | if __name__ == "__main__": 123 | main() 124 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Python 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | *.so 6 | .Python 7 | build/ 8 | develop-eggs/ 9 | dist/ 10 | downloads/ 11 | eggs/ 12 | .eggs/ 13 | lib/ 14 | lib64/ 15 | parts/ 16 | sdist/ 17 | var/ 18 | wheels/ 19 | *.egg-info/ 20 | .installed.cfg 21 | *.egg 22 | MANIFEST 23 | 24 | # Virtual environments 25 | env/ 26 | venv/ 27 | ENV/ 28 | env.bak/ 29 | venv.bak/ 30 | .venv/ 31 | 32 | # IDE 33 | .vscode/ 34 | .idea/ 35 | *.swp 36 | *.swo 37 | *~ 38 | 39 | # OS 40 | .DS_Store 41 | .DS_Store? 42 | ._* 43 | .Spotlight-V100 44 | .Trashes 45 | ehthumbs.db 46 | Thumbs.db 47 | 48 | # Project specific 49 | *.html 50 | *.svg 51 | *.json 52 | !requirements.txt 53 | !package.json 54 | 55 | # Temporary files 56 | *.tmp 57 | *.temp 58 | temp/ 59 | tmp/ 60 | 61 | # Logs 62 | *.log 63 | logs/ 64 | 65 | # Testing 66 | .coverage 67 | .pytest_cache/ 68 | .tox/ 69 | .nox/ 70 | htmlcov/ 71 | 72 | # Documentation 73 | docs/_build/ 74 | 75 | # Distribution / packaging 76 | .Python 77 | build/ 78 | develop-eggs/ 79 | dist/ 80 | downloads/ 81 | eggs/ 82 | .eggs/ 83 | lib/ 84 | lib64/ 85 | parts/ 86 | sdist/ 87 | var/ 88 | wheels/ 89 | share/python-wheels/ 90 | *.egg-info/ 91 | .installed.cfg 92 | *.egg 93 | MANIFEST 94 | 95 | tmp/* 96 | 97 | # PyInstaller 98 | # Usually these files are written by a python script from a template 99 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
100 | *.manifest 101 | *.spec 102 | 103 | # Installer logs 104 | pip-log.txt 105 | pip-delete-this-directory.txt 106 | 107 | # Unit test / coverage reports 108 | htmlcov/ 109 | .tox/ 110 | .nox/ 111 | .coverage 112 | .coverage.* 113 | .cache 114 | nosetests.xml 115 | coverage.xml 116 | *.cover 117 | *.py,cover 118 | .hypothesis/ 119 | .pytest_cache/ 120 | cover/ 121 | 122 | # Translations 123 | *.mo 124 | *.pot 125 | 126 | # Django stuff: 127 | *.log 128 | local_settings.py 129 | db.sqlite3 130 | db.sqlite3-journal 131 | 132 | # Flask stuff: 133 | instance/ 134 | .webassets-cache 135 | 136 | # Scrapy stuff: 137 | .scrapy 138 | 139 | # PyBuilder 140 | .pybuilder/ 141 | target/ 142 | 143 | # Jupyter Notebook 144 | .ipynb_checkpoints 145 | 146 | # IPython 147 | profile_default/ 148 | ipython_config.py 149 | 150 | # pyenv 151 | # For a library or package, you might want to ignore these files since the code is 152 | # intended to run in multiple environments; otherwise, check them in: 153 | # .python-version 154 | 155 | # pipenv 156 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 157 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 158 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 159 | # install all needed dependencies. 160 | #Pipfile.lock 161 | 162 | # poetry 163 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 164 | # This is especially recommended for binary packages to ensure reproducibility, and is more 165 | # commonly ignored for libraries. 166 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 167 | #poetry.lock 168 | 169 | # pdm 170 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
171 | #pdm.lock 172 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 173 | # in version control. 174 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 175 | .pdm.toml 176 | .pdm-python 177 | .pdm-build/ 178 | 179 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 180 | __pypackages__/ 181 | 182 | # Celery stuff 183 | celerybeat-schedule 184 | celerybeat.pid 185 | 186 | # SageMath parsed files 187 | *.sage.py 188 | 189 | # Environments 190 | .env 191 | .venv 192 | env/ 193 | venv/ 194 | ENV/ 195 | env.bak/ 196 | venv.bak/ 197 | .python-version 198 | 199 | # Spyder project settings 200 | .spyderproject 201 | .spyproject 202 | 203 | # Rope project settings 204 | .ropeproject 205 | 206 | # mkdocs documentation 207 | /site 208 | 209 | # mypy 210 | .mypy_cache/ 211 | .dmypy.json 212 | dmypy.json 213 | 214 | # Pyre type checker 215 | .pyre/ 216 | 217 | # pytype static type analyzer 218 | .pytype/ 219 | 220 | # Cython debug symbols 221 | cython_debug/ 222 | 223 | # PyCharm 224 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 225 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 226 | # and can be added to the global gitignore or merged into this file. For a more nuclear 227 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
228 | #.idea/ 229 | .vscode/settings.json 230 | .DS_Store 231 | 232 | # Project specific 233 | history.txt 234 | cleanup.py 235 | Caddyfile 236 | 237 | # ignore default output directory 238 | tmp/* 239 | 240 | # Gitingest 241 | digest.txt 242 | 243 | .cursor/ -------------------------------------------------------------------------------- /src/gitprobe/analysis/repo_analyzer.py: -------------------------------------------------------------------------------- 1 | """ 2 | Repository Analyzer Module 3 | 4 | This module provides functionality to analyze repository structures and generate 5 | detailed file tree representations with filtering capabilities. 6 | """ 7 | 8 | import os 9 | import fnmatch 10 | import json 11 | from pathlib import Path 12 | from typing import Dict, List, Optional, Union 13 | from gitprobe.utils.patterns import DEFAULT_IGNORE_PATTERNS, DEFAULT_INCLUDE_PATTERNS 14 | 15 | 16 | class RepoAnalyzer: 17 | def __init__( 18 | self, 19 | include_patterns: Optional[List[str]] = None, 20 | exclude_patterns: Optional[List[str]] = None, 21 | ) -> None: 22 | self.include_patterns = ( 23 | include_patterns if include_patterns is not None else DEFAULT_INCLUDE_PATTERNS 24 | ) 25 | self.exclude_patterns = ( 26 | list(DEFAULT_IGNORE_PATTERNS) + exclude_patterns 27 | if exclude_patterns is not None 28 | else list(DEFAULT_IGNORE_PATTERNS) 29 | ) 30 | 31 | def analyze_repository_structure(self, repo_dir: str) -> Dict: 32 | file_tree = self._build_file_tree(repo_dir) 33 | return { 34 | "file_tree": file_tree, 35 | "summary": { 36 | "total_files": self._count_files(file_tree), 37 | "total_size_kb": self._calculate_size(file_tree), 38 | }, 39 | } 40 | 41 | def _build_file_tree(self, repo_dir: str) -> Dict: 42 | def build_tree(path: Path, base_path: Path) -> Optional[Dict]: 43 | relative_path = path.relative_to(base_path) 44 | relative_path_str = str(relative_path) 45 | 46 | # 🚫 Reject symlinks 47 | if path.is_symlink(): 48 | return None 49 | 50 | # 🚫 Reject escaped 
paths (e.g., symlinks pointing outside) 51 | try: 52 | if not path.resolve().is_relative_to(base_path.resolve()): 53 | return None 54 | except AttributeError: 55 | if not str(path.resolve()).startswith(str(base_path.resolve())): 56 | return None 57 | 58 | if self._should_exclude_path(relative_path_str, path.name): 59 | return None 60 | 61 | if path.is_file(): 62 | if not self._should_include_file(relative_path_str, path.name): 63 | return None 64 | 65 | size = path.stat().st_size 66 | return { 67 | "type": "file", 68 | "name": path.name, 69 | "path": relative_path_str, 70 | "extension": path.suffix, 71 | "_size_bytes": size, 72 | } 73 | 74 | elif path.is_dir(): 75 | children = [] 76 | try: 77 | for child in sorted(path.iterdir()): 78 | child_tree = build_tree(child, base_path) 79 | if child_tree is not None: 80 | children.append(child_tree) 81 | except PermissionError: 82 | pass 83 | 84 | if children or str(relative_path) == ".": 85 | return { 86 | "type": "directory", 87 | "name": path.name, 88 | "path": relative_path_str, 89 | "children": children, 90 | } 91 | return None 92 | 93 | # Other types (sockets, devices, etc.) 
94 | return None 95 | 96 | return build_tree(Path(repo_dir), Path(repo_dir)) 97 | 98 | def _should_exclude_path(self, path: str, filename: str) -> bool: 99 | for pattern in self.exclude_patterns: 100 | if fnmatch.fnmatch(path, pattern) or fnmatch.fnmatch(filename, pattern): 101 | return True 102 | if pattern.endswith("/") and path.startswith(pattern.rstrip("/")): 103 | return True 104 | if path.startswith(pattern + "/") or path == pattern: 105 | return True 106 | if pattern in path.split("/"): 107 | return True 108 | return False 109 | 110 | def _should_include_file(self, path: str, filename: str) -> bool: 111 | if not self.include_patterns: 112 | return True 113 | for pattern in self.include_patterns: 114 | if fnmatch.fnmatch(path, pattern) or fnmatch.fnmatch(filename, pattern): 115 | return True 116 | return False 117 | 118 | def _count_files(self, tree: Dict) -> int: 119 | if tree["type"] == "file": 120 | return 1 121 | return sum(self._count_files(child) for child in tree.get("children", [])) 122 | 123 | def _calculate_size(self, tree: Dict) -> float: 124 | if tree["type"] == "file": 125 | return tree.get("_size_bytes", 0) / 1024 126 | return sum(self._calculate_size(child) for child in tree.get("children", [])) 127 | -------------------------------------------------------------------------------- /tests/README.md: -------------------------------------------------------------------------------- 1 | # GitProbe Integration Tests 2 | 3 | Comprehensive integration test suite for GitProbe's tree-sitter language analyzers. Tests real-world repositories to ensure all language parsers are working correctly. 
4 | 5 | ## 🚀 Quick Start 6 | 7 | ```bash 8 | # Install dependencies 9 | pip install requests rich 10 | 11 | # Start GitProbe server (in another terminal) 12 | ./gitprobe server 13 | 14 | # Run all tests 15 | python tests/test_integration.py 16 | 17 | # Run quick subset (1 repo per language) 18 | python tests/test_integration.py --quick 19 | 20 | # Test specific language 21 | python tests/test_integration.py --language python 22 | 23 | # Verbose output with detailed progress 24 | python tests/test_integration.py --verbose 25 | ``` 26 | 27 | ## 📋 Test Coverage 28 | 29 | The integration tests cover **7 languages** with carefully curated real-world repositories: 30 | 31 | ### Supported Languages 32 | - **Python** - 4 repositories (rich, requests, flask, cpython) 33 | - **JavaScript** - 4 repositories (lodash, axios, express, node) 34 | - **TypeScript** - 3 repositories (vscode, TypeScript, angular) 35 | - **Rust** - 3 repositories (clap, ripgrep, rust) 36 | - **Go** - 3 repositories (cobra, hugo, kubernetes) 37 | - **C** - 3 repositories (cJSON, libuv, curl) 38 | - **C++** - 3 repositories (fmt, Catch2, protobuf) 39 | 40 | ### Test Repository Selection Criteria 41 | - **Real-world usage**: Popular, actively maintained projects 42 | - **Diverse complexity**: From small libraries to large frameworks 43 | - **Language features**: Covers different language patterns and idioms 44 | - **Performance testing**: Includes large repositories to test scaling 45 | 46 | ## 🛠️ Usage Examples 47 | 48 | ### Basic Testing 49 | 50 | ```bash 51 | # Test all languages with all repositories (~25 repositories) 52 | python tests/test_integration.py 53 | 54 | # Quick test with 1 repository per language (6 repositories) 55 | python tests/test_integration.py --quick 56 | ``` 57 | 58 | ### Language-Specific Testing 59 | 60 | ```bash 61 | # Test only Python repositories 62 | python tests/test_integration.py --language python 63 | 64 | # Test multiple specific languages 65 | python 
tests/test_integration.py --language python --language rust 66 | 67 | # Test C/C++ analyzers 68 | python tests/test_integration.py --language c --language c++ 69 | ``` 70 | 71 | ### Advanced Options 72 | 73 | ```bash 74 | # Verbose output showing each test result 75 | python tests/test_integration.py --verbose 76 | 77 | # Custom server URL 78 | python tests/test_integration.py --server http://localhost:9000 79 | 80 | # Longer timeout for large repositories 81 | python tests/test_integration.py --timeout 300 82 | 83 | # JSON output for CI/CD integration 84 | python tests/test_integration.py --json > test_results.json 85 | ``` 86 | 87 | ## 📊 Output Formats 88 | 89 | ### Standard Output 90 | Beautiful terminal output with: 91 | - Progress indicators with spinners 92 | - Colored summary table by language 93 | - Success/failure statistics 94 | - Performance metrics (functions found, duration) 95 | - Error details for failed tests 96 | 97 | ### JSON Output 98 | Structured data perfect for CI/CD integration: 99 | ```json 100 | { 101 | "total_tests": 17, 102 | "passed": 17, 103 | "failed": 0, 104 | "success_rate": 100.0, 105 | "overall_success": true, 106 | "duration": 125.3, 107 | "by_language": { 108 | "Python": { 109 | "passed": 3, 110 | "total": 3, 111 | "results": [...] 
112 | } 113 | } 114 | } 115 | ``` 116 | 117 | ## 🔧 Configuration 118 | 119 | ### Environment Requirements 120 | - **GitProbe server**: Must be running on specified URL (default: `http://localhost:8000`) 121 | - **Dependencies**: `requests` and `rich` packages 122 | - **Network access**: Required for cloning public GitHub repositories 123 | - **Disk space**: Temporary clones are created and cleaned up automatically 124 | 125 | ### Timeout Settings 126 | - **Default**: 120 seconds per repository 127 | - **Large repos**: Consider increasing to 300+ seconds for repositories like kubernetes or rust 128 | - **Quick tests**: Usually complete in 30-60 seconds 129 | 130 | ### Server Health Check 131 | The test suite automatically: 132 | 1. Checks if GitProbe server is running 133 | 2. Validates server health endpoint 134 | 3. Provides clear error messages if server is unavailable 135 | 136 | ## 🎯 Test Success Criteria 137 | 138 | A repository test is considered **successful** if: 139 | - ✅ HTTP 200 response from GitProbe API 140 | - ✅ At least 1 function detected in the codebase 141 | - ✅ No error status in the response 142 | - ✅ Analysis completes within timeout period 143 | 144 | ## 🔍 Troubleshooting 145 | 146 | ### Common Issues 147 | 148 | **Server not running:** 149 | ``` 150 | ❌ GitProbe server is not running or unhealthy 151 | Start server with: ./gitprobe server 152 | ``` 153 | *Solution*: Start GitProbe server in another terminal 154 | 155 | **Timeout errors:** 156 | ``` 157 | ❌ rust/rust: Timeout 158 | ``` 159 | *Solution*: Increase timeout with `--timeout 300` for large repositories 160 | 161 | **No functions detected:** 162 | ``` 163 | ❌ python/someproject: No functions detected 164 | ``` 165 | *Possible causes*: 166 | - Repository has no supported files 167 | - Tree-sitter parser failed to initialize 168 | - Repository structure not recognized 169 | 170 | **Network issues:** 171 | ``` 172 | ❌ python/requests: HTTP 500 173 | ``` 174 | *Solution*: Check internet 
connection and GitHub API limits 175 | 176 | ### Debug Mode 177 | 178 | For detailed debugging, combine flags: 179 | ```bash 180 | python tests/test_integration.py --verbose --language python --timeout 300 181 | ``` 182 | 183 | ## 🚦 CI/CD Integration 184 | 185 | Perfect for continuous integration pipelines: 186 | 187 | ```yaml 188 | # GitHub Actions example 189 | - name: Run GitProbe Integration Tests 190 | run: | 191 | ./gitprobe server & 192 | sleep 10 # Wait for server startup 193 | python tests/test_integration.py --quick --json > results.json 194 | 195 | - name: Check Test Results 196 | run: | 197 | if jq -e '.overall_success == false' results.json; then 198 | echo "Tests failed" 199 | exit 1 200 | fi 201 | ``` 202 | 203 | ## 🏗️ Architecture 204 | 205 | ### Test Structure 206 | - **TestResult**: Dataclass for individual repository results 207 | - **GitProbeIntegrationTests**: Main test runner class 208 | - **Progress tracking**: Real-time progress with rich library 209 | - **Error handling**: Comprehensive timeout and exception handling 210 | 211 | ### Repository Management 212 | - Repositories are cloned by GitProbe server 213 | - Temporary directories are automatically cleaned up 214 | - No local storage required for test suite 215 | 216 | ### Extensibility 217 | - Easy to add new test repositories 218 | - Simple language addition process 219 | - Configurable test sets (quick vs. comprehensive) 220 | 221 | ## 📈 Performance Benchmarks 222 | 223 | Typical execution times on modern hardware: 224 | 225 | | Test Set | Repositories | Duration | Use Case | 226 | |----------|-------------|----------|----------| 227 | | Quick | 6 repos | 30-60s | Development, quick validation | 228 | | Full | ~25 repos | 5-15min | CI/CD, comprehensive testing | 229 | | Single Language | 3-4 repos | 1-3min | Language-specific debugging | 230 | 231 | ## 🤝 Contributing 232 | 233 | To add new test repositories: 234 | 235 | 1. 
Add to appropriate language section in `TEST_REPOSITORIES` 236 | 2. Include description for context 237 | 3. Test with `--language ` first 238 | 4. Consider adding to `QUICK_TEST_SET` if it's a good representative 239 | 240 | Example: 241 | ```python 242 | "Python": [ 243 | ("https://github.com/new/repository", "Description of what it tests"), 244 | # ... existing repos 245 | ] 246 | ``` -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # GitProbe 2 | 3 | [![License](https://img.shields.io/badge/license-MIT-blue.svg)](https://github.com/your-org/gitprobe/blob/main/LICENSE) 4 | [![Python 3.8+](https://img.shields.io/badge/python-3.8+-blue.svg)](https://www.python.org/downloads/) 5 | [![FastAPI](https://img.shields.io/badge/FastAPI-0.115+-green.svg)](https://fastapi.tiangolo.com/) 6 | 7 | Turn any GitHub repository into comprehensive code analysis with interactive call graphs and multi-language support. 
8 | 9 | ## 🚀 Features 10 | 11 | - **Multi-language Analysis**: Support for Python, JavaScript, TypeScript, Rust, Go, C, and C++ 12 | - **Tree-sitter Powered**: Advanced syntax parsing with tree-sitter for accurate code analysis 13 | - **Call Graph Generation**: Interactive visualizations showing function relationships 14 | - **Web API**: RESTful API for integration with other tools and frontends 15 | - **Real-time Analysis**: Live progress tracking and results 16 | - **Repository Insights**: File structure, function counts, and relationship mapping 17 | - **LLM-Ready Output**: Structured JSON optimized for AI analysis 18 | 19 | ## 📸 Preview 20 | 21 | ![GitProbe Preview](docs/preview.png) 22 | 23 | *GitProbe's interactive call graph visualization showing function relationships and code structure analysis* 24 | 25 | ## 📚 Requirements 26 | 27 | - Python 3.8+ 28 | - Git (for repository cloning) 29 | - Internet access for GitHub repository analysis 30 | 31 | ## 📦 Installation 32 | 33 | ```bash 34 | # Clone the repository 35 | git clone https://github.com/your-org/gitprobe.git 36 | cd gitprobe 37 | 38 | # Create virtual environment 39 | python -m venv env 40 | source env/bin/activate # On Windows: env\Scripts\activate 41 | 42 | # Install dependencies 43 | pip install -r requirements.txt 44 | ``` 45 | 46 | ## 💡 Command line usage 47 | 48 | ### Start the Web Server 49 | 50 | ```bash 51 | # Start GitProbe server 52 | ./gitprobe server 53 | 54 | # Server will be available at http://localhost:8000 55 | # API documentation at http://localhost:8000/docs 56 | ``` 57 | 58 | ### CLI Analysis (Legacy) 59 | 60 | ```bash 61 | # Analyze a GitHub repository 62 | python -m gitprobe https://github.com/user/repository 63 | 64 | # With custom output directory 65 | python -m gitprobe https://github.com/user/repository --output ./analysis/ 66 | ``` 67 | 68 | ## 🌐 Web API Usage 69 | 70 | ### Analyze Repository 71 | 72 | ```bash 73 | # Start analysis 74 | curl -X POST 
"http://localhost:8000/analyze" \ 75 | -H "Content-Type: application/json" \ 76 | -d '{"github_url": "https://github.com/psf/requests"}' 77 | ``` 78 | 79 | ### Python API Client 80 | 81 | ```python 82 | import requests 83 | 84 | # Analyze repository 85 | response = requests.post("http://localhost:8000/analyze", json={ 86 | "github_url": "https://github.com/psf/requests", 87 | "include_patterns": ["*.py"], 88 | "exclude_patterns": ["*test*", "docs/"] 89 | }) 90 | 91 | result = response.json() 92 | print(f"Found {result['data']['summary']['total_functions']} functions") 93 | print(f"Languages: {result['data']['summary']['languages_analyzed']}") 94 | ``` 95 | 96 | ### Example Response 97 | 98 | ```json 99 | { 100 | "status": "success", 101 | "data": { 102 | "summary": { 103 | "total_functions": 235, 104 | "total_calls": 657, 105 | "languages_analyzed": ["python"], 106 | "files_analyzed": 45 107 | }, 108 | "functions": [...], 109 | "relationships": [...], 110 | "visualization": { 111 | "cytoscape": {...} 112 | } 113 | } 114 | } 115 | ``` 116 | 117 | ## 🧪 Testing 118 | 119 | GitProbe includes a comprehensive integration test suite that validates all language analyzers: 120 | 121 | ```bash 122 | # Install test dependencies 123 | pip install rich 124 | 125 | # Run quick tests (1 repo per language) 126 | python tests/test_integration.py --quick 127 | 128 | # Test all languages comprehensive 129 | python tests/test_integration.py 130 | 131 | # Test specific language 132 | python tests/test_integration.py --language python 133 | 134 | # Verbose output with detailed progress 135 | python tests/test_integration.py --verbose 136 | 137 | # JSON output for CI/CD 138 | python tests/test_integration.py --json > results.json 139 | ``` 140 | 141 | ### Test Coverage 142 | 143 | - **Python**: rich, requests, flask, cpython 144 | - **JavaScript**: lodash, axios, express, node.js 145 | - **TypeScript**: vscode, typescript, angular 146 | - **Rust**: clap, ripgrep, rust compiler 147 | - 
**Go**: cobra, hugo, kubernetes 148 | - **C**: cJSON, libuv, curl 149 | - **C++**: fmt, catch2, protobuf 150 | 151 | ## 🏗️ Architecture 152 | 153 | ``` 154 | gitprobe/ 155 | ├── src/gitprobe/ 156 | │ ├── analysis/ # Core analysis engine 157 | │ │ ├── analysis_service.py 158 | │ │ ├── call_graph_analyzer.py 159 | │ │ └── repo_analyzer.py 160 | │ ├── analyzers/ # Language-specific parsers 161 | │ │ ├── python.py # Python tree-sitter analyzer 162 | │ │ ├── javascript.py # JavaScript/TypeScript analyzer 163 | │ │ ├── rust.py # Rust analyzer 164 | │ │ ├── go.py # Go analyzer 165 | │ │ ├── c_cpp.py # C/C++ analyzer 166 | │ │ └── ... 167 | │ ├── web/ # FastAPI web server 168 | │ │ └── server.py 169 | │ └── models/ # Data models 170 | │ └── ... 171 | ├── tests/ # Integration test suite 172 | │ ├── test_integration.py 173 | │ └── README.md 174 | └── requirements.txt 175 | ``` 176 | 177 | ## 🎯 Language Support 178 | 179 | | Language | Functions | Calls | Classes | Imports | Status | 180 | |------------|-----------|-------|---------|---------|--------| 181 | | Python | ✅ | ✅ | ✅ | ✅ | Stable | 182 | | JavaScript | ✅ | ✅ | ✅ | ✅ | Stable | 183 | | TypeScript | ✅ | ✅ | ✅ | ✅ | Stable | 184 | | Rust | ✅ | ✅ | ✅ | ✅ | Stable | 185 | | Go | ✅ | ✅ | ✅ | ✅ | Stable | 186 | | C | ✅ | ✅ | ❌ | ✅ | Stable | 187 | | C++ | ✅ | ✅ | ✅ | ✅ | Stable | 188 | 189 | ## 🔧 Configuration 190 | 191 | ### Environment Variables 192 | 193 | ```bash 194 | # Optional: Custom server configuration 195 | export GITPROBE_HOST=0.0.0.0 196 | export GITPROBE_PORT=8000 197 | ``` 198 | 199 | ### Analysis Options 200 | 201 | ```python 202 | # Include/exclude patterns 203 | { 204 | "github_url": "https://github.com/user/repo", 205 | "include_patterns": ["*.py", "*.js"], 206 | "exclude_patterns": ["*test*", "node_modules/", "__pycache__/"] 207 | } 208 | ``` 209 | 210 | ## 🤝 Contributing 211 | 212 | ### Running Tests 213 | 214 | ```bash 215 | # Start GitProbe server (in one terminal) 216 | ./gitprobe server 217 | 218 
| # Run integration tests (in another terminal) 219 | python tests/test_integration.py --quick 220 | ``` 221 | 222 | ### Adding New Languages 223 | 224 | 1. Create analyzer in `src/gitprobe/analyzers/` 225 | 2. Add tree-sitter language dependency to `requirements.txt` 226 | 3. Register analyzer in analysis service 227 | 4. Add test repositories to `tests/test_integration.py` 228 | 229 | ### Development Setup 230 | 231 | ```bash 232 | # Install in development mode 233 | pip install -e . 234 | 235 | # Install development dependencies 236 | pip install pytest black isort mypy 237 | 238 | # Run code formatting 239 | black . 240 | isort . 241 | ``` 242 | 243 | ## 🛠️ Stack 244 | 245 | - [Tree-sitter](https://tree-sitter.github.io/) - Syntax parsing and analysis 246 | - [FastAPI](https://fastapi.tiangolo.com/) - Web API framework 247 | - [Pydantic](https://docs.pydantic.dev/) - Data validation and modeling 248 | - [Rich](https://rich.readthedocs.io/) - Beautiful terminal output 249 | - [Cytoscape.js](https://cytoscape.org/) - Graph visualization (frontend) 250 | 251 | ## 🐛 Known Issues 252 | 253 | - Large repositories (>1000 functions) are limited to 900 functions for performance 254 | - Some complex C++ template syntax may not parse correctly 255 | - Private repositories require local cloning 256 | 257 | ## 📄 License 258 | 259 | This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details. 260 | 261 | --- 262 | 263 | **GitProbe** - Comprehensive multi-language code analysis with interactive call graphs. -------------------------------------------------------------------------------- /src/gitprobe/analyzers/python.py: -------------------------------------------------------------------------------- 1 | """ 2 | Python AST Analyzer 3 | 4 | Analyzes Python source code using the Abstract Syntax Tree (AST) to extract 5 | function definitions, method information, and function call relationships. 
6 | """ 7 | 8 | import ast 9 | import logging 10 | from typing import List, Tuple, Optional 11 | from pathlib import Path 12 | 13 | from gitprobe.models.core import Function, CallRelationship 14 | from gitprobe.core.analysis_limits import AnalysisLimits, create_python_limits 15 | 16 | logger = logging.getLogger(__name__) 17 | 18 | 19 | class PythonASTAnalyzer(ast.NodeVisitor): 20 | """ 21 | AST visitor to extract function information from Python code. 22 | 23 | This analyzer traverses Python AST nodes to identify: 24 | - Function and method definitions 25 | - Function parameters and docstrings 26 | - Function call relationships 27 | - Class context for methods 28 | - Code snippets and line numbers 29 | """ 30 | 31 | def __init__(self, file_path: str, content: str, limits: Optional[AnalysisLimits] = None): 32 | """ 33 | Initialize the Python AST analyzer. 34 | 35 | Args: 36 | file_path: Path to the Python file being analyzed 37 | content: Raw content of the Python file 38 | limits: Analysis limits configuration 39 | """ 40 | self.file_path = file_path 41 | self.content = content 42 | self.lines = content.splitlines() 43 | self.functions: List[Function] = [] 44 | self.call_relationships: List[CallRelationship] = [] 45 | self.current_class_name: str | None = None 46 | self.current_function_name: str | None = None 47 | self.limits = limits or create_python_limits() 48 | 49 | def generic_visit(self, node): 50 | """Override generic_visit to continue AST traversal with limit checks.""" 51 | if self.limits.should_stop(): 52 | return 53 | super().generic_visit(node) 54 | 55 | def visit_ClassDef(self, node: ast.ClassDef): 56 | """Visit class definition and track current class context.""" 57 | if self.limits.should_stop(): 58 | return 59 | 60 | if self.limits.increment(): 61 | return 62 | 63 | self.current_class_name = node.name 64 | self.generic_visit(node) 65 | self.current_class_name = None 66 | 67 | def _process_function_node(self, node: ast.FunctionDef | 
ast.AsyncFunctionDef): 68 | """Helper to process both sync and async function definitions.""" 69 | if self.limits.should_stop(): 70 | return 71 | 72 | if self.limits.increment(): 73 | return 74 | 75 | self.current_function_name = node.name 76 | 77 | function_obj = Function( 78 | name=node.name, 79 | file_path=str(self.file_path), 80 | line_start=node.lineno, 81 | line_end=node.end_lineno, 82 | parameters=[arg.arg for arg in node.args.args], 83 | docstring=ast.get_docstring(node), 84 | is_method=self.current_class_name is not None, 85 | class_name=self.current_class_name, 86 | code_snippet="\n".join(self.lines[node.lineno - 1 : node.end_lineno or node.lineno]), 87 | ) 88 | 89 | if self._should_include_function(function_obj): 90 | if self.limits.can_add_function(): 91 | self.functions.append(function_obj) 92 | if self.limits.add_function(): 93 | return 94 | else: 95 | return 96 | 97 | self.generic_visit(node) 98 | self.current_function_name = None 99 | 100 | def _should_include_function(self, func: Function) -> bool: 101 | """Determine if a function should be included in analysis.""" 102 | if func.name.startswith("_test_") or func.name in ["setUp", "tearDown"]: 103 | return False 104 | 105 | return True 106 | 107 | def visit_FunctionDef(self, node: ast.FunctionDef): 108 | """Visit function definition and extract function information.""" 109 | if self.limits.should_stop(): 110 | return 111 | self._process_function_node(node) 112 | 113 | def visit_AsyncFunctionDef(self, node: ast.AsyncFunctionDef): 114 | """Visit async function definition and extract function information.""" 115 | if self.limits.should_stop(): 116 | return 117 | self._process_function_node(node) 118 | 119 | def visit_Call(self, node: ast.Call): 120 | """Visit function call nodes and record relationships.""" 121 | if self.limits.should_stop(): 122 | return 123 | 124 | if self.limits.increment(): 125 | return 126 | 127 | if self.current_function_name: 128 | call_name = self._get_call_name(node.func) 129 
| if call_name: 130 | if self.limits.can_add_relationship(): 131 | relationship = CallRelationship( 132 | caller=f"{self.file_path}:{self.current_function_name}", 133 | callee=call_name, 134 | call_line=node.lineno, 135 | is_resolved=False, 136 | ) 137 | self.call_relationships.append(relationship) 138 | if self.limits.add_relationship(): 139 | return 140 | else: 141 | return 142 | self.generic_visit(node) 143 | 144 | def _get_call_name(self, node) -> str | None: 145 | """ 146 | Extract function name from a call node. 147 | Handles simple names, attributes (obj.method), and filters built-ins. 148 | """ 149 | PYTHON_BUILTINS = { 150 | "print", 151 | "len", 152 | "str", 153 | "int", 154 | "float", 155 | "bool", 156 | "list", 157 | "dict", 158 | "range", 159 | "enumerate", 160 | "zip", 161 | "isinstance", 162 | "hasattr", 163 | "open", 164 | "super", 165 | "__import__", 166 | } 167 | 168 | if isinstance(node, ast.Name): 169 | if node.id in PYTHON_BUILTINS: 170 | return None 171 | return node.id 172 | elif isinstance(node, ast.Attribute): 173 | if isinstance(node.value, ast.Name): 174 | return f"{node.value.id}.{node.attr}" 175 | return node.attr 176 | return None 177 | 178 | def analyze(self): 179 | """Analyze the Python file and extract functions and relationships.""" 180 | if not self.limits.start_new_file(): 181 | logger.info(f"Skipping {self.file_path} - global limits reached") 182 | return 183 | 184 | try: 185 | tree = ast.parse(self.content) 186 | self.visit(tree) 187 | 188 | logger.info( 189 | f"Python analysis complete for {self.file_path}: {len(self.functions)} functions, " 190 | f"{len(self.call_relationships)} relationships, " 191 | f"nodes_processed={self.limits.nodes_processed}" 192 | ) 193 | except SyntaxError as e: 194 | logger.warning(f"⚠️ Could not parse {self.file_path}: {e}") 195 | except Exception as e: 196 | logger.error(f"⚠️ Error analyzing {self.file_path}: {e}", exc_info=True) 197 | 198 | 199 | def analyze_python_file( 200 | file_path: str, 
"""
GitProbe Utility Functions
Repository cloning and cleanup utilities.
"""

import os
import shutil
import tempfile
import subprocess
import stat
import time
from typing import Optional

# Resolved once at import time; None when git is not installed.
GIT_EXECUTABLE_PATH = shutil.which("git")


def sanitize_github_url(github_url: str) -> str:
    """
    Sanitize a GitHub URL into canonical ``<protocol>github.com/<owner>/<repo>`` form.

    Accepts full URLs (with or without protocol / ``www.``), ``owner/repo``
    shorthand, and URLs with extra path components (e.g. ``/tree/main``).
    Inputs that cannot be parsed are returned unchanged.

    Args:
        github_url: Raw GitHub URL or repository path.

    Returns:
        str: Sanitized GitHub URL suitable for cloning.
    """
    url = github_url.strip()

    # Strip the protocol but remember it so http:// inputs round-trip.
    protocol = "https://"
    if url.startswith("https://"):
        url = url[8:]
    elif url.startswith("http://"):
        url = url[7:]
        protocol = "http://"

    if url.startswith("www."):
        url = url[4:]

    if url.startswith("github.com/"):
        url_parts = url.split("/")
        if len(url_parts) >= 3:
            owner = url_parts[1]
            repo = url_parts[2]
        else:
            return github_url
    elif "/" in url and not url.startswith("github.com"):
        # "owner/repo" shorthand.
        url_parts = url.split("/")
        if len(url_parts) >= 2:
            owner = url_parts[0]
            repo = url_parts[1]
        else:
            return github_url
    else:
        return github_url

    if repo.endswith(".git"):
        repo = repo[:-4]

    return f"{protocol}github.com/{owner}/{repo}"


def clone_repository(github_url: str) -> str:
    """
    Clone a GitHub repository to a temporary directory.

    Uses a shallow, blobless clone (``--depth 1 --filter=blob:none``) to keep
    downloads small, with a 5-minute timeout.

    Args:
        github_url: GitHub repository URL (sanitized automatically).

    Returns:
        str: Path to the cloned repository directory.

    Raises:
        RuntimeError: If cloning fails, times out, or git is not found.
    """
    if not GIT_EXECUTABLE_PATH:
        raise RuntimeError(
            "Git executable not found. Please install Git and ensure it is in the system's PATH."
        )

    sanitized_url = sanitize_github_url(github_url)

    temp_dir = tempfile.mkdtemp(prefix="gitprobe_")

    try:
        if os.name == "nt":
            # Best effort: long paths are common inside cloned repos on Windows.
            try:
                subprocess.run(
                    [
                        GIT_EXECUTABLE_PATH,
                        "config",
                        "--global",
                        "core.longpaths",
                        "true",
                    ],
                    capture_output=True,
                    text=True,
                )
            except Exception:
                # Narrowed from a bare `except:`; failure here is non-fatal.
                pass

        subprocess.run(
            [
                GIT_EXECUTABLE_PATH,
                "clone",
                "--depth",
                "1",
                "--filter=blob:none",
                sanitized_url,
                temp_dir,
            ],
            check=True,
            capture_output=True,
            text=True,
            timeout=300,
        )

        if os.name == "nt":
            # Best-effort sparse checkout to dodge pathological paths on Windows.
            # NOTE(review): the two hard-coded exclusion patterns below are
            # opaque magic values — confirm they are intentional.
            try:
                subprocess.run(
                    [
                        GIT_EXECUTABLE_PATH,
                        "-C",
                        temp_dir,
                        "config",
                        "core.sparseCheckout",
                        "true",
                    ],
                    capture_output=True,
                    text=True,
                )

                sparse_checkout_path = os.path.join(temp_dir, ".git", "info", "sparse-checkout")
                os.makedirs(os.path.dirname(sparse_checkout_path), exist_ok=True)
                with open(sparse_checkout_path, "w") as f:
                    f.write("*\n")
                    f.write("!**/tests/**/CvnF9nAXfESwhrtdkjGhX2wAkKHzwr8N2rjExPK8eZYS/**\n")
                    f.write(
                        "!**/0x0000000000000000000000000000000000000000000000000000000000000002/**\n"
                    )

                subprocess.run(
                    [
                        GIT_EXECUTABLE_PATH,
                        "-C",
                        temp_dir,
                        "read-tree",
                        "-m",
                        "-u",
                        "HEAD",
                    ],
                    capture_output=True,
                    text=True,
                )
            except Exception:
                # Narrowed from a bare `except:`; sparse checkout is optional.
                pass
        return temp_dir
    except subprocess.TimeoutExpired:
        if os.path.exists(temp_dir):
            cleanup_repository_safe(temp_dir)
        raise RuntimeError(
            "Repository cloning timed out after 5 minutes. The repository may be too large or network is slow."
        )
    except subprocess.CalledProcessError as e:
        if os.path.exists(temp_dir):
            cleanup_repository_safe(temp_dir)
        raise RuntimeError(f"Failed to clone repository: {e.stderr}")
    except FileNotFoundError:
        if os.path.exists(temp_dir):
            cleanup_repository_safe(temp_dir)
        raise RuntimeError(
            f"Git executable not found at '{GIT_EXECUTABLE_PATH}'. "
            "Please ensure Git is installed and the path is correct."
        )


def cleanup_repository_safe(repo_dir: str) -> bool:
    """
    Windows-safe removal of a cloned repository directory.

    Handles read-only files (common under ``.git`` on Windows) and retries
    once after force-clearing permissions if the first removal fails.

    Args:
        repo_dir: Path to the repository directory to remove.

    Returns:
        bool: True if the directory was removed, False otherwise.
    """

    def handle_remove_readonly(func, path, exc):
        """rmtree error handler: clear the read-only bit and retry the op."""
        if os.path.exists(path):
            os.chmod(path, stat.S_IWRITE)
            func(path)

    try:
        if os.path.exists(repo_dir):
            if os.name == "nt":
                shutil.rmtree(repo_dir, onerror=handle_remove_readonly)
            else:
                shutil.rmtree(repo_dir)
            return True
        return False
    except PermissionError:
        # Retry once after clearing read-only bits on everything in the tree.
        try:
            time.sleep(1)
            if os.path.exists(repo_dir):
                for root, dirs, files in os.walk(repo_dir):
                    for dirname in dirs:
                        os.chmod(os.path.join(root, dirname), stat.S_IWRITE)
                    for filename in files:
                        file_path = os.path.join(root, filename)
                        if os.path.exists(file_path):
                            os.chmod(file_path, stat.S_IWRITE)
                shutil.rmtree(repo_dir)
            return True
        except Exception as retry_e:
            print(f"⚠️ Warning: Failed to cleanup {repo_dir} after retry: {str(retry_e)}")
            return False
    except Exception as e:
        print(f"⚠️ Warning: Failed to cleanup {repo_dir}: {str(e)}")
        return False


def cleanup_repository(repo_dir: str) -> bool:
    """
    Remove a cloned repository directory (backward-compatible wrapper).

    Args:
        repo_dir: Path to the repository directory to remove.

    Returns:
        bool: True if cleanup successful, False otherwise.
    """
    return cleanup_repository_safe(repo_dir)
def parse_github_url(github_url: str) -> dict:
    """
    Parse a GitHub URL into owner / repository name components.

    Args:
        github_url: GitHub repository URL.

    Returns:
        dict: Keys ``owner``, ``name``, ``full_name``, ``url``; components
        fall back to ``"unknown"`` when the URL cannot be split.
    """
    parts = github_url.rstrip("/").split("/")
    if len(parts) >= 2:
        owner = parts[-2]
        name = parts[-1]
        # Strip only a trailing ".git" suffix (bug fix: str.replace removed
        # every ".git" occurrence, mangling names like "x.git-tools").
        if name.endswith(".git"):
            name = name[:-4]
        return {
            "owner": owner,
            "name": name,
            "full_name": f"{owner}/{name}",
            "url": github_url,
        }
    return {
        "owner": "unknown",
        "name": "unknown",
        "full_name": "unknown",
        "url": github_url,
    }


# --- Shared Analysis Limits --------------------------------------------------
# Common analysis limits and performance controls used across all language
# analyzers, ensuring consistent behavior and resource management.

import time
import logging
from typing import Optional

logger = logging.getLogger(__name__)


class GlobalLimitTracker:
    """
    Global limit tracker shared across ALL language analyzers.

    Enforces hard caps on the total number of functions and relationships
    collected across the entire analysis run.
    """

    def __init__(self, max_total_functions: int = 5000, max_total_relationships: int = 8000):
        # Hard caps for the whole analysis run.
        self.max_total_functions = max_total_functions
        self.max_total_relationships = max_total_relationships
        # Running totals accumulated by every analyzer instance.
        self.total_functions = 0
        self.total_relationships = 0
        # Latched once either cap is hit; never cleared for this tracker.
        self.global_limit_reached = False

    def can_add_function(self) -> bool:
        """Return True if another function fits under the global cap."""
        if self.global_limit_reached:
            return False
        return self.total_functions < self.max_total_functions

    def can_add_relationship(self) -> bool:
        """Return True if another relationship fits under the global cap."""
        if self.global_limit_reached:
            return False
        return self.total_relationships < self.max_total_relationships

    def add_function(self) -> bool:
        """Count one function; return True once the global limit is reached."""
        if self.global_limit_reached:
            return True

        self.total_functions += 1
        if self.total_functions >= self.max_total_functions:
            logger.warning(f"Global function limit reached: {self.max_total_functions}")
            self.global_limit_reached = True
            return True
        return False

    def add_relationship(self) -> bool:
        """Count one relationship; return True once the global limit is reached."""
        if self.global_limit_reached:
            return True

        self.total_relationships += 1
        if self.total_relationships >= self.max_total_relationships:
            logger.warning(f"Global relationship limit reached: {self.max_total_relationships}")
            self.global_limit_reached = True
            return True
        return False

    def should_stop(self) -> bool:
        """Return True when analysis should stop due to global limits."""
        return self.global_limit_reached


# Process-wide singleton; created lazily by get_global_tracker().
_global_tracker = None


def get_global_tracker() -> GlobalLimitTracker:
    """Return the process-wide GlobalLimitTracker, creating it on first use."""
    global _global_tracker
    if _global_tracker is None:
        _global_tracker = GlobalLimitTracker()
    return _global_tracker
def reset_global_tracker():
    """Reset the global tracker (for testing or new analysis runs)."""
    global _global_tracker
    _global_tracker = GlobalLimitTracker()


class AnalysisLimits:
    """
    Unified analysis limits for all language analyzers.

    Provides consistent resource management and performance controls across
    Python, JavaScript, TypeScript, Go, Rust, and C/C++ analyzers.

    Combines per-file caps (node count, wall-clock time) with global caps
    (file count, total time, plus the shared function/relationship budget)
    so analysis samples broadly across a codebase instead of exhaustively
    walking a few large files, and stays fast enough for real-time LLM
    interactions.
    """

    def __init__(
        self,
        max_nodes_per_file: int = 3000,
        max_time_per_file: float = 15.0,
        max_files_analyzed: int = 999999,
        max_total_time: float = 180.0,
        language: str = "unknown",
    ):
        # Per-file budgets.
        self.max_nodes_per_file = max_nodes_per_file
        self.max_time_per_file = max_time_per_file
        # Global budgets for the whole run of this analyzer.
        self.max_files_analyzed = max_files_analyzed
        self.max_total_time = max_total_time
        self.language = language

        # Per-file progress; reset by start_new_file().
        self.nodes_processed = 0
        self.start_time: Optional[float] = None
        self.limit_reached = False

        # Cross-file progress.
        self.files_analyzed = 0
        self.global_start_time: Optional[float] = None
        self.global_limit_reached = False

        # Function/relationship budget shared by ALL language analyzers.
        self.global_tracker = get_global_tracker()

    def start_new_file(self) -> bool:
        """
        Begin analysis of a new file.

        Resets the per-file counters and returns True when analysis may
        proceed; returns False (latching the relevant flag) when a global
        limit forbids further work.
        """
        if self.global_tracker.should_stop():
            logger.info(f"Skipping {self.language} file - global analysis limits reached")
            return False

        now = time.time()
        if self.global_start_time is None:
            self.global_start_time = now

        if self.files_analyzed >= self.max_files_analyzed:
            logger.info(
                f"Skipping {self.language} file - reached global file limit: {self.max_files_analyzed}"
            )
            self.global_limit_reached = True
            return False

        if now - self.global_start_time >= self.max_total_time:
            logger.info(
                f"Skipping {self.language} file - reached global time limit: {self.max_total_time}s"
            )
            self.global_limit_reached = True
            return False

        self.nodes_processed = 0
        self.start_time = now
        self.limit_reached = False
        self.files_analyzed += 1
        return True

    def increment(self) -> bool:
        """
        Count one processed AST node and re-check every limit.

        Returns True as soon as any per-file or global limit is exceeded,
        signalling the caller to stop analyzing.
        """
        if self.limit_reached or self.global_limit_reached:
            return True

        self.nodes_processed += 1
        now = time.time()

        if self.start_time is not None and now - self.start_time >= self.max_time_per_file:
            logger.debug(
                f"{self.language} analysis hit per-file time limit: {self.max_time_per_file}s"
            )
            self.limit_reached = True
            return True

        if self.nodes_processed >= self.max_nodes_per_file:
            logger.debug(
                f"{self.language} analysis hit per-file node limit: {self.max_nodes_per_file} nodes"
            )
            self.limit_reached = True
            return True

        if self.files_analyzed >= self.max_files_analyzed:
            logger.warning(
                f"{self.language} analysis hit global file limit: {self.max_files_analyzed} files"
            )
            self.global_limit_reached = True
            return True

        if (
            self.global_start_time is not None
            and now - self.global_start_time >= self.max_total_time
        ):
            logger.warning(
                f"{self.language} analysis hit global time limit: {self.max_total_time}s"
            )
            self.global_limit_reached = True
            return True

        return False

    def should_stop(self) -> bool:
        """True when any per-file, global, or shared-tracker limit was hit."""
        return self.limit_reached or self.global_limit_reached or self.global_tracker.should_stop()

    def can_add_function(self) -> bool:
        """True when the shared budget allows recording another function."""
        return not self.should_stop() and self.global_tracker.can_add_function()

    def can_add_relationship(self) -> bool:
        """True when the shared budget allows recording another relationship."""
        return not self.should_stop() and self.global_tracker.can_add_relationship()

    def add_function(self) -> bool:
        """Record a function against the shared budget; True if now exhausted."""
        return self.global_tracker.add_function()

    def add_relationship(self) -> bool:
        """Record a relationship against the shared budget; True if now exhausted."""
        return self.global_tracker.add_relationship()

    def get_stats(self) -> dict:
        """Return a snapshot of per-file and global progress counters."""
        now = time.time()
        global_elapsed = now - self.global_start_time if self.global_start_time else 0.0
        file_elapsed = now - self.start_time if self.start_time else 0.0

        return {
            "language": self.language,
            "files_analyzed": self.files_analyzed,
            "max_files": self.max_files_analyzed,
            "global_time_elapsed": round(global_elapsed, 2),
            "max_global_time": self.max_total_time,
            "current_file_nodes": self.nodes_processed,
            "max_nodes_per_file": self.max_nodes_per_file,
            "current_file_time": round(file_elapsed, 2),
            "max_time_per_file": self.max_time_per_file,
            "limit_reached": self.limit_reached,
            "global_limit_reached": self.global_limit_reached,
        }

    def __str__(self) -> str:
        """Compact representation for log lines."""
        return (
            f"AnalysisLimits({self.language}: "
            f"{self.max_nodes_per_file} nodes/file, "
            f"{self.max_time_per_file}s/file, "
            f"{self.max_total_time}s total)"
        )


def create_python_limits() -> AnalysisLimits:
    """Create analysis limits optimized for Python files."""
    return AnalysisLimits(
        max_nodes_per_file=300, max_time_per_file=5.0, max_total_time=60.0, language="python"
    )


def create_javascript_limits() -> AnalysisLimits:
    """Create analysis limits optimized for JavaScript/TypeScript files."""
    return AnalysisLimits(
        max_nodes_per_file=250, max_time_per_file=3.0, max_total_time=45.0, language="javascript"
    )
def create_go_limits() -> AnalysisLimits:
    """Create analysis limits optimized for Go files."""
    return AnalysisLimits(
        max_nodes_per_file=200, max_time_per_file=3.0, max_total_time=30.0, language="go"
    )


def create_rust_limits() -> AnalysisLimits:
    """Create analysis limits optimized for Rust files."""
    return AnalysisLimits(
        max_nodes_per_file=200, max_time_per_file=4.0, max_total_time=30.0, language="rust"
    )


def create_c_cpp_limits() -> AnalysisLimits:
    """Create analysis limits optimized for C/C++ files."""
    return AnalysisLimits(
        max_nodes_per_file=200, max_time_per_file=4.0, max_total_time=30.0, language="c_cpp"
    )
class AnalysisService:
    """
    Centralized analysis service supporting multiple programming languages.

    Orchestrates the complete analysis workflow:
    1. Repository cloning and validation
    2. File structure analysis with filtering
    3. Multi-language AST parsing and call graph generation
    4. Result consolidation and cleanup

    Supports Python, JavaScript/TypeScript, C/C++, Go, and Rust; structured
    so additional languages can be added.
    """

    def __init__(self):
        """Initialize the analysis service with language-specific analyzers."""
        self.call_graph_analyzer = CallGraphAnalyzer()
        # Clone directories created by this service; removed in
        # _cleanup_repository / cleanup_all.
        self._temp_directories = []

    def analyze_repository_full(
        self,
        github_url: str,
        include_patterns: Optional[List[str]] = None,
        exclude_patterns: Optional[List[str]] = None,
    ) -> AnalysisResult:
        """
        Perform complete repository analysis including call graph generation.

        Args:
            github_url: GitHub repository URL to analyze
            include_patterns: File patterns to include (e.g., ['*.py', '*.js'])
            exclude_patterns: Additional patterns to exclude

        Returns:
            AnalysisResult: Complete analysis with functions, relationships, and visualization

        Raises:
            ValueError: If GitHub URL is invalid
            RuntimeError: If analysis fails
        """
        temp_dir = None
        try:
            logger.info(f"Starting full analysis of {github_url}")

            temp_dir = self._clone_repository(github_url)
            repo_info = self._parse_repository_info(github_url)

            logger.info("Analyzing repository file structure...")
            structure_result = self._analyze_structure(temp_dir, include_patterns, exclude_patterns)
            logger.info(f"Found {structure_result['summary']['total_files']} files to analyze.")

            logger.info("Starting call graph analysis...")
            call_graph_result = self._analyze_call_graph(structure_result["file_tree"], temp_dir)
            logger.info(
                f"Call graph analysis complete. Found {call_graph_result['call_graph']['total_functions']} functions."
            )

            readme_content = self._read_readme_file(temp_dir)

            analysis_result = AnalysisResult(
                repository=Repository(
                    url=repo_info["url"],
                    name=repo_info["name"],
                    clone_path=temp_dir,
                    analysis_id=f"{repo_info['owner']}-{repo_info['name']}",
                ),
                functions=call_graph_result["functions"],
                relationships=call_graph_result["relationships"],
                file_tree=structure_result["file_tree"],
                summary={
                    **structure_result["summary"],
                    **call_graph_result["call_graph"],
                    "analysis_type": "full",
                    "languages_analyzed": call_graph_result["call_graph"]["languages_found"],
                },
                visualization=call_graph_result["visualization"],
                readme_content=readme_content,
            )

            logger.info(f"Cleaning up temporary repository directory: {temp_dir}")
            self._cleanup_repository(temp_dir)

            logger.info(
                f"Analysis completed: {analysis_result.summary['total_functions']} functions found"
            )
            return analysis_result

        except Exception as e:
            logger.error(f"Analysis failed: {str(e)}", exc_info=True)
            # temp_dir is always bound (initialized to None above), so the old
            # `"temp_dir" in locals()` membership test was always true and
            # Path(temp_dir) raised TypeError on None when cloning itself
            # failed, masking the original error. Check the value instead.
            if temp_dir and Path(temp_dir).exists():
                self._cleanup_repository(temp_dir)
            # Chain the cause so the triggering exception is preserved
            # (consistent with analyze_repository_structure_only).
            raise RuntimeError(f"Repository analysis failed: {str(e)}") from e

    def analyze_repository_structure_only(
        self,
        github_url: str,
        include_patterns: Optional[List[str]] = None,
        exclude_patterns: Optional[List[str]] = None,
    ) -> Dict[str, Any]:
        """
        Perform lightweight structure-only analysis without call graph generation.

        Args:
            github_url: GitHub repository URL to analyze
            include_patterns: File patterns to include
            exclude_patterns: Additional patterns to exclude

        Returns:
            Dict: Repository structure with file tree and summary statistics

        Raises:
            RuntimeError: If cloning or structure analysis fails
        """
        temp_dir = None
        try:
            logger.info(f"Starting structure analysis of {github_url}")

            temp_dir = self._clone_repository(github_url)
            repo_info = self._parse_repository_info(github_url)

            structure_result = self._analyze_structure(temp_dir, include_patterns, exclude_patterns)

            result = {
                "repository": repo_info,
                "file_tree": structure_result["file_tree"],
                "file_summary": {
                    **structure_result["summary"],
                    "analysis_type": "structure_only",
                },
            }

            self._cleanup_repository(temp_dir)

            logger.info(
                f"Structure analysis completed: {result['file_summary']['total_files']} files found"
            )
            return result

        except Exception as e:
            if temp_dir:
                self._cleanup_repository(temp_dir)
            logger.error(f"Structure analysis failed for {github_url}: {str(e)}")
            raise RuntimeError(f"Structure analysis failed: {str(e)}") from e

    def _clone_repository(self, github_url: str) -> str:
        """Clone repository and return temp dir path (tracked for cleanup)."""
        logger.info(f"Cloning {github_url}...")
        temp_dir = clone_repository(github_url)
        logger.info(f"Repository cloned to {temp_dir}")
        self._temp_directories.append(temp_dir)
        return temp_dir

    def _parse_repository_info(self, github_url: str) -> Dict[str, str]:
        """Parse GitHub URL and extract repository metadata."""
        return parse_github_url(github_url)

    def _analyze_structure(
        self,
        repo_dir: str,
        include_patterns: Optional[List[str]],
        exclude_patterns: Optional[List[str]],
    ) -> Dict[str, Any]:
        """Analyze repository file structure with filtering."""
        logger.info(
            f"Initializing RepoAnalyzer with include: {include_patterns}, exclude: {exclude_patterns}"
        )
        repo_analyzer = RepoAnalyzer(include_patterns, exclude_patterns)
        return repo_analyzer.analyze_repository_structure(repo_dir)

    def _read_readme_file(self, repo_dir: str) -> Optional[str]:
        """
        Find and read the README file from the repository root.

        Returns the README text, or None when no README exists or the first
        match cannot be read safely. Reads go through assert_safe_path /
        safe_open_text to guard against unsafe paths.
        """
        base = Path(repo_dir)
        possible_readme_names = ["README.md", "README", "readme.md", "README.txt"]
        for name in possible_readme_names:
            p = base / name
            if p.exists():
                try:
                    assert_safe_path(base, p)
                    logger.info(f"Found README file at {p}")
                    return safe_open_text(base, p, encoding="utf-8")
                except Exception as e:
                    logger.warning(f"Skipping unsafe/ unreadable README at {p}: {e}")
                    return None
        logger.info("No README file found in repository root.")
        return None

    def _analyze_call_graph(self, file_tree: Dict[str, Any], repo_dir: str) -> Dict[str, Any]:
        """
        Perform multi-language call graph analysis.

        Extracts code files from the file tree, filters them to supported
        languages, and delegates to the call graph analyzer; annotates the
        result with the supported-language list and unsupported-file count.
        """
        logger.info("Extracting code files from file tree...")
        code_files = self.call_graph_analyzer.extract_code_files(file_tree)

        logger.info(f"Found {len(code_files)} total code files. Filtering for supported languages.")
        supported_files = self._filter_supported_languages(code_files)
        logger.info(f"Analyzing {len(supported_files)} supported files.")

        result = self.call_graph_analyzer.analyze_code_files(supported_files, repo_dir)

        result["call_graph"]["supported_languages"] = self._get_supported_languages()
        result["call_graph"]["unsupported_files"] = len(code_files) - len(supported_files)

        return result

    def _filter_supported_languages(self, code_files: List[Dict]) -> List[Dict]:
        """Filter code files to only include supported languages."""
        supported_languages = {
            "python",
            "javascript",
            "typescript",
            "c",
            "cpp",
            "go",
            "rust",
        }

        return [
            file_info
            for file_info in code_files
            if file_info.get("language") in supported_languages
        ]

    def _get_supported_languages(self) -> List[str]:
        """Get list of currently supported languages for analysis."""
        return ["python", "javascript", "typescript", "c", "cpp", "go", "rust"]

    def _cleanup_repository(self, temp_dir: str):
        """Clean up cloned repository and stop tracking it."""
        logger.info(f"Attempting to clean up {temp_dir}")
        cleanup_repository(temp_dir)
        if temp_dir in self._temp_directories:
            self._temp_directories.remove(temp_dir)

    def cleanup_all(self):
        """Clean up all tracked temporary directories."""
        for temp_dir in self._temp_directories[:]:
            self._cleanup_repository(temp_dir)

    def __del__(self):
        """Ensure cleanup on service destruction."""
        self.cleanup_all()


def analyze_repository(
    github_url: str, include_patterns=None, exclude_patterns=None
) -> tuple[AnalysisResult, None]:
    """
    Backward compatibility function.

    Returns:
        tuple: (AnalysisResult, None) - None instead of temp_dir since cleanup is handled internally
    """
    service = AnalysisService()
    result = service.analyze_repository_full(github_url, include_patterns, exclude_patterns)
    return result, None


def analyze_repository_structure_only(
    github_url: str, include_patterns=None, exclude_patterns=None
) -> tuple[Dict, None]:
    """
    Backward compatibility function.

    Returns:
        tuple: (structure_result, None) - None instead of temp_dir since cleanup is handled internally
    """
    service = AnalysisService()
    result = service.analyze_repository_structure_only(
        github_url, include_patterns, exclude_patterns
    )
    return result, None
@dataclass
class TestResult:
    """Result of a single repository test."""

    repo_name: str  # last path segment of the repo URL, e.g. "requests"
    language: str  # language group the repo was tested under
    success: bool  # True when functions were found and no error occurred
    functions: int  # total functions reported by the analysis
    calls: int  # total calls reported by the analysis
    error: Optional[str] = None  # error description when the test failed
    duration: float = 0.0  # wall-clock seconds the request took


class GitProbeIntegrationTests:
    """Main integration test runner for GitProbe analyzers."""

    # Curated test repositories by language
    TEST_REPOSITORIES = {
        "Python": [
            ("https://github.com/Textualize/rich", "Modern terminal formatting"),
            ("https://github.com/psf/requests", "HTTP library for humans"),
            ("https://github.com/pallets/flask", "Lightweight web framework"),
            ("https://github.com/python/cpython", "Python interpreter (large)"),
        ],
        "JavaScript": [
            ("https://github.com/lodash/lodash", "Modern utility library"),
            ("https://github.com/axios/axios", "Promise-based HTTP client"),
            ("https://github.com/expressjs/express", "Fast web framework"),
            ("https://github.com/nodejs/node", "Node.js runtime (large)"),
        ],
        "TypeScript": [
            ("https://github.com/microsoft/vscode", "Code editor (large)"),
            ("https://github.com/microsoft/TypeScript", "TypeScript compiler"),
            ("https://github.com/angular/angular", "Angular framework (large)"),
        ],
        "Rust": [
            ("https://github.com/clap-rs/clap", "Command line parser"),
            ("https://github.com/BurntSushi/ripgrep", "Fast grep alternative"),
            ("https://github.com/rust-lang/rust", "Rust compiler (very large)"),
        ],
        "Go": [
            ("https://github.com/spf13/cobra", "CLI library"),
            ("https://github.com/gohugoio/hugo", "Static site generator"),
            ("https://github.com/kubernetes/kubernetes", "Container orchestration (very large)"),
        ],
        "C": [
            ("https://github.com/DaveGamble/cJSON", "JSON parser in C"),
            ("https://github.com/libuv/libuv", "Cross-platform async I/O"),
            ("https://github.com/curl/curl", "Data transfer library"),
        ],
        "C++": [
            ("https://github.com/fmtlib/fmt", "Modern formatting library"),
            ("https://github.com/catchorg/Catch2", "Modern test framework"),
            ("https://github.com/protocolbuffers/protobuf", "Protocol buffers"),
        ]
    }

    # Quick subset for fast testing
    QUICK_TEST_SET = {
        "Python": [("https://github.com/psf/requests", "HTTP library")],
        "JavaScript": [("https://github.com/axios/axios", "HTTP client")],
        "Rust": [("https://github.com/clap-rs/clap", "CLI parser")],
        "Go": [("https://github.com/spf13/cobra", "CLI library")],
        "C": [("https://github.com/DaveGamble/cJSON", "JSON parser")],
        "C++": [("https://github.com/fmtlib/fmt", "Formatting library")],
    }

    def __init__(self, server_url: str = "http://localhost:8000", timeout: int = 120):
        """Initialize test runner.

        Args:
            server_url: Base URL of a running GitProbe server.
            timeout: Per-repository request timeout in seconds.
        """
        self.server_url = server_url
        self.timeout = timeout
        self.console = Console()
        self.results: List[TestResult] = []

    def check_server_health(self) -> bool:
        """Check if GitProbe server is running and healthy."""
        try:
            response = requests.get(f"{self.server_url}/health", timeout=5)
            return response.status_code == 200
        except requests.RequestException:
            return False

    def test_repository(self, repo_url: str, language: str, description: str = "") -> TestResult:
        """Test analysis of a single repository.

        POSTs the repo URL to the server's /analyze endpoint and converts
        the response (or failure) into a TestResult; never raises.
        """
        # NOTE(review): `description` is currently unused here — kept for
        # call-site symmetry with the (url, description) repo tuples.
        repo_name = repo_url.split('/')[-1]
        start_time = time.time()

        try:
            response = requests.post(
                f"{self.server_url}/analyze",
                json={"github_url": repo_url},
                timeout=self.timeout
            )

            duration = time.time() - start_time

            if response.status_code == 200:
                data = response.json()
                summary = data.get("data", {}).get("summary", {})

                functions = summary.get("total_functions", 0)
                calls = summary.get("total_calls", 0)

                # Consider success if we found functions and no errors
                has_errors = "error" in data.get("status", "").lower()
                success = functions > 0 and not has_errors

                return TestResult(
                    repo_name=repo_name,
                    language=language,
                    success=success,
                    functions=functions,
                    calls=calls,
                    duration=duration
                )
            else:
                return TestResult(
                    repo_name=repo_name,
                    language=language,
                    success=False,
                    functions=0,
                    calls=0,
                    error=f"HTTP {response.status_code}",
                    duration=duration
                )

        except requests.exceptions.Timeout:
            return TestResult(
                repo_name=repo_name,
                language=language,
                success=False,
                functions=0,
                calls=0,
                error="Timeout",
                duration=self.timeout
            )
        except Exception as e:
            return TestResult(
                repo_name=repo_name,
                language=language,
                success=False,
                functions=0,
                calls=0,
                error=str(e),
                duration=time.time() - start_time
            )

    def run_tests(self, languages: Optional[List[str]] = None, quick: bool = False, verbose: bool = False) -> Dict:
        """Run integration tests and return detailed results.

        Args:
            languages: Restrict to these language groups (case-insensitive).
            quick: Use the 1-repo-per-language QUICK_TEST_SET.
            verbose: Print per-repository pass/fail details as tests run.
        """

        # Check server health first
        if not self.check_server_health():
            self.console.print("❌ [red]GitProbe server is not running or unhealthy[/red]")
            self.console.print("   Start server with: [cyan]./gitprobe server[/cyan]")
            return {"error": "Server not available"}

        # Select test set
        test_set = self.QUICK_TEST_SET if quick else self.TEST_REPOSITORIES

        # Filter by languages if specified
        if languages:
            test_set = {lang: repos for lang, repos in test_set.items()
                        if lang.lower() in [l.lower() for l in languages]}

        if not test_set:
            self.console.print("❌ [red]No tests to run with current filters[/red]")
            return {"error": "No tests selected"}

        # Display test plan
        total_tests = sum(len(repos) for repos in test_set.values())
        self.console.print(Panel(
            f"🧪 [bold blue]GitProbe Integration Test Suite[/bold blue]\n\n"
            f"Testing {len(test_set)} languages, {total_tests} repositories\n"
            f"Server: {self.server_url}\n"
            f"Timeout: {self.timeout}s per repository",
            title="Test Configuration"
        ))

        # Run tests with progress tracking
        with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            console=self.console
        ) as progress:

            for language, repos in test_set.items():
                lang_task = progress.add_task(f"Testing {language}...", total=len(repos))

                for repo_url, description in repos:
                    repo_name = repo_url.split('/')[-1]
                    progress.update(lang_task, description=f"Testing {language}: {repo_name}")

                    result = self.test_repository(repo_url, language, description)
                    self.results.append(result)

                    if verbose:
                        status = "✅" if result.success else "❌"
                        details = f"({result.functions} functions, {result.calls} calls, {result.duration:.1f}s)"
                        if result.error:
                            details = f"Error: {result.error}"
                        self.console.print(f"  {status} {repo_name}: {details}")

                    progress.advance(lang_task)

        return self._generate_report()

    def _generate_report(self) -> Dict:
        """Generate comprehensive test report.

        Prints a per-language summary table plus failure details to the
        console, and returns the same data as a JSON-serializable dict.
        """
        # Calculate statistics
        total_tests = len(self.results)
        passed_tests = sum(1 for r in self.results if r.success)
        failed_tests = total_tests - passed_tests

        # Group by language
        by_language = {}
        for result in self.results:
            if result.language not in by_language:
                by_language[result.language] = []
            by_language[result.language].append(result)

        # Create summary table
        table = Table(title="📊 Test Results Summary")
        table.add_column("Language", style="cyan", no_wrap=True)
        table.add_column("Passed", style="green", justify="center")
        table.add_column("Failed", style="red", justify="center")
        table.add_column("Success Rate", justify="center")
        table.add_column("Avg Functions", justify="right")
        table.add_column("Total Duration", justify="right")

        overall_success = True
        total_duration = 0

        for language, results in by_language.items():
            passed = sum(1 for r in results if r.success)
            total = len(results)
            failed = total - passed
            success_rate = (passed / total * 100) if total > 0 else 0
            # max(passed, 1) guards against division by zero when all failed
            avg_functions = sum(r.functions for r in results if r.success) / max(passed, 1)
            lang_duration = sum(r.duration for r in results)
            total_duration += lang_duration

            if failed > 0:
                overall_success = False

            status_style = "green" if failed == 0 else "yellow" if passed > 0 else "red"
            table.add_row(
                f"[{status_style}]{language}[/{status_style}]",
                str(passed),
                str(failed),
                f"{success_rate:.0f}%",
                f"{avg_functions:.0f}" if passed > 0 else "0",
                f"{lang_duration:.1f}s"
            )

        self.console.print("\n")
        self.console.print(table)

        # Overall status
        if overall_success:
            self.console.print("\n🎉 [bold green]All analyzers working perfectly![/bold green]")
        elif passed_tests > 0:
            self.console.print(f"\n⚠️  [yellow]Partial success: {passed_tests}/{total_tests} tests passed[/yellow]")
        else:
            self.console.print(f"\n❌ [red]All tests failed - check GitProbe server[/red]")

        # Show failures if any
        failures = [r for r in self.results if not r.success]
        if failures:
            self.console.print(f"\n[red]Failed Tests ({len(failures)}):[/red]")
            for failure in failures:
                error_msg = failure.error or "No functions detected"
                self.console.print(f"  ❌ {failure.language}/{failure.repo_name}: {error_msg}")

        return {
            "total_tests": total_tests,
            "passed": passed_tests,
            "failed": failed_tests,
            "success_rate": (passed_tests / total_tests * 100) if total_tests > 0 else 0,
            "overall_success": overall_success,
            "duration": total_duration,
            "by_language": {
                lang: {
                    "passed": sum(1 for r in results if r.success),
                    "total": len(results),
                    "results": [
                        {
                            "repo": r.repo_name,
                            "success": r.success,
                            "functions": r.functions,
                            "calls": r.calls,
                            "error": r.error,
                            "duration": r.duration
                        }
                        for r in results
                    ]
                }
                for lang, results in by_language.items()
            }
        }


def main():
    """CLI entry point."""
    parser = argparse.ArgumentParser(
        description="GitProbe Integration Test Suite",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python tests/test_integration.py                              # Run all tests
  python tests/test_integration.py --quick                      # Quick test subset
  python tests/test_integration.py --language python            # Test Python only
  python tests/test_integration.py --language python --language rust  # Multiple languages
  python tests/test_integration.py --verbose                    # Detailed output
  python tests/test_integration.py --server http://localhost:9000  # Custom server
        """
    )

    parser.add_argument(
        "--language",
        action="append",
        help="Test specific language(s) only (can be used multiple times)"
    )
    parser.add_argument(
        "--quick",
        action="store_true",
        help="Run quick test subset (1 repo per language)"
    )
    parser.add_argument(
        "--verbose",
        action="store_true",
        help="Show detailed test progress"
    )
    parser.add_argument(
        "--server",
        default="http://localhost:8000",
        help="GitProbe server URL (default: http://localhost:8000)"
    )
    parser.add_argument(
        "--timeout",
        type=int,
        default=120,
        help="Request timeout in seconds (default: 120)"
    )
    parser.add_argument(
        "--json",
        action="store_true",
        help="Output results as JSON"
    )

    args = parser.parse_args()

    # Run tests
    runner = GitProbeIntegrationTests(server_url=args.server, timeout=args.timeout)
    report = runner.run_tests(
        languages=args.language,
        quick=args.quick,
        verbose=args.verbose
    )

    # Output results
    if args.json:
        print(json.dumps(report, indent=2))

    # Exit with error code if tests failed
    if "error" in report:
        sys.exit(1)
    elif not report.get("overall_success", False):
        sys.exit(1)
    else:
        sys.exit(0)


if __name__ == "__main__":
    main()
3 | """ 4 | 5 | import logging 6 | from typing import List, Set, Optional 7 | from pathlib import Path 8 | 9 | from tree_sitter import Parser, Language 10 | import tree_sitter_go 11 | 12 | from gitprobe.models.core import Function, CallRelationship 13 | from gitprobe.core.analysis_limits import AnalysisLimits, create_go_limits 14 | 15 | logger = logging.getLogger(__name__) 16 | 17 | 18 | class TreeSitterGoAnalyzer: 19 | """Go analyzer using tree-sitter for proper AST parsing.""" 20 | 21 | def __init__(self, file_path: str, content: str, limits: Optional[AnalysisLimits] = None): 22 | self.file_path = Path(file_path) 23 | self.content = content 24 | self.functions: List[Function] = [] 25 | self.call_relationships: List[CallRelationship] = [] 26 | self.limits = limits or create_go_limits() 27 | 28 | try: 29 | language_capsule = tree_sitter_go.language() 30 | self.go_language = Language(language_capsule) 31 | self.parser = Parser(self.go_language) 32 | logger.debug(f"Go parser initialized with language object: {type(self.go_language)}") 33 | 34 | test_code = 'package main\nfunc main() { println("test") }' 35 | test_tree = self.parser.parse(bytes(test_code, "utf8")) 36 | if test_tree is None or test_tree.root_node is None: 37 | raise RuntimeError("Parser setup test failed for Go") 38 | logger.debug(f"Go parser test successful - root node type: {test_tree.root_node.type}") 39 | 40 | except Exception as e: 41 | logger.error(f"Failed to initialize Go parser: {e}") 42 | self.parser = None 43 | self.go_language = None 44 | 45 | logger.info(f"TreeSitterGoAnalyzer initialized for {file_path} with limits: {self.limits}") 46 | 47 | def analyze(self) -> None: 48 | """Analyze the Go content and extract functions and call relationships.""" 49 | if not self.limits.start_new_file(): 50 | logger.info(f"Skipping {self.file_path} - global limits reached") 51 | return 52 | 53 | if self.parser is None: 54 | logger.warning(f"Skipping {self.file_path} - parser initialization failed") 55 | 
return 56 | 57 | try: 58 | tree = self.parser.parse(bytes(self.content, "utf8")) 59 | root_node = tree.root_node 60 | 61 | logger.info(f"Parsed AST with root node type: {root_node.type}") 62 | 63 | self._extract_functions(root_node) 64 | 65 | if not self.limits.should_stop(): 66 | self._extract_call_relationships(root_node) 67 | 68 | logger.info( 69 | f"Analysis complete: {len(self.functions)} functions, {len(self.call_relationships)} relationships, {self.limits.nodes_processed} nodes processed" 70 | ) 71 | 72 | except Exception as e: 73 | logger.error(f"Error analyzing Go file {self.file_path}: {e}", exc_info=True) 74 | 75 | def _extract_functions(self, node) -> None: 76 | """Extract all function definitions from the AST.""" 77 | self._traverse_for_functions(node) 78 | self.functions.sort(key=lambda f: f.line_start) 79 | 80 | def _traverse_for_functions(self, node) -> None: 81 | """Recursively traverse AST nodes to find functions.""" 82 | if self.limits.should_stop(): 83 | return 84 | 85 | if node.type == "function_declaration": 86 | func = self._extract_function_declaration(node) 87 | if func and self._should_include_function(func): 88 | if self.limits.can_add_function(): 89 | self.functions.append(func) 90 | if self.limits.add_function(): 91 | return 92 | else: 93 | return 94 | 95 | elif node.type == "method_declaration": 96 | func = self._extract_method_declaration(node) 97 | if func and self._should_include_function(func): 98 | if self.limits.can_add_function(): 99 | self.functions.append(func) 100 | if self.limits.add_function(): 101 | return 102 | else: 103 | return 104 | 105 | elif node.type == "func_literal": 106 | func = self._extract_func_literal(node) 107 | if func and self._should_include_function(func): 108 | if self.limits.can_add_function(): 109 | self.functions.append(func) 110 | if self.limits.add_function(): 111 | return 112 | else: 113 | return 114 | 115 | for child in node.children: 116 | self._traverse_for_functions(child) 117 | if 
self.limits.should_stop(): 118 | break 119 | 120 | def _extract_function_declaration(self, node) -> Optional[Function]: 121 | """Extract regular function declaration: func name() {}""" 122 | try: 123 | name_node = self._find_child_by_type(node, "identifier") 124 | if not name_node: 125 | return None 126 | 127 | func_name = self._get_node_text(name_node) 128 | line_start = node.start_point[0] + 1 129 | line_end = node.end_point[0] + 1 130 | parameters = self._extract_parameters(node) 131 | code_snippet = self._get_node_text(node) 132 | 133 | return Function( 134 | name=func_name, 135 | file_path=str(self.file_path), 136 | line_start=line_start, 137 | line_end=line_end, 138 | parameters=parameters, 139 | docstring=self._extract_docstring(node), 140 | is_method=False, 141 | class_name=None, 142 | code_snippet=code_snippet, 143 | ) 144 | except Exception as e: 145 | logger.warning(f"Error extracting function declaration: {e}") 146 | return None 147 | 148 | def _extract_method_declaration(self, node) -> Optional[Function]: 149 | """Extract method declaration: func (receiver) methodName() {}""" 150 | try: 151 | name_node = self._find_child_by_type(node, "identifier") 152 | if not name_node: 153 | return None 154 | 155 | func_name = self._get_node_text(name_node) 156 | line_start = node.start_point[0] + 1 157 | line_end = node.end_point[0] + 1 158 | parameters = self._extract_parameters(node) 159 | code_snippet = self._get_node_text(node) 160 | receiver_type = self._extract_receiver_type(node) 161 | 162 | return Function( 163 | name=func_name, 164 | file_path=str(self.file_path), 165 | line_start=line_start, 166 | line_end=line_end, 167 | parameters=parameters, 168 | docstring=self._extract_docstring(node), 169 | is_method=True, 170 | class_name=receiver_type, 171 | code_snippet=code_snippet, 172 | ) 173 | except Exception as e: 174 | logger.warning(f"Error extracting method declaration: {e}") 175 | return None 176 | 177 | def _extract_func_literal(self, node) -> 
    def _extract_func_literal(self, node) -> Optional[Function]:
        """Extract an anonymous function/closure: func() {}.

        Go closures have no name, so a stable synthetic one is derived
        from the starting line number.
        """
        try:
            # tree-sitter points are 0-based (row, col); convert to 1-based lines.
            line_start = node.start_point[0] + 1
            line_end = node.end_point[0] + 1
            parameters = self._extract_parameters(node)
            code_snippet = self._get_node_text(node)

            func_name = f"anonymous_func_line_{line_start}"

            return Function(
                name=func_name,
                file_path=str(self.file_path),
                line_start=line_start,
                line_end=line_end,
                parameters=parameters,
                docstring=None,
                is_method=False,
                class_name=None,
                code_snippet=code_snippet,
            )
        except Exception as e:
            logger.warning(f"Error extracting func literal: {e}")
            return None

    def _should_include_function(self, func: Function) -> bool:
        """Determine if a function should be included in the analysis.

        Filters out Go entry points and trivially short bodies to keep the
        resulting call graph focused.
        """
        # Deliberately exclude Go's special entry points from the graph.
        excluded_names = {
            "init",
            "main",
        }

        if func.name.lower() in excluded_names:
            logger.debug(f"Skipping excluded function: {func.name}")
            return False

        # Bodies spanning fewer than 2 lines are usually trivial; skip them.
        if func.line_end - func.line_start < 2:
            logger.debug(f"Skipping short function: {func.name}")
            return False

        # Closures need a bit more substance than named functions to be kept.
        if func.name.startswith("anonymous_func") and func.line_end - func.line_start < 3:
            logger.debug(f"Skipping simple anonymous function: {func.name}")
            return False

        return True

    def _extract_parameters(self, node) -> List[str]:
        """Extract parameter names from a function node.

        Only the first identifier of each parameter declaration is taken;
        variadic parameters are prefixed with "...".
        """
        parameters = []
        params_node = self._find_child_by_type(node, "parameter_list")
        if params_node:
            for child in params_node.children:
                if child.type == "parameter_declaration":
                    param_name = self._find_child_by_type(child, "identifier")
                    if param_name:
                        parameters.append(self._get_node_text(param_name))
                elif child.type == "variadic_parameter_declaration":
                    param_name = self._find_child_by_type(child, "identifier")
                    if param_name:
                        parameters.append(f"...{self._get_node_text(param_name)}")
        return parameters

    def _extract_receiver_type(self, node) -> Optional[str]:
        """Extract the receiver type from a method declaration.

        For `func (r *T) M()`, the first parameter_list child of the
        method_declaration is the receiver; its type node ("T" or "*T")
        is returned as text.
        """
        receiver_node = self._find_child_by_type(node, "parameter_list")
        if receiver_node and receiver_node.children:
            first_param = receiver_node.children[0] if receiver_node.children else None
            if first_param and first_param.type == "parameter_declaration":
                type_nodes = [
                    child
                    for child in first_param.children
                    if child.type in ["type_identifier", "pointer_type"]
                ]
                if type_nodes:
                    return self._get_node_text(type_nodes[0])
        return None

    def _extract_docstring(self, node) -> Optional[str]:
        """Extract a Go doc comment from the function's preceding sibling.

        NOTE(review): only the single immediately-preceding comment node is
        read; a multi-line `//` doc block (separate comment nodes) would
        yield just its last line — confirm against the grammar if that
        matters.
        """
        if node.prev_sibling and node.prev_sibling.type == "comment":
            comment_text = self._get_node_text(node.prev_sibling)
            lines = comment_text.split("\n")
            cleaned_lines = []
            for line in lines:
                line = line.strip()
                if line.startswith("//"):
                    cleaned_lines.append(line[2:].strip())
                elif line.startswith("/*") and line.endswith("*/"):
                    cleaned_lines.append(line[2:-2].strip())
            return "\n".join(cleaned_lines) if cleaned_lines else None
        return None

    def _extract_call_relationships(self, node) -> None:
        """Extract function call relationships from the AST.

        Builds a line -> Function lookup so each call site can be attributed
        to the enclosing (already extracted) function, then walks the tree
        for call expressions.
        """
        # Map every line covered by a known function back to that function.
        func_ranges = {}
        for func in self.functions:
            for line in range(func.line_start, func.line_end + 1):
                func_ranges[line] = func

        self._traverse_for_calls(node, func_ranges)
self.limits.can_add_relationship(): 287 | self.call_relationships.append(call_info) 288 | if self.limits.add_relationship(): 289 | return 290 | else: 291 | return 292 | 293 | for child in node.children: 294 | self._traverse_for_calls(child, func_ranges) 295 | if self.limits.should_stop(): 296 | break 297 | 298 | def _extract_call_from_node(self, node, func_ranges: dict) -> Optional[CallRelationship]: 299 | """Extract call relationship from a call_expression node.""" 300 | try: 301 | call_line = node.start_point[0] + 1 302 | caller_func = func_ranges.get(call_line) 303 | if not caller_func: 304 | return None 305 | 306 | callee_name = self._extract_callee_name(node) 307 | if not callee_name or self._is_builtin_function(callee_name): 308 | return None 309 | 310 | caller_id = f"{self.file_path}:{caller_func.name}" 311 | return CallRelationship( 312 | caller=caller_id, 313 | callee=callee_name, 314 | call_line=call_line, 315 | is_resolved=False, 316 | ) 317 | except Exception as e: 318 | logger.warning(f"Error extracting call relationship: {e}") 319 | return None 320 | 321 | def _extract_callee_name(self, call_node) -> Optional[str]: 322 | """Extract the name of the called function.""" 323 | if call_node.children: 324 | callee_node = call_node.children[0] 325 | 326 | if callee_node.type == "identifier": 327 | return self._get_node_text(callee_node) 328 | elif callee_node.type == "selector_expression": 329 | field_node = self._find_child_by_type(callee_node, "field_identifier") 330 | if field_node: 331 | return self._get_node_text(field_node) 332 | elif callee_node.type == "qualified_type": 333 | name_node = self._find_child_by_type(callee_node, "type_identifier") 334 | if name_node: 335 | return self._get_node_text(name_node) 336 | return None 337 | 338 | def _is_builtin_function(self, name: str) -> bool: 339 | """Check if function name is a Go built-in.""" 340 | builtins = { 341 | "append", 342 | "cap", 343 | "close", 344 | "complex", 345 | "copy", 346 | "delete", 347 
| "imag", 348 | "len", 349 | "make", 350 | "new", 351 | "panic", 352 | "print", 353 | "println", 354 | "real", 355 | "recover", 356 | "fmt", 357 | "log", 358 | "os", 359 | "io", 360 | "strings", 361 | "strconv", 362 | "time", 363 | "context", 364 | "errors", 365 | "sync", 366 | "http", 367 | "json", 368 | "encoding", 369 | "reflect", 370 | "sort", 371 | "math", 372 | "rand", 373 | "crypto", 374 | "hash", 375 | "net", 376 | "url", 377 | "path", 378 | "filepath", 379 | "buffer", 380 | "bytes", 381 | "regexp", 382 | "template", 383 | "html", 384 | "xml", 385 | "sql", 386 | "runtime", 387 | "unsafe", 388 | "atomic", 389 | "testing", 390 | "flag", 391 | "tar", 392 | "zip", 393 | "gzip", 394 | "base64", 395 | "hex", 396 | "pprof", 397 | "debug", 398 | "trace", 399 | "plugin", 400 | } 401 | return name in builtins 402 | 403 | # Helper methods 404 | def _find_child_by_type(self, node, node_type: str): 405 | """Find first child node of specified type.""" 406 | for child in node.children: 407 | if child.type == node_type: 408 | return child 409 | return None 410 | 411 | def _find_children_by_type(self, node, node_type: str): 412 | """Find all child nodes of specified type.""" 413 | return [child for child in node.children if child.type == node_type] 414 | 415 | def _get_node_text(self, node) -> str: 416 | """Get the text content of a node.""" 417 | start_byte = node.start_byte 418 | end_byte = node.end_byte 419 | return self.content.encode("utf8")[start_byte:end_byte].decode("utf8") 420 | 421 | 422 | # Integration functions 423 | def analyze_go_file_treesitter( 424 | file_path: str, content: str, limits: Optional[AnalysisLimits] = None 425 | ) -> tuple[List[Function], List[CallRelationship]]: 426 | """Analyze a Go file using tree-sitter.""" 427 | try: 428 | logger.info(f"Tree-sitter Go analysis for {file_path}") 429 | analyzer = TreeSitterGoAnalyzer(file_path, content, limits) 430 | analyzer.analyze() 431 | logger.info( 432 | f"Found {len(analyzer.functions)} functions, 
{len(analyzer.call_relationships)} calls, {analyzer.limits.nodes_processed} nodes processed" 433 | ) 434 | return analyzer.functions, analyzer.call_relationships 435 | except Exception as e: 436 | logger.error(f"Error in tree-sitter Go analysis for {file_path}: {e}", exc_info=True) 437 | return [], [] 438 | -------------------------------------------------------------------------------- /src/gitprobe/utils/patterns.py: -------------------------------------------------------------------------------- 1 | """ 2 | Code analysis patterns for different programming languages. 3 | 4 | This module contains patterns used to identify entry points, high-connectivity files, 5 | and function definitions across multiple programming languages. 6 | """ 7 | 8 | from typing import List, Dict 9 | 10 | DEFAULT_IGNORE_PATTERNS = { 11 | ".github", 12 | ".vscode", 13 | ".git", 14 | ".gitignore", 15 | ".gitmodules", 16 | ".gitignore", 17 | # Python 18 | "*.pyc", 19 | "*.pyo", 20 | "*.pyd", 21 | "__pycache__", 22 | ".pytest_cache", 23 | ".coverage", 24 | ".tox", 25 | ".nox", 26 | ".mypy_cache", 27 | ".ruff_cache", 28 | ".hypothesis", 29 | "poetry.lock", 30 | "Pipfile.lock", 31 | # JavaScript/FileSystemNode 32 | "node_modules", 33 | "bower_components", 34 | "package-lock.json", 35 | "yarn.lock", 36 | ".npm", 37 | ".yarn", 38 | ".pnpm-store", 39 | "bun.lock", 40 | "bun.lockb", 41 | # Java 42 | "*.class", 43 | "*.jar", 44 | "*.war", 45 | "*.ear", 46 | "*.nar", 47 | ".gradle/", 48 | "build/", 49 | ".settings/", 50 | ".classpath", 51 | "gradle-app.setting", 52 | "*.gradle", 53 | # IDEs and editors / Java 54 | ".project", 55 | # C/C++ 56 | "*.o", 57 | "*.obj", 58 | "*.dll", 59 | "*.dylib", 60 | "*.exe", 61 | "*.lib", 62 | "*.out", 63 | "*.a", 64 | "*.pdb", 65 | # Swift/Xcode 66 | ".build/", 67 | "*.xcodeproj/", 68 | "*.xcworkspace/", 69 | "*.pbxuser", 70 | "*.mode1v3", 71 | "*.mode2v3", 72 | "*.perspectivev3", 73 | "*.xcuserstate", 74 | "xcuserdata/", 75 | ".swiftpm/", 76 | # Ruby 77 | "*.gem", 
# Glob-style filename patterns GitProbe will consider for analysis.
DEFAULT_INCLUDE_PATTERNS = [
    "*.py", "*.js", "*.ts", "*.jsx", "*.tsx",
    "*.java", "*.cpp", "*.c", "*.h", "*.cs",
    "*.go", "*.rs", "*.php", "*.rb",
    "*.swift", "*.kt", "*.scala", "*.clj", "*.hs", "*.ml",
    "*.html", "*.css", "*.scss", "*.sass",
    "*.json", "*.yaml", "*.yml", "*.xml",
    "*.md", "*.txt", "*.toml", "*.cfg", "*.ini",
]

# Maps a file extension to the language label used by the analyzers.
CODE_EXTENSIONS = {
    ".py": "python",
    ".js": "javascript",
    ".ts": "typescript",
    ".jsx": "javascript",
    ".tsx": "typescript",
    ".java": "java",
    ".cpp": "cpp",
    ".cc": "cpp",
    ".cxx": "cpp",
    ".c++": "cpp",
    ".c": "c",
    ".h": "c",  # NOTE: bare .h headers are classified as C, not C++
    ".hpp": "cpp",
    ".hxx": "cpp",
    ".h++": "cpp",
    ".rs": "rust",
    ".go": "go",
    ".php": "php",
    ".rb": "ruby",
    ".swift": "swift",
    ".kt": "kotlin",
    ".scala": "scala",
    ".cs": "csharp",
}

# Entry point file patterns for all supported languages (exact filenames,
# matched case-insensitively by is_entry_point_file).
ENTRY_POINT_PATTERNS = {
    # Python
    "main.py", "app.py", "server.py", "__main__.py", "run.py", "start.py",
    "manage.py", "wsgi.py", "asgi.py", "gunicorn.py",  # Django/Flask patterns
    # JavaScript/TypeScript
    "index.js", "app.js", "server.js", "main.js",
    "index.ts", "app.ts", "server.ts", "main.ts",
    "start.js", "start.ts", "bootstrap.js", "bootstrap.ts",
    "entry.js", "entry.ts",
    # Go
    "main.go", "cmd.go", "server.go", "app.go", "root.go", "start.go",
    # Rust
    "main.rs", "lib.rs", "server.rs", "app.rs", "start.rs", "bin.rs",
    # C/C++
    "main.c", "main.cpp", "main.cc", "main.cxx",
    "app.c", "app.cpp", "start.c", "start.cpp", "entry.c", "entry.cpp",
}

# Additional entry point path patterns (for when filename patterns fail);
# matched as substrings of the lowercased path.
ENTRY_POINT_PATH_PATTERNS = [
    "cmd/main", "cmd/root", "cmd/server",  # Go command patterns
    "src/main", "src/app", "src/server",  # Common src patterns
    "bin/main", "bin/app", "bin/server",  # Binary patterns
    "app/main", "app/server", "app/start",  # App directory patterns
    "scripts/start", "scripts/run",  # Script patterns
]

# Flexible entry point name patterns (partial matches on the filename).
ENTRY_POINT_NAME_PATTERNS = [
    "main", "app", "server", "start", "run", "entry", "bootstrap",
    "init", "cmd", "cli", "daemon", "service", "worker", "launcher",
]

# High connectivity file patterns (files likely to have many function calls);
# matched as substrings of the lowercased filename or path.
HIGH_CONNECTIVITY_PATTERNS = {
    # General patterns
    "router", "controller", "service", "handler", "middleware",
    "api", "core", "engine", "manager", "processor", "client",
    # Language-specific patterns
    "mod", "module",  # Rust modules
    "pkg", "package",  # Go packages
    "lib", "util", "utils", "helper", "helpers",
    # Framework patterns
    "express", "fastapi", "gin", "actix", "rocket",  # Web frameworks
    "db", "database", "model", "entity", "repo", "repository",
    # Additional patterns
    "config", "settings", "constants", "types", "interfaces",
    # Generic library patterns (added for broader coverage)
    "console", "text", "style", "render", "display", "format",
    "parse", "parser", "convert", "transform", "process",
    "table", "tree", "list", "grid", "layout", "widget",
    "color", "theme", "visual", "graphic", "draw", "paint",
    "file", "io", "stream", "buffer", "cache", "store",
    "base", "common", "shared", "global", "main", "index",
}

# Source directory patterns across all languages (substring matches).
SOURCE_DIRECTORY_PATTERNS = [
    "src/", "lib/", "core/", "pkg/",  # General
    "cmd/", "internal/",  # Go specific
    "crates/", "modules/",  # Rust specific
    "include/", "source/",  # C/C++ specific
    "components/", "services/", "utils/",  # Framework patterns
]

# Function definition patterns for quick file scanning; "{name}" is filled
# in by the caller before matching.
FUNCTION_DEFINITION_PATTERNS = {
    "python": ["def {name}"],
    "javascript": ["function {name}", "const {name}", "export {name}"],
    "typescript": ["function {name}", "const {name}", "export {name}"],
    "go": ["func {name}"],
    "rust": ["fn {name}", "pub fn {name}"],
    "c": ["void {name}", "int {name}", "{name}("],
    "cpp": ["void {name}", "int {name}", "{name}("],
    "general": ["{name}("],  # Fallback pattern
}

# Critical function name patterns (lowercased comparison).
CRITICAL_FUNCTION_NAMES = {"main", "index", "app", "server", "start", "init", "run", "new"}

# Export/public function patterns for critical function detection
# (substring matches against the lowercased code snippet).
EXPORT_PATTERNS = [
    # JavaScript/TypeScript exports
    "export default", "module.exports =", "exports.",
    # Rust public functions
    "pub fn main", "pub fn new", "pub fn",
    # Go exported functions (capitalized)
    "func main", "func new",
    # C/C++ main functions
    "int main", "void main", "public static void main",
    # Python special methods
    'if __name__ == "__main__"',
]
def get_function_patterns_for_language(language: str) -> list:
    """
    Get function definition patterns for a specific language.

    Args:
        language: Programming language name

    Returns:
        List of function definition patterns for the language, falling back
        to the generic "{name}(" pattern for unknown languages.
    """
    key = language.lower()
    return FUNCTION_DEFINITION_PATTERNS.get(key, FUNCTION_DEFINITION_PATTERNS["general"])


def is_entry_point_file(filename: str) -> bool:
    """
    Check if a filename matches entry point patterns.

    Args:
        filename: Name of the file to check

    Returns:
        True if the file is likely an entry point
    """
    lowered = filename.lower()

    # Exact, known entry-point filenames win immediately.
    if lowered in ENTRY_POINT_PATTERNS:
        return True

    # Otherwise require a recognized code extension plus a fuzzy name match.
    code_extensions = (".py", ".js", ".ts", ".go", ".rs", ".c", ".cpp")
    if not any(ext in lowered for ext in code_extensions):
        return False
    return any(token in lowered for token in ENTRY_POINT_NAME_PATTERNS)


def is_entry_point_path(filepath: str) -> bool:
    """
    Check if a file path matches entry point path patterns.

    Args:
        filepath: Full path of the file to check

    Returns:
        True if the path suggests an entry point
    """
    lowered = filepath.lower()
    return any(pattern in lowered for pattern in ENTRY_POINT_PATH_PATTERNS)


def has_high_connectivity_potential(filename: str, filepath: str) -> bool:
    """
    Check if a file has high connectivity potential based on name and path.

    Args:
        filename: Name of the file
        filepath: Full path of the file

    Returns:
        True if the file likely has high connectivity
    """
    lowered_name = filename.lower()
    lowered_path = filepath.lower()

    if any(token in lowered_name for token in HIGH_CONNECTIVITY_PATTERNS):
        return True
    if any(token in lowered_path for token in HIGH_CONNECTIVITY_PATTERNS):
        return True
    # Living under a conventional source directory also counts.
    return any(prefix in lowered_path for prefix in SOURCE_DIRECTORY_PATTERNS)


def is_critical_function(func_name: str, code_snippet: str = None) -> bool:
    """
    Check if a function is critical based on name and code patterns.

    Args:
        func_name: Name of the function
        code_snippet: Optional code snippet to analyze

    Returns:
        True if the function is considered critical
    """
    if func_name.lower() in CRITICAL_FUNCTION_NAMES:
        return True

    if not code_snippet:
        return False
    # Exported/public definitions are treated as critical too.
    lowered_snippet = code_snippet.lower()
    return any(pattern in lowered_snippet for pattern in EXPORT_PATTERNS)
def find_fallback_entry_points(code_files: List[Dict], max_files: int = 5) -> List[Dict]:
    """
    Find fallback entry points when standard patterns don't match.

    Args:
        code_files: List of all code files
        max_files: Maximum number of fallback files to return

    Returns:
        List of files that could serve as entry points
    """
    candidates: List[Dict] = []

    for file_info in code_files:
        lowered_name = file_info["name"].lower()
        lowered_path = file_info["path"].lower()

        # Any main-like filename qualifies directly.
        if any(token in lowered_name for token in ("main", "app", "server", "start", "index")):
            candidates.append(file_info)
        # Otherwise fall back to entry-point path heuristics.
        elif is_entry_point_path(lowered_path):
            candidates.append(file_info)

    # Last resort: anything in the repo root or one level down.
    if not candidates:
        candidates = [fi for fi in code_files if fi["path"].count("/") <= 1]

    def _priority(file_info: Dict) -> int:
        """Lower score = more likely entry point (used as sort key)."""
        path = file_info["path"].lower()
        name = file_info["name"].lower()

        score = -path.count("/")  # prefer files closer to the root
        if any(token in name for token in ("main", "app", "index")):
            score -= 10  # prefer common entry point names
        if any(ext in name for ext in (".py", ".js", ".go", ".rs")):
            score -= 5  # prefer certain extensions
        return score

    return sorted(candidates, key=_priority)[:max_files]


def find_fallback_connectivity_files(code_files: List[Dict], max_files: int = 10) -> List[Dict]:
    """
    Find fallback high-connectivity files when standard patterns don't match.

    Args:
        code_files: List of all code files
        max_files: Maximum number of fallback files to return

    Returns:
        List of files that likely have good connectivity
    """
    picked: List[Dict] = []

    # Pass 1: anything under a conventional source directory.
    for file_info in code_files:
        lowered_path = file_info["path"].lower()
        if any(prefix in lowered_path for prefix in ("src/", "lib/", "app/", "pkg/", "core/")):
            picked.append(file_info)

    # Pass 2: top up with ordinary source files, skipping test files.
    if len(picked) < max_files:
        source_extensions = (".py", ".js", ".ts", ".go", ".rs", ".c", ".cpp")
        test_markers = ("test", "spec", "_test")
        for file_info in code_files:
            if file_info in picked:
                continue
            lowered_name = file_info["name"].lower()
            if not any(ext in lowered_name for ext in source_extensions):
                continue
            if any(marker in lowered_name for marker in test_markers):
                continue
            picked.append(file_info)

    return picked[:max_files]
7 | """ 8 | 9 | import logging 10 | from typing import List, Tuple, Dict, Any, Optional, Set 11 | from pathlib import Path 12 | 13 | from tree_sitter import Parser, Language 14 | import tree_sitter_c 15 | import tree_sitter_cpp 16 | 17 | from gitprobe.models.core import Function, CallRelationship 18 | from gitprobe.core.analysis_limits import AnalysisLimits, create_c_cpp_limits 19 | 20 | logger = logging.getLogger(__name__) 21 | 22 | 23 | class TreeSitterCAnalyzer: 24 | """C/C++ analyzer using tree-sitter for proper AST parsing.""" 25 | 26 | def __init__( 27 | self, 28 | file_path: str, 29 | content: str, 30 | language: str = "c", 31 | limits: Optional[AnalysisLimits] = None, 32 | ): 33 | self.file_path = str(file_path) 34 | self.content = content 35 | self.language = language.lower() 36 | self.lines = content.splitlines() 37 | self.functions: List[Function] = [] 38 | self.call_relationships: List[CallRelationship] = [] 39 | self.limits = limits or create_c_cpp_limits() 40 | 41 | is_cpp = ( 42 | self.language == "cpp" 43 | or self.language == "c++" 44 | or Path(file_path).suffix.lower() 45 | in [".cpp", ".cc", ".cxx", ".c++", ".hpp", ".hxx", ".h++"] 46 | ) 47 | 48 | try: 49 | if is_cpp: 50 | language_capsule = tree_sitter_cpp.language() 51 | self.language_obj = Language(language_capsule) 52 | self.parser = Parser(self.language_obj) 53 | logger.debug( 54 | f"C++ parser initialized with language object: {type(self.language_obj)}" 55 | ) 56 | else: 57 | language_capsule = tree_sitter_c.language() 58 | self.language_obj = Language(language_capsule) 59 | self.parser = Parser(self.language_obj) 60 | logger.debug( 61 | f"C parser initialized with language object: {type(self.language_obj)}" 62 | ) 63 | 64 | test_code = "int main() { return 0; }" if not is_cpp else "int main() { return 0; }" 65 | test_tree = self.parser.parse(bytes(test_code, "utf8")) 66 | if test_tree is None or test_tree.root_node is None: 67 | raise RuntimeError(f"Parser setup test failed for 
{self.language.upper()}") 68 | logger.debug(f"Parser test successful - root node type: {test_tree.root_node.type}") 69 | 70 | except Exception as e: 71 | logger.error(f"Failed to initialize {self.language.upper()} parser: {e}") 72 | self.parser = None 73 | self.language_obj = None 74 | 75 | logger.info( 76 | f"TreeSitterCAnalyzer initialized for {file_path} ({self.language.upper()}) with limits: {self.limits}" 77 | ) 78 | 79 | def analyze(self) -> None: 80 | """Analyze C/C++ code using tree-sitter.""" 81 | if not self.limits.start_new_file(): 82 | logger.info(f"Skipping {self.file_path} - global limits reached") 83 | return 84 | 85 | if self.parser is None: 86 | logger.warning(f"Skipping {self.file_path} - parser initialization failed") 87 | return 88 | 89 | try: 90 | logger.debug( 91 | f"Attempting to parse {len(self.content)} bytes of {self.language.upper()} code" 92 | ) 93 | logger.debug(f"Parser language object: {self.language_obj}") 94 | 95 | tree = self.parser.parse(bytes(self.content, "utf8")) 96 | 97 | if tree is None: 98 | raise ValueError("Parser returned None tree") 99 | 100 | root_node = tree.root_node 101 | if root_node is None: 102 | raise ValueError("Tree has no root node") 103 | 104 | logger.info(f"Parsed AST with root node type: {root_node.type}") 105 | 106 | if root_node.has_error: 107 | logger.warning(f"Parse tree contains errors for {self.file_path}") 108 | 109 | self._extract_functions(root_node) 110 | 111 | if not self.limits.should_stop(): 112 | self._extract_calls(root_node) 113 | 114 | logger.info( 115 | f"Tree-sitter {self.language.upper()} analysis complete: " 116 | f"{len(self.functions)} functions, {len(self.call_relationships)} calls, " 117 | f"{self.limits.nodes_processed} nodes processed" 118 | ) 119 | 120 | except Exception as e: 121 | logger.error( 122 | f"Tree-sitter {self.language.upper()} analysis failed for {self.file_path}: {e}", 123 | exc_info=True, 124 | ) 125 | 126 | def _extract_functions(self, node): 127 | """Extract 
function definitions from the AST.""" 128 | if self.limits.should_stop(): 129 | return 130 | 131 | if node.type == "function_definition": 132 | func = self._create_function_from_node(node) 133 | if func: 134 | if self.limits.can_add_function(): 135 | self.functions.append(func) 136 | if self.limits.add_function(): 137 | return 138 | else: 139 | return 140 | elif node.type == "function_declarator": 141 | func = self._create_function_from_declarator(node) 142 | if func: 143 | if self.limits.can_add_function(): 144 | self.functions.append(func) 145 | if self.limits.add_function(): 146 | return 147 | else: 148 | return 149 | elif self.language in ["cpp", "c++"] and node.type in [ 150 | "method_definition", 151 | "constructor_definition", 152 | "destructor_definition", 153 | ]: 154 | func = self._create_method_from_node(node) 155 | if func: 156 | if self.limits.can_add_function(): 157 | self.functions.append(func) 158 | if self.limits.add_function(): 159 | return 160 | else: 161 | return 162 | 163 | for child in node.children: 164 | self._extract_functions(child) 165 | if self.limits.should_stop(): 166 | break 167 | 168 | def _create_function_from_node(self, node) -> Optional[Function]: 169 | """Create a Function object from a function_definition node.""" 170 | try: 171 | declarator = self._find_child_by_type(node, "function_declarator") 172 | if not declarator: 173 | return None 174 | 175 | identifier = self._find_child_by_type(declarator, "identifier") 176 | if not identifier: 177 | return None 178 | 179 | func_name = self._get_node_text(identifier) 180 | 181 | line_start = node.start_point[0] + 1 182 | line_end = node.end_point[0] + 1 183 | 184 | params = self._extract_parameters(declarator) 185 | 186 | code_snippet = self._get_node_text(node) 187 | 188 | is_method = self._is_method(node) 189 | class_name = self._get_class_name(node) if is_method else None 190 | 191 | return Function( 192 | name=func_name, 193 | file_path=self.file_path, 194 | line_start=line_start, 
195 | line_end=line_end, 196 | parameters=params, 197 | code_snippet=code_snippet, 198 | is_method=is_method, 199 | class_name=class_name, 200 | docstring=None, 201 | ) 202 | 203 | except Exception as e: 204 | logger.warning(f"Failed to create function from node: {e}") 205 | return None 206 | 207 | def _create_function_from_declarator(self, node) -> Optional[Function]: 208 | """Create a Function object from a function_declarator node (for declarations).""" 209 | try: 210 | identifier = self._find_child_by_type(node, "identifier") 211 | if not identifier: 212 | return None 213 | 214 | func_name = self._get_node_text(identifier) 215 | 216 | line_start = node.start_point[0] + 1 217 | line_end = node.end_point[0] + 1 218 | 219 | params = self._extract_parameters(node) 220 | 221 | code_snippet = ( 222 | self._get_node_text(node.parent) if node.parent else self._get_node_text(node) 223 | ) 224 | 225 | return Function( 226 | name=func_name, 227 | file_path=self.file_path, 228 | line_start=line_start, 229 | line_end=line_end, 230 | parameters=params, 231 | code_snippet=code_snippet, 232 | is_method=False, 233 | class_name=None, 234 | docstring=None, 235 | ) 236 | 237 | except Exception as e: 238 | logger.warning(f"Failed to create function from declarator: {e}") 239 | return None 240 | 241 | def _create_method_from_node(self, node) -> Optional[Function]: 242 | """Create a Function object from a method_definition node.""" 243 | try: 244 | declarator = self._find_child_by_type(node, "function_declarator") 245 | if not declarator: 246 | return None 247 | 248 | identifier = self._find_child_by_type(declarator, "identifier") 249 | if not identifier: 250 | if node.type == "destructor_definition": 251 | destructor_name = self._find_child_by_type(node, "destructor_name") 252 | if destructor_name: 253 | identifier = self._find_child_by_type(destructor_name, "identifier") 254 | 255 | if not identifier: 256 | return None 257 | 258 | func_name = self._get_node_text(identifier) 259 | 
260 | line_start = node.start_point[0] + 1 261 | line_end = node.end_point[0] + 1 262 | 263 | params = self._extract_parameters(declarator) 264 | 265 | code_snippet = self._get_node_text(node) 266 | 267 | class_name = self._get_class_name(node) 268 | 269 | return Function( 270 | name=func_name, 271 | file_path=self.file_path, 272 | line_start=line_start, 273 | line_end=line_end, 274 | parameters=params, 275 | code_snippet=code_snippet, 276 | is_method=True, 277 | class_name=class_name, 278 | docstring=None, 279 | ) 280 | 281 | except Exception as e: 282 | logger.warning(f"Failed to create method from node: {e}") 283 | return None 284 | 285 | def _extract_parameters(self, declarator_node) -> List[str]: 286 | """Extract parameter names from function declarator.""" 287 | params = [] 288 | 289 | param_list = self._find_child_by_type(declarator_node, "parameter_list") 290 | if param_list: 291 | for child in param_list.children: 292 | if child.type == "parameter_declaration": 293 | param_name = self._extract_parameter_name(child) 294 | if param_name: 295 | params.append(param_name) 296 | 297 | return params 298 | 299 | def _extract_parameter_name(self, param_node) -> Optional[str]: 300 | """Extract parameter name from parameter_declaration node.""" 301 | for child in param_node.children: 302 | if child.type == "identifier": 303 | return self._get_node_text(child) 304 | elif child.type in ["pointer_declarator", "array_declarator"]: 305 | identifier = self._find_child_by_type(child, "identifier") 306 | if identifier: 307 | return self._get_node_text(identifier) 308 | return None 309 | 310 | def _extract_calls(self, node): 311 | """Extract function calls from the AST.""" 312 | if self.limits.should_stop(): 313 | return 314 | 315 | if node.type == "call_expression": 316 | self._process_call_expression(node) 317 | if self.limits.increment(): 318 | return 319 | 320 | for child in node.children: 321 | self._extract_calls(child) 322 | if self.limits.should_stop(): 323 | break 
324 | 325 | def _process_call_expression(self, node): 326 | """Process a call_expression node to extract call relationships.""" 327 | try: 328 | function_node = node.children[0] if node.children else None 329 | if not function_node: 330 | return 331 | 332 | callee_name = None 333 | 334 | if function_node.type == "identifier": 335 | callee_name = self._get_node_text(function_node) 336 | elif function_node.type == "field_expression": 337 | field = self._find_child_by_type(function_node, "field_identifier") 338 | if field: 339 | callee_name = self._get_node_text(field) 340 | elif function_node.type == "scoped_identifier": 341 | identifier = self._find_child_by_type(function_node, "identifier") 342 | if identifier: 343 | callee_name = self._get_node_text(identifier) 344 | 345 | if callee_name and not self._is_builtin_function(callee_name): 346 | containing_func = self._find_containing_function(node.start_point[0] + 1) 347 | if containing_func and containing_func.name != callee_name: 348 | call_line = node.start_point[0] + 1 349 | 350 | relationship = CallRelationship( 351 | caller=f"{self.file_path}:{containing_func.name}", 352 | callee=callee_name, 353 | call_line=call_line, 354 | is_resolved=False, 355 | ) 356 | if self.limits.can_add_relationship(): 357 | self.call_relationships.append(relationship) 358 | self.limits.add_relationship() 359 | 360 | except Exception as e: 361 | logger.warning(f"Failed to process call expression: {e}") 362 | 363 | def _find_containing_function(self, line_number: int) -> Optional[Function]: 364 | """Find the function that contains the given line number.""" 365 | for func in self.functions: 366 | if func.line_start is not None and func.line_end is not None: 367 | if func.line_start <= line_number <= func.line_end: 368 | return func 369 | return None 370 | 371 | def _is_method(self, node) -> bool: 372 | """Check if the function is a method (inside a class/struct).""" 373 | parent = node.parent 374 | while parent: 375 | if parent.type in 
["class_specifier", "struct_specifier"]: 376 | return True 377 | parent = parent.parent 378 | return False 379 | 380 | def _get_class_name(self, node) -> Optional[str]: 381 | """Get the class name containing this method.""" 382 | parent = node.parent 383 | while parent: 384 | if parent.type in ["class_specifier", "struct_specifier"]: 385 | for child in parent.children: 386 | if child.type == "type_identifier": 387 | return self._get_node_text(child) 388 | parent = parent.parent 389 | return None 390 | 391 | def _is_builtin_function(self, name: str) -> bool: 392 | """Check if function name is a C/C++ built-in.""" 393 | builtins = { 394 | "printf", 395 | "scanf", 396 | "malloc", 397 | "free", 398 | "calloc", 399 | "realloc", 400 | "strlen", 401 | "strcpy", 402 | "strcmp", 403 | "strcat", 404 | "memcpy", 405 | "memset", 406 | "exit", 407 | "abort", 408 | "assert", 409 | "sizeof", 410 | } 411 | return name in builtins 412 | 413 | def _find_child_by_type(self, node, target_type: str): 414 | """Find the first child node of the specified type.""" 415 | for child in node.children: 416 | if child.type == target_type: 417 | return child 418 | return None 419 | 420 | def _get_node_text(self, node) -> str: 421 | """Get the text content of a node.""" 422 | return self.content[node.start_byte : node.end_byte] 423 | 424 | 425 | def analyze_c_file_treesitter( 426 | file_path: str, content: str, limits: Optional[AnalysisLimits] = None 427 | ) -> Tuple[List[Function], List[CallRelationship]]: 428 | """ 429 | Analyze a C file using Tree-sitter. 
430 | 431 | Args: 432 | file_path: Path to the C file 433 | content: Content of the C file 434 | limits: Analysis limits 435 | 436 | Returns: 437 | Tuple of (functions, call_relationships) 438 | """ 439 | try: 440 | logger.info(f"Tree-sitter C analysis for {file_path}") 441 | if limits is None: 442 | limits = create_c_cpp_limits() 443 | analyzer = TreeSitterCAnalyzer(file_path, content, language="c", limits=limits) 444 | analyzer.analyze() 445 | logger.info( 446 | f"Found {len(analyzer.functions)} functions, {len(analyzer.call_relationships)} calls, {analyzer.limits.nodes_processed} nodes processed" 447 | ) 448 | return analyzer.functions, analyzer.call_relationships 449 | except Exception as e: 450 | logger.error(f"Error in tree-sitter C analysis for {file_path}: {e}", exc_info=True) 451 | return [], [] 452 | 453 | 454 | def analyze_cpp_file_treesitter( 455 | file_path: str, content: str, limits: Optional[AnalysisLimits] = None 456 | ) -> Tuple[List[Function], List[CallRelationship]]: 457 | """ 458 | Analyze a C++ file using Tree-sitter. 
459 | 460 | Args: 461 | file_path: Path to the C++ file 462 | content: Content of the C++ file 463 | limits: Analysis limits 464 | 465 | Returns: 466 | Tuple of (functions, call_relationships) 467 | """ 468 | try: 469 | logger.info(f"Tree-sitter C++ analysis for {file_path}") 470 | if limits is None: 471 | limits = create_c_cpp_limits() 472 | analyzer = TreeSitterCAnalyzer(file_path, content, language="cpp", limits=limits) 473 | analyzer.analyze() 474 | logger.info( 475 | f"Found {len(analyzer.functions)} functions, {len(analyzer.call_relationships)} calls, {analyzer.limits.nodes_processed} nodes processed" 476 | ) 477 | return analyzer.functions, analyzer.call_relationships 478 | except Exception as e: 479 | logger.error(f"Error in tree-sitter C++ analysis for {file_path}: {e}", exc_info=True) 480 | return [], [] 481 | 482 | 483 | def analyze_c_file( 484 | file_path: str, content: str, limits: Optional[AnalysisLimits] = None 485 | ) -> Tuple[List[Function], List[CallRelationship]]: 486 | """Main entry point for C file analysis.""" 487 | return analyze_c_file_treesitter(file_path, content, limits) 488 | 489 | 490 | def analyze_cpp_file( 491 | file_path: str, content: str, limits: Optional[AnalysisLimits] = None 492 | ) -> Tuple[List[Function], List[CallRelationship]]: 493 | """Main entry point for C++ file analysis.""" 494 | return analyze_cpp_file_treesitter(file_path, content, limits) 495 | -------------------------------------------------------------------------------- /src/gitprobe/analyzers/javascript.py: -------------------------------------------------------------------------------- 1 | """ 2 | Advanced JavaScript/TypeScript analyzer using Tree-sitter for accurate AST parsing. 3 | 4 | This module provides proper AST-based analysis for JavaScript and TypeScript files, 5 | replacing the regex-based approach with a more accurate tree-sitter implementation. 
6 | """ 7 | 8 | import logging 9 | from typing import List, Set, Optional 10 | from pathlib import Path 11 | 12 | from tree_sitter import Parser, Language 13 | import tree_sitter_javascript 14 | import tree_sitter_typescript 15 | 16 | from gitprobe.models.core import Function, CallRelationship 17 | from gitprobe.core.analysis_limits import AnalysisLimits, create_javascript_limits 18 | 19 | logger = logging.getLogger(__name__) 20 | 21 | 22 | class TreeSitterJSAnalyzer: 23 | """JavaScript analyzer using tree-sitter for proper AST parsing.""" 24 | 25 | def __init__(self, file_path: str, content: str, limits: Optional[AnalysisLimits] = None): 26 | self.file_path = Path(file_path) 27 | self.content = content 28 | self.functions: List[Function] = [] 29 | self.call_relationships: List[CallRelationship] = [] 30 | self.limits = limits or create_javascript_limits() 31 | 32 | try: 33 | language_capsule = tree_sitter_javascript.language() 34 | self.js_language = Language(language_capsule) 35 | self.parser = Parser(self.js_language) 36 | logger.debug( 37 | f"JavaScript parser initialized with language object: {type(self.js_language)}" 38 | ) 39 | 40 | test_code = "function test() { console.log('test'); }" 41 | test_tree = self.parser.parse(bytes(test_code, "utf8")) 42 | if test_tree is None or test_tree.root_node is None: 43 | raise RuntimeError("Parser setup test failed for JavaScript") 44 | logger.debug( 45 | f"JavaScript parser test successful - root node type: {test_tree.root_node.type}" 46 | ) 47 | 48 | except Exception as e: 49 | logger.error(f"Failed to initialize JavaScript parser: {e}") 50 | self.parser = None 51 | self.js_language = None 52 | 53 | logger.info(f"TreeSitterJSAnalyzer initialized for {file_path} with limits: {self.limits}") 54 | 55 | def analyze(self) -> None: 56 | """Analyze the JavaScript content and extract functions and call relationships.""" 57 | if not self.limits.start_new_file(): 58 | logger.info(f"Skipping {self.file_path} - global limits 
reached") 59 | return 60 | 61 | if self.parser is None: 62 | logger.warning(f"Skipping {self.file_path} - parser initialization failed") 63 | return 64 | 65 | try: 66 | tree = self.parser.parse(bytes(self.content, "utf8")) 67 | root_node = tree.root_node 68 | 69 | logger.info(f"Parsed AST with root node type: {root_node.type}") 70 | 71 | self._extract_functions(root_node) 72 | 73 | if not self.limits.should_stop(): 74 | self._extract_call_relationships(root_node) 75 | 76 | logger.info( 77 | f"Analysis complete: {len(self.functions)} functions, {len(self.call_relationships)} relationships, {self.limits.nodes_processed} nodes processed" 78 | ) 79 | 80 | except Exception as e: 81 | logger.error(f"Error analyzing JavaScript file {self.file_path}: {e}", exc_info=True) 82 | 83 | def _extract_functions(self, node) -> None: 84 | """Extract all function definitions from the AST.""" 85 | self._traverse_for_functions(node) 86 | self.functions.sort(key=lambda f: f.line_start) 87 | 88 | def _traverse_for_functions(self, node) -> None: 89 | """Recursively traverse AST nodes to find functions.""" 90 | 91 | if node.type == "function_declaration": 92 | func = self._extract_function_declaration(node) 93 | if func and self._should_include_function(func): 94 | if self.limits.can_add_function(): 95 | self.functions.append(func) 96 | if self.limits.add_function(): 97 | return 98 | else: 99 | return 100 | 101 | elif node.type == "export_statement": 102 | func = self._extract_exported_function(node) 103 | if func and self._should_include_function(func): 104 | if self.limits.can_add_function(): 105 | self.functions.append(func) 106 | if self.limits.add_function(): 107 | return 108 | else: 109 | return 110 | 111 | elif node.type == "lexical_declaration": 112 | func = self._extract_arrow_function_from_declaration(node) 113 | if func and self._should_include_function(func): 114 | if self.limits.can_add_function(): 115 | self.functions.append(func) 116 | if self.limits.add_function(): 117 | 
return 118 | else: 119 | return 120 | 121 | elif node.type == "method_definition": 122 | func = self._extract_method_definition(node) 123 | if func and self._should_include_function(func): 124 | if self.limits.can_add_function(): 125 | self.functions.append(func) 126 | if self.limits.add_function(): 127 | return 128 | else: 129 | return 130 | 131 | elif node.type == "pair": 132 | func = self._extract_object_method(node) 133 | if func and self._should_include_function(func): 134 | if self.limits.can_add_function(): 135 | self.functions.append(func) 136 | if self.limits.add_function(): 137 | return 138 | else: 139 | return 140 | 141 | elif node.type == "assignment_expression": 142 | func = self._extract_assignment_function(node) 143 | if func and self._should_include_function(func): 144 | if self.limits.can_add_function(): 145 | self.functions.append(func) 146 | if self.limits.add_function(): 147 | return 148 | else: 149 | return 150 | 151 | for child in node.children: 152 | if self.limits.should_stop(): 153 | break 154 | self._traverse_for_functions(child) 155 | 156 | def _extract_function_declaration(self, node) -> Optional[Function]: 157 | """Extract regular function declaration: function name() {}""" 158 | try: 159 | name_node = self._find_child_by_type(node, "identifier") 160 | if not name_node: 161 | return None 162 | 163 | func_name = self._get_node_text(name_node) 164 | line_start = node.start_point[0] + 1 165 | line_end = node.end_point[0] + 1 166 | parameters = self._extract_parameters(node) 167 | code_snippet = self._get_node_text(node) 168 | 169 | return Function( 170 | name=func_name, 171 | file_path=str(self.file_path), 172 | line_start=line_start, 173 | line_end=line_end, 174 | parameters=parameters, 175 | docstring=None, 176 | is_method=False, 177 | class_name=None, 178 | code_snippet=code_snippet, 179 | ) 180 | except Exception as e: 181 | logger.warning(f"Error extracting function declaration: {e}") 182 | return None 183 | 184 | def 
_extract_exported_function(self, node) -> Optional[Function]: 185 | """Extract export function or export default function""" 186 | try: 187 | func_decl = self._find_child_by_type(node, "function_declaration") 188 | if func_decl: 189 | func = self._extract_function_declaration(func_decl) 190 | if func: 191 | export_text = self._get_node_text(node) 192 | if "export default" in export_text and "function (" in export_text: 193 | func.name = "default" 194 | return func 195 | except Exception as e: 196 | logger.warning(f"Error extracting exported function: {e}") 197 | return None 198 | 199 | def _extract_arrow_function_from_declaration(self, node) -> Optional[Function]: 200 | """Extract arrow function or function expression from const/let/var declarations""" 201 | try: 202 | for child in node.children: 203 | if child.type == "variable_declarator": 204 | name_node = self._find_child_by_type(child, "identifier") 205 | func_node = self._find_child_by_type( 206 | child, "arrow_function" 207 | ) or self._find_child_by_type(child, "function_expression") 208 | 209 | if name_node and func_node: 210 | func_name = self._get_node_text(name_node) 211 | line_start = func_node.start_point[0] + 1 212 | line_end = func_node.end_point[0] + 1 213 | parameters = self._extract_parameters(func_node) 214 | code_snippet = self._get_node_text(child) 215 | 216 | return Function( 217 | name=func_name, 218 | file_path=str(self.file_path), 219 | line_start=line_start, 220 | line_end=line_end, 221 | parameters=parameters, 222 | docstring=None, 223 | is_method=False, 224 | class_name=None, 225 | code_snippet=code_snippet, 226 | ) 227 | except Exception as e: 228 | logger.warning(f"Error extracting function from declaration: {e}") 229 | return None 230 | 231 | def _extract_method_definition(self, node) -> Optional[Function]: 232 | """Extract class method definition""" 233 | try: 234 | property_name = self._find_child_by_type(node, "property_identifier") 235 | if not property_name: 236 | return None 
237 | 238 | func_name = self._get_node_text(property_name) 239 | line_start = node.start_point[0] + 1 240 | line_end = node.end_point[0] + 1 241 | parameters = self._extract_parameters(node) 242 | code_snippet = self._get_node_text(node) 243 | class_name = self._find_containing_class_name(node) 244 | 245 | return Function( 246 | name=func_name, 247 | file_path=str(self.file_path), 248 | line_start=line_start, 249 | line_end=line_end, 250 | parameters=parameters, 251 | docstring=None, 252 | is_method=True, 253 | class_name=class_name, 254 | code_snippet=code_snippet, 255 | ) 256 | except Exception as e: 257 | logger.warning(f"Error extracting method definition: {e}") 258 | return None 259 | 260 | def _should_include_function(self, func: Function) -> bool: 261 | """Determine if a function should be included in the analysis.""" 262 | excluded_names = { 263 | "constructor", 264 | } 265 | 266 | if func.name.lower() in excluded_names: 267 | logger.debug(f"Skipping excluded function: {func.name}") 268 | return False 269 | 270 | return True 271 | 272 | def _extract_parameters(self, node) -> List[str]: 273 | """Extract parameter names from a function node.""" 274 | parameters = [] 275 | params_node = self._find_child_by_type(node, "formal_parameters") 276 | if params_node: 277 | for child in params_node.children: 278 | if child.type == "identifier": 279 | parameters.append(self._get_node_text(child)) 280 | return parameters 281 | 282 | def _extract_call_relationships(self, node) -> None: 283 | """Extract function call relationships from the AST.""" 284 | func_ranges = {} 285 | for func in self.functions: 286 | for line in range(func.line_start, func.line_end + 1): 287 | func_ranges[line] = func 288 | 289 | self._traverse_for_calls(node, func_ranges) 290 | 291 | def _traverse_for_calls(self, node, func_ranges: dict) -> None: 292 | """Recursively find function calls.""" 293 | 294 | if node.type == "call_expression": 295 | call_info = self._extract_call_from_node(node, 
func_ranges) 296 | if call_info: 297 | if self.limits.can_add_relationship(): 298 | self.call_relationships.append(call_info) 299 | if self.limits.add_relationship(): 300 | return 301 | else: 302 | return 303 | 304 | for child in node.children: 305 | if self.limits.should_stop(): 306 | break 307 | self._traverse_for_calls(child, func_ranges) 308 | 309 | def _extract_call_from_node(self, node, func_ranges: dict) -> Optional[CallRelationship]: 310 | """Extract call relationship from a call_expression node.""" 311 | try: 312 | call_line = node.start_point[0] + 1 313 | caller_func = func_ranges.get(call_line) 314 | 315 | if not caller_func: 316 | return None 317 | 318 | callee_name = self._extract_callee_name(node) 319 | if not callee_name or self._is_builtin_function(callee_name): 320 | return None 321 | 322 | caller_id = f"{self.file_path}:{caller_func.name}" 323 | return CallRelationship( 324 | caller=caller_id, 325 | callee=callee_name, 326 | call_line=call_line, 327 | is_resolved=False, 328 | ) 329 | except Exception as e: 330 | logger.warning(f"Error extracting call relationship: {e}") 331 | return None 332 | 333 | def _extract_callee_name(self, call_node) -> Optional[str]: 334 | """Extract the name of the called function.""" 335 | if call_node.children: 336 | callee_node = call_node.children[0] 337 | 338 | if callee_node.type == "identifier": 339 | return self._get_node_text(callee_node) 340 | elif callee_node.type == "member_expression": 341 | property_node = self._find_child_by_type(callee_node, "property_identifier") 342 | if property_node: 343 | return self._get_node_text(property_node) 344 | return None 345 | 346 | def _is_builtin_function(self, name: str) -> bool: 347 | """Check if function name is a JavaScript built-in.""" 348 | builtins = { 349 | "setTimeout", 350 | "setInterval", 351 | "clearTimeout", 352 | "clearInterval", 353 | "parseInt", 354 | "parseFloat", 355 | "isNaN", 356 | "isFinite", 357 | "encodeURIComponent", 358 | "decodeURIComponent", 359 
| "eval", 360 | "require", 361 | } 362 | return name in builtins 363 | 364 | def _find_child_by_type(self, node, node_type: str): 365 | """Find first child node of specified type.""" 366 | for child in node.children: 367 | if child.type == node_type: 368 | return child 369 | return None 370 | 371 | def _get_node_text(self, node) -> str: 372 | """Get the text content of a node.""" 373 | start_byte = node.start_byte 374 | end_byte = node.end_byte 375 | return self.content.encode("utf8")[start_byte:end_byte].decode("utf8") 376 | 377 | def _find_containing_class_name(self, method_node) -> Optional[str]: 378 | """Find the name of the class containing a method.""" 379 | current = method_node.parent 380 | while current: 381 | if current.type == "class_declaration": 382 | name_node = self._find_child_by_type(current, "identifier") 383 | if name_node: 384 | return self._get_node_text(name_node) 385 | current = current.parent 386 | return None 387 | 388 | def _extract_object_method(self, node) -> Optional[Function]: 389 | """Extract method from object literal: { method() {} } or { method: function() {} }""" 390 | try: 391 | key_node = None 392 | value_node = None 393 | 394 | for child in node.children: 395 | if child.type in ["property_identifier", "identifier"]: 396 | key_node = child 397 | elif child.type in ["function_expression", "arrow_function"]: 398 | value_node = child 399 | elif child.type == "function_signature": 400 | value_node = node 401 | 402 | if key_node and value_node: 403 | func_name = self._get_node_text(key_node) 404 | line_start = value_node.start_point[0] + 1 405 | line_end = value_node.end_point[0] + 1 406 | 407 | if value_node == node: 408 | parameters = self._extract_parameters(node) 409 | else: 410 | parameters = self._extract_parameters(value_node) 411 | 412 | code_snippet = self._get_node_text(node) 413 | 414 | return Function( 415 | name=func_name, 416 | file_path=str(self.file_path), 417 | line_start=line_start, 418 | line_end=line_end, 419 | 
parameters=parameters, 420 | docstring=None, 421 | is_method=False, 422 | class_name=None, 423 | code_snippet=code_snippet, 424 | ) 425 | except Exception as e: 426 | logger.warning(f"Error extracting object method: {e}") 427 | return None 428 | 429 | def _extract_assignment_function(self, node) -> Optional[Function]: 430 | """Extract function from assignment: obj.method = function() {}""" 431 | try: 432 | left_node = None 433 | right_node = None 434 | 435 | for child in node.children: 436 | if child.type in ["member_expression", "identifier"]: 437 | left_node = child 438 | elif child.type in ["function_expression", "arrow_function"]: 439 | right_node = child 440 | 441 | if left_node and right_node: 442 | func_name = self._extract_assignment_name(left_node) 443 | if func_name: 444 | line_start = right_node.start_point[0] + 1 445 | line_end = right_node.end_point[0] + 1 446 | parameters = self._extract_parameters(right_node) 447 | code_snippet = self._get_node_text(node) 448 | 449 | return Function( 450 | name=func_name, 451 | file_path=str(self.file_path), 452 | line_start=line_start, 453 | line_end=line_end, 454 | parameters=parameters, 455 | docstring=None, 456 | is_method=False, 457 | class_name=None, 458 | code_snippet=code_snippet, 459 | ) 460 | except Exception as e: 461 | logger.warning(f"Error extracting assignment function: {e}") 462 | return None 463 | 464 | def _extract_assignment_name(self, node) -> Optional[str]: 465 | """Extract function name from assignment left side.""" 466 | if node.type == "identifier": 467 | return self._get_node_text(node) 468 | elif node.type == "member_expression": 469 | property_node = self._find_child_by_type(node, "property_identifier") 470 | if property_node: 471 | return self._get_node_text(property_node) 472 | return None 473 | 474 | 475 | class TreeSitterTSAnalyzer(TreeSitterJSAnalyzer): 476 | """TypeScript analyzer using tree-sitter.""" 477 | 478 | def __init__(self, file_path: str, content: str, limits: 
# Integration functions
def analyze_javascript_file_treesitter(
    file_path: str, content: str, limits: Optional[AnalysisLimits] = None
) -> tuple[List[Function], List[CallRelationship]]:
    """Analyze a JavaScript file using tree-sitter."""
    try:
        logger.info(f"Tree-sitter JS analysis for {file_path}")
        active_limits = create_javascript_limits() if limits is None else limits
        analyzer = TreeSitterJSAnalyzer(file_path, content, active_limits)
        analyzer.analyze()
        logger.info(
            f"Found {len(analyzer.functions)} functions, {len(analyzer.call_relationships)} calls, {active_limits.nodes_processed} nodes processed"
        )
        return analyzer.functions, analyzer.call_relationships
    except Exception as e:
        # Analysis is best-effort per file: report and return an empty result.
        logger.error(f"Error in tree-sitter JS analysis for {file_path}: {e}", exc_info=True)
        return [], []
tree-sitter JS analysis for {file_path}: {e}", exc_info=True) 526 | return [], [] 527 | 528 | 529 | def analyze_typescript_file_treesitter( 530 | file_path: str, content: str, limits: Optional[AnalysisLimits] = None 531 | ) -> tuple[List[Function], List[CallRelationship]]: 532 | """Analyze a TypeScript file using tree-sitter.""" 533 | try: 534 | logger.info(f"Tree-sitter TS analysis for {file_path}") 535 | if limits is None: 536 | limits = create_javascript_limits() 537 | analyzer = TreeSitterTSAnalyzer(file_path, content, limits) 538 | analyzer.analyze() 539 | logger.info( 540 | f"Found {len(analyzer.functions)} functions, {len(analyzer.call_relationships)} calls, {limits.nodes_processed} nodes processed" 541 | ) 542 | return analyzer.functions, analyzer.call_relationships 543 | except Exception as e: 544 | logger.error(f"Error in tree-sitter TS analysis for {file_path}: {e}", exc_info=True) 545 | return [], [] 546 | -------------------------------------------------------------------------------- /src/gitprobe/analysis/call_graph_analyzer.py: -------------------------------------------------------------------------------- 1 | """ 2 | Call Graph Analyzer 3 | 4 | Central orchestrator for multi-language call graph analysis. 5 | Coordinates language-specific analyzers to build comprehensive call graphs 6 | across different programming languages in a repository. 7 | """ 8 | 9 | from pathlib import Path 10 | from typing import Dict, List 11 | import logging 12 | from gitprobe.models.core import Function, CallRelationship 13 | from gitprobe.utils.patterns import CODE_EXTENSIONS 14 | from gitprobe.utils.security import safe_open_text 15 | 16 | logger = logging.getLogger(__name__) 17 | 18 | 19 | class CallGraphAnalyzer: 20 | """ 21 | Multi-language call graph analyzer. 22 | 23 | This analyzer orchestrates language-specific AST analyzers to build 24 | comprehensive call graphs across different programming languages. 
25 | 26 | Supported languages: 27 | - Python (fully supported with AST parsing) 28 | - JavaScript (tree-sitter AST parsing - high accuracy, supports exports/imports) 29 | - TypeScript (tree-sitter AST parsing - high accuracy, supports exports/imports) 30 | - C (fully supported with AST parsing) 31 | - C++ (fully supported with AST parsing) 32 | - Go (fully supported with tree-sitter AST parsing) 33 | - Rust (fully supported with tree-sitter AST parsing) 34 | 35 | Key improvements: 36 | - JavaScript/TypeScript now use tree-sitter for 99%+ accuracy 37 | - Properly handles export/import statements, arrow functions, class methods 38 | - Automatically filters out constructors and other non-useful functions 39 | - Better call relationship detection 40 | """ 41 | 42 | def __init__(self): 43 | """Initialize the call graph analyzer.""" 44 | self.functions: Dict[str, Function] = {} 45 | self.call_relationships: List[CallRelationship] = [] 46 | self.c_cpp_global_counter = None 47 | self.js_global_limits = None 48 | logger.info("CallGraphAnalyzer initialized.") 49 | 50 | def analyze_code_files(self, code_files: List[Dict], base_dir: str) -> Dict: 51 | """ 52 | Relationship-maximizing analysis: Analyze all files to build complete call graph, 53 | then return the most connected 800-1000 nodes for optimal frontend rendering. 54 | 55 | This approach: 56 | 1. Analyzes all code files (within limits) 57 | 2. Extracts all functions and relationships 58 | 3. Builds complete call graph 59 | 4. Ranks nodes by connectivity (degree centrality) 60 | 5. 
Returns top 800-1000 most connected nodes 61 | """ 62 | logger.info(f"Starting relationship-maximizing analysis of {len(code_files)} files") 63 | 64 | self.functions = {} 65 | self.call_relationships = [] 66 | 67 | from gitprobe.core.analysis_limits import reset_global_tracker 68 | 69 | reset_global_tracker() 70 | 71 | from gitprobe.core.analysis_limits import ( 72 | create_python_limits, 73 | create_javascript_limits, 74 | create_go_limits, 75 | create_rust_limits, 76 | create_c_cpp_limits, 77 | ) 78 | 79 | self.limits = { 80 | "python": create_python_limits(), 81 | "javascript": create_javascript_limits(), 82 | "typescript": create_javascript_limits(), 83 | "go": create_go_limits(), 84 | "rust": create_rust_limits(), 85 | "c": create_c_cpp_limits(), 86 | "cpp": create_c_cpp_limits(), 87 | } 88 | 89 | logger.info("Analyzing all code files to maximize relationships") 90 | files_analyzed = 0 91 | for file_info in code_files: 92 | from gitprobe.core.analysis_limits import get_global_tracker 93 | 94 | global_tracker = get_global_tracker() 95 | if global_tracker.should_stop(): 96 | logger.info(f"Global limits reached after {files_analyzed} files") 97 | break 98 | 99 | logger.debug(f"Analyzing: {file_info['path']}") 100 | self._analyze_code_file(base_dir, file_info) 101 | files_analyzed += 1 102 | 103 | if files_analyzed % 20 == 0: 104 | logger.info( 105 | f"Progress: {files_analyzed} files, {len(self.functions)} functions, {len(self.call_relationships)} relationships" 106 | ) 107 | 108 | logger.info( 109 | f"Analysis complete: {files_analyzed} files analyzed, {len(self.functions)} functions, {len(self.call_relationships)} relationships" 110 | ) 111 | 112 | logger.info("Resolving call relationships") 113 | self._resolve_call_relationships() 114 | self._deduplicate_relationships() 115 | 116 | logger.info("Selecting most connected nodes for frontend") 117 | self._select_most_connected_nodes(target_count=900) 118 | 119 | logger.info("Generating visualization data") 120 | 
viz_data = self._generate_visualization_data() 121 | 122 | return { 123 | "call_graph": { 124 | "total_functions": len(self.functions), 125 | "total_calls": len(self.call_relationships), 126 | "languages_found": list(set(f.get("language") for f in code_files)), 127 | "files_analyzed": files_analyzed, 128 | "analysis_approach": "relationship_maximizing", 129 | }, 130 | "functions": [func.dict() for func in self.functions.values()], 131 | "relationships": [rel.dict() for rel in self.call_relationships], 132 | "visualization": viz_data, 133 | } 134 | 135 | def extract_code_files(self, file_tree: Dict) -> List[Dict]: 136 | """ 137 | Extract code files from file tree structure. 138 | 139 | Filters files based on supported extensions and excludes test/config files. 140 | 141 | Args: 142 | file_tree: Nested dictionary representing file structure 143 | 144 | Returns: 145 | List of code file information dictionaries 146 | """ 147 | code_files = [] 148 | 149 | def traverse(tree): 150 | if tree["type"] == "file": 151 | ext = tree.get("extension", "").lower() 152 | if ext in CODE_EXTENSIONS: 153 | name = tree["name"].lower() 154 | if not any(skip in name for skip in ["test", "spec", "config", "setup"]): 155 | code_files.append( 156 | { 157 | "path": tree["path"], 158 | "name": tree["name"], 159 | "extension": ext, 160 | "language": CODE_EXTENSIONS[ext], 161 | } 162 | ) 163 | elif tree["type"] == "directory" and tree.get("children"): 164 | for child in tree["children"]: 165 | traverse(child) 166 | 167 | traverse(file_tree) 168 | return code_files 169 | 170 | def _analyze_code_file(self, repo_dir: str, file_info: Dict): 171 | """ 172 | Analyze a single code file based on its language. 173 | 174 | Routes to appropriate language-specific analyzer. 
175 | 176 | Args: 177 | repo_dir: Repository directory path 178 | file_info: File information dictionary 179 | """ 180 | # file_path = Path(repo_dir) / file_info["path"] 181 | 182 | # logger.debug(f"Reading content of {file_path}") 183 | # try: 184 | # with open(file_path, "r", encoding="utf-8", errors="ignore") as f: 185 | # content = f.read() 186 | base = Path(repo_dir) 187 | file_path = base / file_info["path"] 188 | logger.debug(f"Reading content of {file_path}") 189 | try: 190 | content = safe_open_text(base, file_path) 191 | language = file_info["language"] 192 | logger.info(f"Analyzing {language} file: {file_path}") 193 | if language == "python": 194 | self._analyze_python_file(file_path, content) 195 | elif language == "javascript": 196 | self._analyze_javascript_file(file_path, content) 197 | elif language == "typescript": 198 | self._analyze_typescript_file(file_path, content) 199 | elif language == "c": 200 | self._analyze_c_file(file_path, content) 201 | elif language == "cpp": 202 | self._analyze_cpp_file(file_path, content) 203 | elif language == "go": 204 | self._analyze_go_file(file_path, content) 205 | elif language == "rust": 206 | self._analyze_rust_file(file_path, content) 207 | else: 208 | logger.warning( 209 | f"Unsupported language for call graph analysis: {language} for file {file_path}" 210 | ) 211 | 212 | except Exception as e: 213 | logger.error(f"⚠️ Error analyzing {file_path}: {str(e)}") 214 | 215 | def _analyze_python_file(self, file_path: str, content: str): 216 | """ 217 | Analyze Python file using Python AST analyzer. 
218 | 219 | Args: 220 | file_path: Relative path to the Python file 221 | content: File content string 222 | """ 223 | from gitprobe.analyzers.python import analyze_python_file 224 | 225 | try: 226 | functions, relationships = analyze_python_file( 227 | file_path, content, self.limits["python"] 228 | ) 229 | logger.info( 230 | f"Found {len(functions)} functions and {len(relationships)} relationships in {file_path}" 231 | ) 232 | 233 | for func in functions: 234 | func_id = f"{file_path}:{func.name}" 235 | self.functions[func_id] = func 236 | 237 | self.call_relationships.extend(relationships) 238 | except Exception as e: 239 | logger.error(f"Failed to analyze Python file {file_path}: {e}", exc_info=True) 240 | 241 | def _analyze_javascript_file(self, file_path: str, content: str): 242 | """ 243 | Analyze JavaScript file using tree-sitter based AST analyzer with global limits. 244 | 245 | Args: 246 | file_path: Relative path to the JavaScript file 247 | content: File content string 248 | """ 249 | try: 250 | logger.info(f"Starting tree-sitter JavaScript analysis for {file_path}") 251 | 252 | from gitprobe.analyzers.javascript import analyze_javascript_file_treesitter 253 | 254 | functions, relationships = analyze_javascript_file_treesitter( 255 | file_path, content, self.limits["javascript"] 256 | ) 257 | 258 | logger.info( 259 | f"Tree-sitter JavaScript analysis completed for {file_path}: {len(functions)} functions, {len(relationships)} relationships" 260 | ) 261 | 262 | for func in functions: 263 | func_id = f"{file_path}:{func.name}" 264 | self.functions[func_id] = func 265 | 266 | self.call_relationships.extend(relationships) 267 | 268 | except Exception as e: 269 | logger.error(f"Failed to analyze JavaScript file {file_path}: {e}", exc_info=True) 270 | 271 | def _analyze_typescript_file(self, file_path: str, content: str): 272 | """ 273 | Analyze TypeScript file using tree-sitter based AST analyzer with global limits. 
274 | 275 | Args: 276 | file_path: Relative path to the TypeScript file 277 | content: File content string 278 | """ 279 | try: 280 | logger.info(f"Starting tree-sitter TypeScript analysis for {file_path}") 281 | 282 | from gitprobe.analyzers.javascript import analyze_typescript_file_treesitter 283 | 284 | functions, relationships = analyze_typescript_file_treesitter( 285 | file_path, content, self.limits["typescript"] 286 | ) 287 | 288 | logger.info( 289 | f"Tree-sitter TypeScript analysis completed for {file_path}: {len(functions)} functions, {len(relationships)} relationships" 290 | ) 291 | 292 | for func in functions: 293 | func_id = f"{file_path}:{func.name}" 294 | self.functions[func_id] = func 295 | 296 | self.call_relationships.extend(relationships) 297 | 298 | except Exception as e: 299 | logger.error(f"Failed to analyze TypeScript file {file_path}: {e}", exc_info=True) 300 | 301 | def _analyze_c_file(self, file_path: str, content: str): 302 | """ 303 | Analyze C file using tree-sitter based analyzer. 304 | 305 | Args: 306 | file_path: Relative path to the C file 307 | content: File content string 308 | """ 309 | from gitprobe.analyzers.c_cpp import analyze_c_file_treesitter 310 | 311 | functions, relationships = analyze_c_file_treesitter(file_path, content, self.limits["c"]) 312 | 313 | for func in functions: 314 | func_id = f"{file_path}:{func.name}" 315 | self.functions[func_id] = func 316 | 317 | self.call_relationships.extend(relationships) 318 | 319 | def _analyze_cpp_file(self, file_path: str, content: str): 320 | """ 321 | Analyze C++ file using tree-sitter based analyzer. 
322 | 323 | Args: 324 | file_path: Relative path to the C++ file 325 | content: File content string 326 | """ 327 | from gitprobe.analyzers.c_cpp import analyze_cpp_file_treesitter 328 | 329 | functions, relationships = analyze_cpp_file_treesitter( 330 | file_path, content, self.limits["cpp"] 331 | ) 332 | 333 | for func in functions: 334 | func_id = f"{file_path}:{func.name}" 335 | self.functions[func_id] = func 336 | 337 | self.call_relationships.extend(relationships) 338 | 339 | def _analyze_go_file(self, file_path: str, content: str): 340 | """ 341 | Analyze Go file using Go AST analyzer. 342 | 343 | Args: 344 | file_path: Relative path to the Go file 345 | content: File content string 346 | """ 347 | from gitprobe.analyzers.go import analyze_go_file_treesitter 348 | 349 | try: 350 | functions, relationships = analyze_go_file_treesitter( 351 | file_path, content, self.limits["go"] 352 | ) 353 | logger.info( 354 | f"Found {len(functions)} functions and {len(relationships)} relationships in {file_path}" 355 | ) 356 | 357 | for func in functions: 358 | func_id = f"{file_path}:{func.name}" 359 | self.functions[func_id] = func 360 | 361 | self.call_relationships.extend(relationships) 362 | except Exception as e: 363 | logger.error(f"Failed to analyze Go file {file_path}: {e}", exc_info=True) 364 | 365 | def _analyze_rust_file(self, file_path: str, content: str): 366 | """ 367 | Analyze Rust file using Rust AST analyzer. 
368 | 369 | Args: 370 | file_path: Relative path to the Rust file 371 | content: File content string 372 | """ 373 | from gitprobe.analyzers.rust import analyze_rust_file_treesitter 374 | 375 | try: 376 | functions, relationships = analyze_rust_file_treesitter( 377 | file_path, content, self.limits["rust"] 378 | ) 379 | logger.info( 380 | f"Found {len(functions)} functions and {len(relationships)} relationships in {file_path}" 381 | ) 382 | 383 | for func in functions: 384 | func_id = f"{file_path}:{func.name}" 385 | self.functions[func_id] = func 386 | 387 | self.call_relationships.extend(relationships) 388 | except Exception as e: 389 | logger.error(f"Failed to analyze Rust file {file_path}: {e}", exc_info=True) 390 | 391 | def _resolve_call_relationships(self): 392 | """ 393 | Resolve function call relationships across all languages. 394 | 395 | Attempts to match function calls to actual function definitions, 396 | handling cross-language calls where possible. 397 | """ 398 | logger.info("Building function lookup table for resolving relationships.") 399 | func_lookup = {} 400 | for func_id, func_info in self.functions.items(): 401 | func_lookup[func_info.name] = func_id 402 | 403 | resolved_count = 0 404 | for relationship in self.call_relationships: 405 | callee_name = relationship.callee 406 | 407 | if callee_name in func_lookup: 408 | relationship.callee = func_lookup[callee_name] 409 | relationship.is_resolved = True 410 | resolved_count += 1 411 | elif "." in callee_name: 412 | method_name = callee_name.split(".")[-1] 413 | if method_name in func_lookup: 414 | relationship.callee = func_lookup[method_name] 415 | relationship.is_resolved = True 416 | 417 | logger.info(f"Resolved {resolved_count}/{len(self.call_relationships)} call relationships.") 418 | 419 | def _deduplicate_relationships(self): 420 | """ 421 | Deduplicate call relationships based on caller-callee pairs. 422 | 423 | Removes duplicate relationships while preserving the first occurrence. 
424 | This helps eliminate noise from multiple calls to the same function. 425 | """ 426 | seen = set() 427 | unique_relationships = [] 428 | 429 | for rel in self.call_relationships: 430 | key = (rel.caller, rel.callee) 431 | if key not in seen: 432 | seen.add(key) 433 | unique_relationships.append(rel) 434 | 435 | logger.debug( 436 | f"Removed {len(self.call_relationships) - len(unique_relationships)} duplicate relationships." 437 | ) 438 | self.call_relationships = unique_relationships 439 | 440 | def _generate_visualization_data(self) -> Dict: 441 | """ 442 | Generate visualization data for graph rendering. 443 | 444 | Creates Cytoscape.js compatible graph data with nodes and edges. 445 | 446 | Returns: 447 | Dict: Visualization data with cytoscape elements and summary 448 | """ 449 | logger.info("Generating Cytoscape-compatible visualization data.") 450 | cytoscape_elements = [] 451 | 452 | logger.debug(f"Adding {len(self.functions)} function nodes.") 453 | for func_id, func_info in self.functions.items(): 454 | node_classes = [] 455 | if func_info.is_method: 456 | node_classes.append("node-method") 457 | else: 458 | node_classes.append("node-function") 459 | 460 | file_ext = Path(func_info.file_path).suffix.lower() 461 | if file_ext == ".py": 462 | node_classes.append("lang-python") 463 | elif file_ext == ".js": 464 | node_classes.append("lang-javascript") 465 | elif file_ext == ".ts": 466 | node_classes.append("lang-typescript") 467 | elif file_ext in [".c", ".h"]: 468 | node_classes.append("lang-c") 469 | elif file_ext in [".cpp", ".cc", ".cxx", ".hpp", ".hxx"]: 470 | node_classes.append("lang-cpp") 471 | 472 | cytoscape_elements.append( 473 | { 474 | "data": { 475 | "id": func_id, 476 | "label": func_info.name, 477 | "file": func_info.file_path, 478 | "type": "method" if func_info.is_method else "function", 479 | "language": CODE_EXTENSIONS.get(file_ext, "unknown"), 480 | }, 481 | "classes": " ".join(node_classes), 482 | } 483 | ) 484 | 485 | resolved_rels 
= [r for r in self.call_relationships if r.is_resolved] 486 | logger.debug(f"Adding {len(resolved_rels)} relationship edges.") 487 | for rel in resolved_rels: 488 | cytoscape_elements.append( 489 | { 490 | "data": { 491 | "id": f"{rel.caller}->{rel.callee}", 492 | "source": rel.caller, 493 | "target": rel.callee, 494 | "line": rel.call_line, 495 | }, 496 | "classes": "edge-call", 497 | } 498 | ) 499 | 500 | summary = { 501 | "total_nodes": len(self.functions), 502 | "total_edges": len(resolved_rels), 503 | "unresolved_calls": len(self.call_relationships) - len(resolved_rels), 504 | } 505 | logger.info(f"Visualization data generated: {summary}") 506 | 507 | return { 508 | "cytoscape": {"elements": cytoscape_elements}, 509 | "summary": summary, 510 | } 511 | 512 | def generate_llm_format(self) -> Dict: 513 | """Generate clean format optimized for LLM consumption.""" 514 | return { 515 | "functions": [ 516 | { 517 | "name": func.name, 518 | "file": Path(func.file_path).name, 519 | "purpose": (func.docstring.split("\n")[0] if func.docstring else None), 520 | "parameters": func.parameters, 521 | "is_recursive": func.name 522 | in [ 523 | rel.callee 524 | for rel in self.call_relationships 525 | if rel.caller.endswith(func.name) 526 | ], 527 | } 528 | for func in self.functions.values() 529 | ], 530 | "relationships": { 531 | func.name: { 532 | "calls": [ 533 | rel.callee.split(":")[-1] 534 | for rel in self.call_relationships 535 | if rel.caller.endswith(func.name) and rel.is_resolved 536 | ], 537 | "called_by": [ 538 | rel.caller.split(":")[-1] 539 | for rel in self.call_relationships 540 | if rel.callee.endswith(func.name) and rel.is_resolved 541 | ], 542 | } 543 | for func in self.functions.values() 544 | }, 545 | } 546 | 547 | def _select_most_connected_nodes(self, target_count: int): 548 | """ 549 | Select the most connected nodes from the call graph. 
550 | 551 | Args: 552 | target_count: The number of nodes to select 553 | """ 554 | if len(self.functions) <= target_count: 555 | logger.info( 556 | f"Have {len(self.functions)} functions, target is {target_count} - keeping all" 557 | ) 558 | return 559 | 560 | if not self.call_relationships: 561 | logger.warning("No call relationships found - keeping all functions by name") 562 | func_ids = list(self.functions.keys())[:target_count] 563 | self.functions = {fid: func for fid, func in self.functions.items() if fid in func_ids} 564 | return 565 | 566 | graph = {} 567 | for rel in self.call_relationships: 568 | if rel.caller in self.functions: 569 | if rel.caller not in graph: 570 | graph[rel.caller] = set() 571 | if rel.callee in self.functions: 572 | if rel.callee not in graph: 573 | graph[rel.callee] = set() 574 | 575 | if rel.caller in graph and rel.callee in graph: 576 | graph[rel.caller].add(rel.callee) 577 | graph[rel.callee].add(rel.caller) 578 | 579 | degree_centrality = {} 580 | for func_id in self.functions.keys(): 581 | degree_centrality[func_id] = len(graph.get(func_id, set())) 582 | 583 | sorted_func_ids = sorted(degree_centrality, key=degree_centrality.get, reverse=True) 584 | 585 | selected_func_ids = sorted_func_ids[:target_count] 586 | 587 | original_func_count = len(self.functions) 588 | self.functions = { 589 | fid: func for fid, func in self.functions.items() if fid in selected_func_ids 590 | } 591 | 592 | original_rel_count = len(self.call_relationships) 593 | self.call_relationships = [ 594 | rel 595 | for rel in self.call_relationships 596 | if rel.caller in selected_func_ids and rel.callee in selected_func_ids 597 | ] 598 | 599 | logger.info( 600 | f"Node selection: {original_func_count} -> {len(self.functions)} functions, " 601 | f"{original_rel_count} -> {len(self.call_relationships)} relationships" 602 | ) 603 | logger.info(f"Kept {len(selected_func_ids)} most connected nodes (target: {target_count})") 604 | 
--------------------------------------------------------------------------------