├── data ├── templates │ └── cpp │ │ ├── general.template │ │ ├── template.template │ │ └── Copy function using source size.template └── queries │ └── cpp │ ├── issues │ ├── qlpack.yml │ └── Copy function using source size.ql │ └── tools │ ├── qlpack.yml │ ├── Macros.ql │ ├── GlobalVars.ql │ ├── FunctionTree.ql │ └── Classes.ql ├── requirements.txt ├── images └── vulnhalla_logo.png ├── pytest.ini ├── tests ├── conftest.py └── test_smoke.py ├── examples ├── ui_example.py └── example.py ├── SECURITY.md ├── src ├── utils │ ├── exceptions │ │ ├── llm.py │ │ ├── codeql.py │ │ ├── __init__.py │ │ └── base.py │ ├── config.py │ ├── common_functions.py │ ├── llm_config.py │ ├── logger.py │ └── config_validator.py ├── ui │ ├── components │ │ ├── issues_list_panel.py │ │ ├── details_panel.py │ │ ├── controls_bar.py │ │ └── splitter_divider.py │ ├── models.py │ ├── issue_parser.py │ └── results_loader.py ├── pipeline.py ├── codeql │ └── run_codeql_queries.py └── vulnhalla.py ├── .gitignore ├── .env.example ├── CONTRIBUTING.md ├── CODE_OF_CONDUCT.md ├── setup.py ├── LICENSE.txt ├── README.md └── NOTICES.txt /data/templates/cpp/general.template: -------------------------------------------------------------------------------- 1 | 1. Is this a real security issue that you can exploit? 
-------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | requests 2 | python-dotenv 3 | litellm 4 | PyYAML 5 | textual 6 | pySmartDL 7 | pytest -------------------------------------------------------------------------------- /images/vulnhalla_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cyberark/Vulnhalla/HEAD/images/vulnhalla_logo.png -------------------------------------------------------------------------------- /data/queries/cpp/issues/qlpack.yml: -------------------------------------------------------------------------------- 1 | name: vulnhalla-cpp 2 | version: 0.0.0 3 | dependencies: 4 | codeql/cpp-all: "*" 5 | -------------------------------------------------------------------------------- /data/queries/cpp/tools/qlpack.yml: -------------------------------------------------------------------------------- 1 | name: vulnhalla-cpp 2 | version: 0.0.0 3 | dependencies: 4 | codeql/cpp-all: "*" 5 | -------------------------------------------------------------------------------- /data/queries/cpp/tools/Macros.ql: -------------------------------------------------------------------------------- 1 | import cpp 2 | 3 | from Macro m 4 | select m.getName() as macro_name, "#define " + m.getHead() + " " + m.getBody() as body 5 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | testpaths = tests 3 | python_files = test_*.py 4 | python_classes = Test* 5 | python_functions = test_* 6 | addopts = 7 | -v 8 | --strict-markers 9 | --tb=short -------------------------------------------------------------------------------- /data/templates/cpp/template.template: 
-------------------------------------------------------------------------------- 1 | ### Issue Overview 2 | Name: {name} 3 | Description: {description} 4 | Message: {message} 5 | Location: {location} 6 | 7 | ### Hints for Validation 8 | {hints} 9 | 10 | ### Code 11 | {code} -------------------------------------------------------------------------------- /data/queries/cpp/tools/GlobalVars.ql: -------------------------------------------------------------------------------- 1 | import cpp 2 | 3 | from GlobalOrNamespaceVariable g 4 | select g.getName() as global_var_name, g.getLocation().getFile() as file, g.getLocation().getStartLine() as start_line, g.getLocation().getEndLine() as end_line 5 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | """Shared pytest fixtures and configuration.""" 2 | 3 | import sys 4 | from pathlib import Path 5 | 6 | # Add project root to Python path for imports 7 | PROJECT_ROOT = Path(__file__).parent.parent 8 | if str(PROJECT_ROOT) not in sys.path: 9 | sys.path.insert(0, str(PROJECT_ROOT)) 10 | -------------------------------------------------------------------------------- /examples/ui_example.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Entry point for running the Vulnhalla UI. 
4 | 5 | Usage: 6 | python examples/ui_example.py 7 | """ 8 | 9 | import sys 10 | from pathlib import Path 11 | 12 | # Add project root to Python path 13 | PROJECT_ROOT = Path(__file__).parent.parent 14 | sys.path.insert(0, str(PROJECT_ROOT)) 15 | 16 | from src.ui.ui_app import main 17 | 18 | if __name__ == "__main__": 19 | main() 20 | 21 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | # Security Policies and Procedures 2 | 3 | CyberArk takes product security very seriously. If you believe you have found a vulnerability in one of our products, we ask that you follow responsible disclosure guidelines and contact product_security@cyberark.com and work with us toward a quick resolution to protect our customers. 4 | 5 | Refer to [CyberArk's Security Vulnerability Policy](https://www.cyberark.com/cyberark-security-vulnerability-policy.pdf) for more details. -------------------------------------------------------------------------------- /src/utils/exceptions/llm.py: -------------------------------------------------------------------------------- 1 | """LLM-related exceptions.""" 2 | 3 | from src.utils.exceptions.base import VulnhallaError 4 | 5 | 6 | class LLMError(VulnhallaError): 7 | """Base class for all LLM-related errors.""" 8 | pass 9 | 10 | 11 | class LLMConfigError(LLMError): 12 | """LLM configuration errors (missing keys, invalid provider, etc.).""" 13 | pass 14 | 15 | 16 | class LLMApiError(LLMError): 17 | """LLM API call failures (timeouts, rate limits, 5xx, etc.).""" 18 | pass 19 | 20 | 21 | -------------------------------------------------------------------------------- /tests/test_smoke.py: -------------------------------------------------------------------------------- 1 | """Smoke tests to verify basic test infrastructure works.""" 2 | 3 | def test_pytest_runs(): 4 | """Test that pytest can discover and run tests.""" 5 | 
assert True 6 | 7 | 8 | def test_can_import_src(): 9 | """Test that we can import the main project modules.""" 10 | 11 | from src.utils.common_functions import read_file 12 | from src.vulnhalla import IssueAnalyzer 13 | from src.llm.llm_analyzer import LLMAnalyzer 14 | assert read_file and IssueAnalyzer and LLMAnalyzer 15 | -------------------------------------------------------------------------------- /src/utils/exceptions/codeql.py: -------------------------------------------------------------------------------- 1 | """CodeQL-related exceptions.""" 2 | 3 | from src.utils.exceptions.base import VulnhallaError 4 | 5 | 6 | class CodeQLError(VulnhallaError): 7 | """Base class for all CodeQL-related errors.""" 8 | pass 9 | 10 | 11 | class CodeQLConfigError(CodeQLError): 12 | """CodeQL configuration errors (path, executable, packs, etc.).""" 13 | pass 14 | 15 | 16 | class CodeQLExecutionError(CodeQLError): 17 | """CodeQL query/database execution/decoding errors.""" 18 | pass 19 | 20 | 21 | -------------------------------------------------------------------------------- /src/utils/exceptions/__init__.py: -------------------------------------------------------------------------------- 1 | """Vulnhalla exception hierarchy.""" 2 | 3 | from src.utils.exceptions.base import VulnhallaError 4 | from src.utils.exceptions.codeql import ( 5 | CodeQLError, 6 | CodeQLConfigError, 7 | CodeQLExecutionError, 8 | ) 9 | from src.utils.exceptions.llm import ( 10 | LLMError, 11 | LLMConfigError, 12 | LLMApiError, 13 | ) 14 | 15 | __all__ = [ 16 | "VulnhallaError", 17 | "CodeQLError", 18 | "CodeQLConfigError", 19 | "CodeQLExecutionError", 20 | "LLMError", 21 | "LLMConfigError", 22 | "LLMApiError", 23 | ] 24 | 25 | 26 | -------------------------------------------------------------------------------- /data/queries/cpp/tools/FunctionTree.ql: -------------------------------------------------------------------------------- 1 | import cpp 2 | 3 | string get_caller(Function c){ 4 | if 
exists(FunctionCall d | c.getACallToThisFunction() = d) 5 | then result = c.getACallToThisFunction().getEnclosingFunction().getLocation().getFile() + ":" + c.getACallToThisFunction().getEnclosingFunction().getLocation().getStartLine() 6 | else result = "" 7 | } 8 | 9 | 10 | from Function f 11 | select f.getName() as function_name, f.getLocation().getFile() as file, f.getLocation().getStartLine() as start_line, file + ":" + start_line as function_id, f.getBlock().getLocation().getEndLine() as end_line, get_caller(f) as caller_id 12 | 13 | -------------------------------------------------------------------------------- /src/utils/exceptions/base.py: -------------------------------------------------------------------------------- 1 | """Base exception class for all Vulnhalla-specific errors.""" 2 | 3 | 4 | class VulnhallaError(Exception): 5 | """ 6 | Base exception for all Vulnhalla-specific errors. 7 | 8 | Args: 9 | message: Human-readable error message. 10 | cause: Optional underlying exception that caused this error. 11 | """ 12 | def __init__(self, message: str, cause: Exception | None = None) -> None: 13 | super().__init__(message) 14 | self.cause = cause 15 | if cause is not None: 16 | # Enables chained traceback: VulnhallaError <- cause 17 | self.__cause__ = cause 18 | 19 | -------------------------------------------------------------------------------- /data/templates/cpp/Copy function using source size.template: -------------------------------------------------------------------------------- 1 | This static analysis checks if we are using source size in copy functions. This analysis does not check if there is a correlation between source and destination. This is your job! 2 | 1. What size are we using in the copy? Are we really using the source size and not the destination? 3 | 2. Does the source buffer point inside the destination buffer? 4 | 3. What is the size of the source buffer and what is the size of the destination? 
Is destination size derived from source? Answer this question only if source is not pointer inside dest! 5 | 4. Can the source buffer be bigger than the destination? (yes/no) 6 | Use the tools to get all data needed. 7 | If source is smaller than destination, there is no issue of buffer overflow! 8 | Only if source is bigger than destination it's a problem of buffer overflow! -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Python build artifacts 2 | __pycache__/ 3 | *.pyc 4 | *.pyo 5 | *.pyd 6 | *.py[cod] 7 | *$py.class 8 | 9 | # Virtual environments 10 | venv/ 11 | env/ 12 | ENV/ 13 | .venv/ 14 | 15 | # Environment variables (contains API keys) 16 | .env 17 | .env.local 18 | .env.*.local 19 | 20 | # Data directories (runtime-generated) 21 | output/databases/ 22 | output/results/ 23 | output/zip_dbs/ 24 | data/queries/**/*.qlx 25 | data/queries/**/codeql-pack.lock.yml 26 | 27 | # IDE / OS files 28 | .vscode/ 29 | .idea/ 30 | .DS_Store 31 | Thumbs.db 32 | *.swp 33 | *.swo 34 | *~ 35 | 36 | # Temp / log files 37 | *.log 38 | *.tmp 39 | *.bak 40 | 41 | # Distribution / packaging 42 | .Python 43 | build/ 44 | develop-eggs/ 45 | dist/ 46 | downloads/ 47 | eggs/ 48 | .eggs/ 49 | lib/ 50 | lib64/ 51 | parts/ 52 | sdist/ 53 | var/ 54 | wheels/ 55 | *.egg-info/ 56 | .installed.cfg 57 | *.egg 58 | MANIFEST 59 | 60 | # Jupyter Notebook 61 | .ipynb_checkpoints 62 | 63 | # pytest 64 | .pytest_cache/ 65 | .coverage 66 | htmlcov/ 67 | 68 | # mypy 69 | .mypy_cache/ 70 | .dmypy.json 71 | dmypy.json 72 | -------------------------------------------------------------------------------- /src/ui/components/issues_list_panel.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Issues list panel component for Vulnhalla UI. 
4 | """ 5 | 6 | from textual.containers import Container, Vertical 7 | from textual.widgets import Label, DataTable, Static, Input 8 | from textual.app import ComposeResult 9 | 10 | 11 | class IssuesListPanel(Container): 12 | """ 13 | Left-hand panel showing list of issues in a DataTable. 14 | """ 15 | 16 | def compose(self) -> ComposeResult: 17 | """Compose the issues list panel layout. 18 | 19 | Builds the left-hand panel that displays the table of issues, 20 | including columns such as ID, repository, file and decisions, 21 | along with the search input. 22 | """ 23 | with Vertical(): 24 | yield Label("Issues", classes="panel-title") 25 | table = DataTable(id="issues-table") 26 | table.cursor_type = "row" 27 | yield table 28 | yield Static("", id="issues-count") 29 | yield Input(placeholder="Search by issue name, file, repo, LLM decision, or manual decision...", id="issues-search") 30 | 31 | -------------------------------------------------------------------------------- /src/utils/config.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Application Configuration Module 4 | 5 | Loads general application configuration from .env file or environment variables. 6 | Handles CodeQL path, GitHub token, and other non-LLM settings. 7 | """ 8 | 9 | import os 10 | from typing import Optional 11 | from dotenv import load_dotenv 12 | 13 | # Load .env file if it exists, otherwise try .env.example 14 | if os.path.exists(".env"): 15 | load_dotenv(".env") 16 | elif os.path.exists(".env.example"): 17 | load_dotenv(".env.example") 18 | 19 | 20 | def get_codeql_path() -> str: 21 | """ 22 | Get CodeQL executable path from .env file or environment variables. 23 | 24 | Returns: 25 | Path to CodeQL executable. Defaults to "codeql" if not set. 
26 | """ 27 | path = os.getenv("CODEQL_PATH", "codeql") 28 | # Strip quotes and Python raw string prefix if present 29 | if path and path != "codeql": 30 | path = path.strip('"').strip("'") 31 | # Remove 'r' prefix if present (Python raw string syntax, not valid in .env) 32 | if path.startswith("r\"") or path.startswith("r'"): 33 | path = path[2:] 34 | path = path.strip('"').strip("'") 35 | return path 36 | 37 | 38 | def get_github_token() -> Optional[str]: 39 | """ 40 | Get GitHub API token from .env file or environment variables. 41 | 42 | Returns: 43 | GitHub token string if set, None otherwise. 44 | """ 45 | return os.getenv("GITHUB_TOKEN") 46 | 47 | -------------------------------------------------------------------------------- /src/ui/components/details_panel.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Details panel component for Vulnhalla UI. 4 | """ 5 | 6 | from textual.containers import Container, Vertical, ScrollableContainer 7 | from textual.widgets import Static, Label, Select 8 | from textual.app import ComposeResult 9 | 10 | 11 | class DetailsPanel(Container): 12 | """ 13 | Right panel showing issue details with scrollable content and manual decision selector. 14 | """ 15 | 16 | def compose(self) -> ComposeResult: 17 | """Compose the issue details panel layout. 18 | 19 | Builds the right-hand panel that shows LLM decisions, metadata, 20 | code snippets and the manual decision selector for the selected issue. 
21 | """ 22 | with Vertical(): 23 | # Scrollable content area 24 | scrollable_content = ScrollableContainer(id="details-scrollable") 25 | with scrollable_content: 26 | yield Static("Select an issue to view details", id="details-content", markup=True) 27 | # Manual decision controls 28 | with Vertical(id="manual-decision-container"): 29 | yield Label("Enter your manual decision:", id="manual-decision-label") 30 | yield Select( 31 | [ 32 | ("Not Set", None), 33 | ("True Positive", "True Positive"), 34 | ("False Positive", "False Positive"), 35 | ("Uncertain", "Uncertain"), 36 | ], 37 | value=None, 38 | id="manual-decision-select", 39 | prompt="Not Set" 40 | ) 41 | 42 | -------------------------------------------------------------------------------- /src/ui/components/controls_bar.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Controls bar component for Vulnhalla UI. 4 | """ 5 | 6 | from textual.containers import Container, Horizontal, Vertical 7 | from textual.widgets import Static, Label, Select, Button 8 | from textual.app import ComposeResult 9 | 10 | 11 | class ControlsBar(Container): 12 | """ 13 | Bottom horizontal bar with controls, filters, and actions. 14 | """ 15 | 16 | def compose(self) -> ComposeResult: 17 | """Compose the controls bar layout. 18 | 19 | Creates the bottom bar with language information, filters, 20 | action buttons and keyboard shortcut help text. 
21 | """ 22 | with Vertical(): 23 | # Language label only 24 | with Horizontal(): 25 | yield Static("Language: C (only language currently supported)", classes="control-label") 26 | # Filter and buttons 27 | with Horizontal(): 28 | yield Label("Filter by llm decision:", classes="control-label") 29 | yield Select( 30 | [("All", "all"), ("True Positive", "true"), ("False Positive", "false"), ("Needs more Info to decide", "more")], 31 | value="all", 32 | id="filter-select" 33 | ) 34 | yield Button("Refresh", id="refresh-btn") 35 | yield Button("Run Analysis", id="run-analysis-btn") 36 | yield Static("") 37 | # Key Bindings help text 38 | with Horizontal(): 39 | yield Label("Key Bindings:", classes="control-label") 40 | yield Static("↑/↓: Navigate | Tab: Switch focus | Enter: Show details | /: Search | [: Resize left | ]: Resize right | r: Reload | q: Quit", classes="help-text") 41 | 42 | -------------------------------------------------------------------------------- /data/queries/cpp/tools/Classes.ql: -------------------------------------------------------------------------------- 1 | import cpp 2 | 3 | private predicate isNamespaceEntity(NameQualifyingElement n) { n instanceof Namespace } 4 | private predicate isUserTypeEntity(NameQualifyingElement n) { n instanceof UserType } 5 | private predicate isClassEntity(NameQualifyingElement n) { n instanceof Class } 6 | 7 | 8 | private int getEndLine(NameQualifyingElement n) { 9 | exists (Namespace c | c = n and result = max(c.getADeclaration().getLocation().getEndLine())) 10 | or 11 | // Anonymus structs. 
This will get the real end line 12 | exists (Class c | c = n and c.getName() = "(unnamed class/struct/union)" and exists (TypedefType u | u.getUnderlyingType() instanceof Class and u.getUnderlyingType() = c and result = u.getLocation().getStartLine())) 13 | or 14 | exists (Class c | c = n and result = max(c.getAMember().getLocation().getEndLine())) 15 | or 16 | exists (UserType u | u = n and result = max(u.getADeclaration().getLocation().getEndLine())) 17 | } 18 | 19 | private string getType(NameQualifyingElement c) { 20 | isNamespaceEntity(c) and result = "NameSapce" 21 | or 22 | isClassEntity(c) and result = "Class" 23 | or 24 | isUserTypeEntity(c) and result = "UserType" 25 | } 26 | 27 | private string getName(NameQualifyingElement n) { 28 | // Anonymous structs 29 | n.getName() = "(unnamed class/struct/union)" and exists (TypedefType u | u.getUnderlyingType() instanceof Class and u.getUnderlyingType() = n and result = u.getName()) 30 | or 31 | result = n.getName() 32 | } 33 | 34 | private string getSimpleName(NameQualifyingElement n) { 35 | isNamespaceEntity(n) and result = "" 36 | or 37 | // Anonymous structs 38 | n.getName() = "(unnamed class/struct/union)" and exists (TypedefType u | u.getUnderlyingType() instanceof Class and u.getUnderlyingType() = n and result = u.getSimpleName()) 39 | or 40 | exists (UserType u | u = n and result = u.getSimpleName()) 41 | } 42 | 43 | from NameQualifyingElement c 44 | where isNamespaceEntity(c) or isUserTypeEntity(c) 45 | select getType(c) as type, getName(c) as name, c.getLocation().getFile() as file, c.getLocation().getStartLine() as start_line, getEndLine(c) as end_line, getSimpleName(c) as simple_name 46 | -------------------------------------------------------------------------------- /examples/example.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | example.py 5 | ---------- 6 | Example usage of Vulnhalla - demonstrates a full pipeline run: 
7 | 1) Fetch CodeQL databases (from fetch_repos.py), 8 | 2) Run CodeQL queries (from run_codeql_queries.py), 9 | 3) Analyze results with LLM (from vulnhalla.py). 10 | """ 11 | 12 | import sys 13 | from pathlib import Path 14 | 15 | # Add project root to Python path 16 | PROJECT_ROOT = Path(__file__).parent.parent 17 | sys.path.insert(0, str(PROJECT_ROOT)) 18 | 19 | from src.codeql.fetch_repos import fetch_codeql_dbs 20 | from src.codeql.run_codeql_queries import compile_and_run_codeql_queries, DEFAULT_LANG 21 | from src.vulnhalla import IssueAnalyzer 22 | from src.utils.config import get_codeql_path 23 | from src.utils.config_validator import validate_and_exit_on_error 24 | from src.utils.logger import setup_logging, get_logger 25 | from src.ui.ui_app import main as ui_main 26 | 27 | logger = get_logger(__name__) 28 | 29 | 30 | def main(): 31 | """Run an end-to-end example of the Vulnhalla pipeline. 32 | 33 | This function fetches CodeQL databases for two demo 34 | repositories, runs CodeQL queries, classifies the findings 35 | using the configured LLM provider, writes the results 36 | to the output directory, and opens the results UI. 37 | """ 38 | # Initialize logging 39 | setup_logging() 40 | logger.info("Starting Vulnhalla pipeline... This may take a few minutes.") 41 | logger.info("") 42 | 43 | # Validate configuration before starting 44 | validate_and_exit_on_error() 45 | 46 | # 1) Fetch CodeQL database 47 | logger.info("[1/3] Fetching CodeQL DBs") 48 | fetch_codeql_dbs( 49 | lang="c", # Or use fetch_repos.LANG if set 50 | threads=4, # Higher threads may exceed GitHub rate limits. Add a GitHub token if you need higher throughput. 
51 | 52 | single_repo="videolan/vlc" 53 | ) 54 | fetch_codeql_dbs(lang="c", threads=16, single_repo="redis/redis") 55 | 56 | # 2) Run CodeQL queries on all downloaded databases 57 | logger.info("\n[2/3] Running CodeQL Queries") 58 | compile_and_run_codeql_queries( 59 | codeql_bin=get_codeql_path(), 60 | lang="c", 61 | threads=16, 62 | timeout=300 63 | ) 64 | 65 | # 3) Build/Analyze CodeQL results 66 | logger.info("\n[3/3] Building and Analyzing Results") 67 | # Load configuration from .env file (create .env from .env.example) 68 | # Or use: analyzer = IssueAnalyzer(lang="c", api_key="your-api-key") 69 | analyzer = IssueAnalyzer(lang="c") 70 | analyzer.run() 71 | 72 | logger.info("\n✅ Pipeline completed successfully!") 73 | logger.info("Opening results UI...") 74 | ui_main() 75 | 76 | if __name__ == "__main__": 77 | main() 78 | -------------------------------------------------------------------------------- /src/ui/components/splitter_divider.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Splitter divider component for Vulnhalla UI. 4 | """ 5 | 6 | from textual.widget import Widget 7 | 8 | 9 | class SplitterDivider(Widget): 10 | """ 11 | Draggable divider widget between panels for resizing. 12 | """ 13 | 14 | DEFAULT_CSS = """ 15 | SplitterDivider { 16 | width: 1; 17 | background: transparent; 18 | color: $surface-lighten-2; 19 | } 20 | SplitterDivider:hover { 21 | background: $primary; 22 | color: $primary; 23 | } 24 | """ 25 | 26 | def __init__(self, app_instance=None): 27 | """ 28 | Initialize the SplitterDivider. 29 | 30 | Args: 31 | app_instance: Reference to the VulnhallaUI app instance for updating split position. 32 | """ 33 | super().__init__() 34 | self.app_instance = app_instance 35 | self.dragging = False 36 | 37 | def render(self): 38 | """ 39 | Render the divider as a thin vertical line. 40 | 41 | Returns: 42 | str: Single vertical line character "│". 
43 | """ 44 | return "│" 45 | 46 | def on_mouse_down(self, event) -> None: 47 | """ 48 | Start dragging when mouse is pressed. 49 | 50 | Args: 51 | event: Mouse down event. 52 | """ 53 | self.dragging = True 54 | self.capture_mouse() 55 | 56 | def on_mouse_move(self, event) -> None: 57 | """ 58 | Update split position while dragging. 59 | 60 | Args: 61 | event: Mouse move event containing position information. 62 | """ 63 | if self.dragging and self.app_instance: 64 | parent = self.parent 65 | if parent and parent.region: 66 | try: 67 | # Get mouse position relative to parent container 68 | mouse_x = event.screen_x - parent.region.x 69 | parent_width = parent.size.width 70 | if parent_width > 0: 71 | new_position = max(0.2, min(0.8, mouse_x / parent_width)) 72 | self.app_instance.split_position = new_position 73 | self.app_instance._update_split_position() 74 | except (AttributeError, TypeError): 75 | # Fallback: use delta if available 76 | if hasattr(event, 'delta_x') and event.delta_x != 0: 77 | parent_width = parent.size.width 78 | if parent_width > 0: 79 | delta = event.delta_x / parent_width 80 | new_position = max(0.2, min(0.8, self.app_instance.split_position + delta)) 81 | self.app_instance.split_position = new_position 82 | self.app_instance._update_split_position() 83 | 84 | def on_mouse_up(self, event) -> None: 85 | """ 86 | Stop dragging when mouse is released. 87 | 88 | Args: 89 | event: Mouse up event. 
90 | """ 91 | if self.dragging: 92 | self.dragging = False 93 | self.release_mouse() 94 | 95 | -------------------------------------------------------------------------------- /.env.example: -------------------------------------------------------------------------------- 1 | # CodeQL Configuration 2 | 3 | # Path to CodeQL executable (required) 4 | # Examples: 5 | # Linux/macOS: /usr/local/bin/codeql or /path/to/codeql 6 | # Windows: IMPORTANT - Path MUST end with .cmd 7 | # Example: C:\path\to\codeql\codeql.cmd 8 | # Use forward slashes or escaped backslashes: C:/path/to/codeql/codeql.cmd 9 | # Or use raw string format: r"C:\path\to\codeql\codeql.cmd" 10 | CODEQL_PATH="your_codeql_path" 11 | 12 | # GitHub Configuration (optional, for higher rate limits) 13 | # Get token from: https://github.com/settings/tokens 14 | # GITHUB_TOKEN=ghp_your_token_here 15 | 16 | # LLM Configuration 17 | # Copy this file to .env and fill in your API keys 18 | 19 | # Provider selection (required) 20 | # Allowed providers: openai, azure, gemini 21 | 22 | # Model name (required, provider-specific) 23 | # Examples by provider: 24 | # OpenAI: gpt-4o, gpt-4o-mini, gpt-4-turbo 25 | # Azure: gpt-4o, gpt-4 26 | # Google AI Studio: gemini-2.5-flash, gemini-2.0-flash 27 | 28 | # Optional: Override default LLM parameters 29 | # NOTE: 30 | # Do NOT increase these values unless you fully understand the impact. 31 | # Lower values keep the model stable and deterministic — critical for security analysis. 32 | # Higher values may cause the model to become inconsistent, creative, or hallucinate results. 33 | # Recommended: leave these at their default values. 
34 | # LLM_TEMPERATURE=0.2 35 | # LLM_TOP_P=0.2 36 | 37 | # ============================================================================ 38 | # Provider-Specific Configuration 39 | # ============================================================================ 40 | # Uncomment and fill in the section for your chosen provider 41 | 42 | # ---------------------------------------------------------------------------- 43 | # OpenAI 44 | # ---------------------------------------------------------------------------- 45 | PROVIDER=openai 46 | MODEL=gpt-4o 47 | OPENAI_API_KEY="your_api_key" 48 | 49 | # ---------------------------------------------------------------------------- 50 | # Azure OpenAI 51 | # ---------------------------------------------------------------------------- 52 | # AZURE_OPENAI_API_KEY="your_api_key" 53 | # AZURE_OPENAI_ENDPOINT="https://your-name.openai.azure.com/" 54 | # AZURE_OPENAI_API_VERSION="2024-08-01-preview" 55 | # PROVIDER=azure 56 | # MODEL=gpt-4o 57 | 58 | # ---------------------------------------------------------------------------- 59 | # Google AI Studio 60 | # ---------------------------------------------------------------------------- 61 | # GOOGLE_API_KEY="your_api_key" 62 | # PROVIDER=gemini 63 | # MODEL=gemini-2.5-flash 64 | 65 | # Logging Configuration 66 | 67 | # DEBUG, INFO, WARNING, ERROR 68 | LOG_LEVEL=INFO 69 | 70 | # Optional: path to log file (e.g., logs/vulnhalla.log) 71 | # If empty or commented out, no file logging is used 72 | # LOG_FILE=logs/vulnhalla.log 73 | LOG_FILE= 74 | 75 | # default or json 76 | LOG_FORMAT=default 77 | 78 | # Console format control: 79 | # - Default: INFO messages are minimal (message only) 80 | # WARNING/ERROR/CRITICAL use simple format (LEVEL - message) 81 | # - If LOG_VERBOSE_CONSOLE=true: 82 | # WARNING/ERROR/CRITICAL use full format 83 | # (timestamp - logger - level - message) 84 | # - INFO always remains minimal regardless of verbose mode 85 | LOG_VERBOSE_CONSOLE=false 86 | 87 | # Control 
third-party library logging verbosity (LiteLLM, urllib3, requests). Default: ERROR 88 | THIRD_PARTY_LOG_LEVEL=ERROR -------------------------------------------------------------------------------- /data/queries/cpp/issues/Copy function using source size.ql: -------------------------------------------------------------------------------- 1 | /** 2 | * @name Copy function using source size 3 | * @description Calling a copy operation with a size derived from the source 4 | * buffer instead of the destination buffer may result in a buffer overflow. 5 | * @kind path-problem 6 | * @id cpp/overflow-destination 7 | * @problem.severity warning 8 | * @security-severity 9.3 9 | * @precision low 10 | * @tags reliability 11 | * security 12 | * external/cwe/cwe-119 13 | * external/cwe/cwe-131 14 | */ 15 | 16 | import cpp 17 | import semmle.code.cpp.ir.dataflow.TaintTracking 18 | import semmle.code.cpp.controlflow.IRGuards 19 | import semmle.code.cpp.security.FlowSources 20 | import OverflowDestination::PathGraph 21 | 22 | /** 23 | * Holds if `fc` is a call to a copy operation where the size argument contains 24 | * a reference to the source argument. 
For example: 25 | * ``` 26 | * memcpy(dest, src, sizeof(src)); 27 | * ``` 28 | */ 29 | predicate sourceSized(FunctionCall fc, Expr src) { 30 | fc.getTarget().hasGlobalOrStdName(["strncpy", "strncat", "memcpy", "memmove"]) and 31 | exists(Expr dest, Expr size, Variable v | 32 | fc.getArgument(0) = dest and 33 | fc.getArgument(1).getFullyConverted() = src and 34 | fc.getArgument(2) = size and 35 | src = v.getAnAccess().getFullyConverted() and 36 | size.getAChild+() = v.getAnAccess() and 37 | // exception: `dest` is also referenced in the size argument 38 | not exists(Variable other | 39 | dest = other.getAnAccess() and size.getAChild+() = other.getAnAccess() 40 | ) and 41 | // exception: `src` and `dest` are both arrays of the same type and size 42 | not exists(ArrayType srctype, ArrayType desttype | 43 | dest.getType().getUnderlyingType() = desttype and 44 | src.getType().getUnderlyingType() = srctype and 45 | desttype.getBaseType().getUnderlyingType() = srctype.getBaseType().getUnderlyingType() and 46 | desttype.getArraySize() = srctype.getArraySize() 47 | ) 48 | ) 49 | } 50 | 51 | predicate readsVariable(LoadInstruction load, Variable var) { 52 | load.getSourceAddress().(VariableAddressInstruction).getAstVariable() = var 53 | } 54 | 55 | predicate hasUpperBoundsCheck(Variable var) { 56 | exists(RelationalOperation oper, VariableAccess access | 57 | oper.getAnOperand() = access and 58 | access.getTarget() = var and 59 | // Comparing to 0 is not an upper bound check 60 | not oper.getAnOperand().getValue() = "0" 61 | ) 62 | } 63 | 64 | predicate nodeIsBarrierEqualityCandidate(DataFlow::Node node, Operand access, Variable checkedVar) { 65 | readsVariable(node.asInstruction(), checkedVar) and 66 | any(IRGuardCondition guard).ensuresEq(access, _, _, node.asInstruction().getBlock(), true) 67 | } 68 | 69 | module OverflowDestinationConfig implements DataFlow::ConfigSig { 70 | predicate isSource(DataFlow::Node source) { source instanceof FlowSource } 71 | 72 | predicate 
#!/usr/bin/env python3
"""
Data models for Vulnhalla UI.
"""

from typing import Callable, Dict, List, Optional, Tuple
from dataclasses import dataclass


@dataclass
class Issue:
    """
    One analyzed issue from the CodeQL analysis results.

    Attributes:
        id (str): Issue identifier extracted from the filename (e.g., "1", "2").
        name (str): Issue name or type.
        file (str): File path basename.
        line (int): Line number where the issue occurs.
        status (str): LLM classification status ("true", "false", or "more").
        issue_type (str): Issue type directory name.
        lang (str): Language code.
        repo (str): Repository name in "org/repo" format (e.g., "redis/redis").
        raw_path (str): Path to the _raw.json file.
        final_path (str): Path to the _final.json file.
        raw_data (Optional[Dict]): Parsed raw JSON data, when loaded.
        final_data (Optional[List]): Parsed final JSON containing LLM messages.
        manual_decision (Optional[str]): Manual verdict set by the user
            ("True Positive", "False Positive", "Uncertain", or None for "Not Set").
    """
    id: str
    name: str
    file: str
    line: int
    status: str
    issue_type: str
    lang: str
    repo: str
    raw_path: str
    final_path: str
    raw_data: Optional[Dict] = None
    final_data: Optional[List] = None
    manual_decision: Optional[str] = None


# Rank used when sorting by LLM decision: confirmed first, then false, then "more".
STATUS_ORDER: Dict[str, int] = {"true": 0, "false": 1, "more": 2}

# Rank used when sorting by manual decision; an unset decision sorts last.
MANUAL_DECISION_ORDER: Dict[Optional[str], int] = {
    "True Positive": 0,
    "False Positive": 1,
    "Uncertain": 2,
    "Not Set": 3,
    None: 3
}

# Internal status value -> human-readable display label.
STATUS_DISPLAY_MAP: Dict[str, str] = {
    "true": "True Positive",
    "false": "False Positive",
    "more": "Needs More Data"
}


def format_status_display(status: str) -> str:
    """
    Map an internal status value to its display label.

    Args:
        status (str): Internal status value ("true", "false", or "more").

    Returns:
        str: Display text; unknown values are returned unchanged.
    """
    label = STATUS_DISPLAY_MAP.get(status)
    return status if label is None else label


def format_manual_decision(manual_decision: Optional[str]) -> str:
    """
    Map a manual decision value to its display label.

    Args:
        manual_decision (Optional[str]): Manual decision value or None.

    Returns:
        str: The decision text, or "Not Set" when no decision was recorded.
    """
    return manual_decision or "Not Set"


def get_default_sort_key(issue: "Issue") -> Tuple[str, float]:
    """
    Default sort key for an issue: repository first, then numeric ID.

    Args:
        issue (Issue): Issue to compute the sort key for.

    Returns:
        Tuple[str, float]: (lower-cased repo, numeric ID or +inf when the ID
        is not purely numeric).
    """
    numeric_id = int(issue.id) if issue.id.isdigit() else float("inf")
    return (issue.repo.lower(), numeric_id)
from typing import Any  # typing.Any, not the builtin any() function


def get_sort_key_for_column(column: str) -> Optional[Callable[["Issue"], Any]]:
    """
    Get the sort key function for a given column name.

    Args:
        column (str): Column name to sort by.

    Returns:
        Optional[Callable[[Issue], Any]]: Sort key function, or None if the
        column is not supported for sorting.
    """
    # Bug fix: these annotations previously used the builtin `any` (a function)
    # where the type `typing.Any` was intended; type checkers reject that form.
    sort_keys: Dict[str, Callable[["Issue"], Any]] = {
        "ID": lambda issue: int(issue.id) if issue.id.isdigit() else float('inf'),
        "Repo": lambda issue: issue.repo.lower(),
        "Issue name": lambda issue: issue.name.lower(),
        "File": lambda issue: issue.file.lower(),
        "LLM decision": lambda issue: STATUS_ORDER.get(issue.status, 99),
        "Manual decision": lambda issue: MANUAL_DECISION_ORDER.get(issue.manual_decision, 3),
    }
    return sort_keys.get(column)
From here, your pull request will be reviewed and, once approved, merged into the project.
Congratulations, you're a contributor! 71 | 72 | ### Reporting Issues 73 | 74 | Before reporting issues, please: 75 | - Check existing issues to avoid duplicates 76 | - Include Python version, OS, and error messages 77 | - Provide steps to reproduce the issue 78 | 79 | ## Logging Guidelines 80 | 81 | Vulnhalla uses centralized logging. Always use `get_logger(__name__)` instead of `print()` for application messages. 82 | 83 | ### Basic Usage 84 | 85 | ```python 86 | from src.utils.logger import get_logger 87 | 88 | logger = get_logger(__name__) 89 | 90 | # ✅ Good 91 | logger.info("Processing database: %s", db_path) 92 | logger.warning("Rate limit approaching: %d requests remaining", remaining) 93 | logger.error("Failed to process: %s", error_message) 94 | logger.debug("Debug information: %s", debug_data) 95 | 96 | # ❌ Bad 97 | print("Processing database:", db_path) # Don't use print() 98 | ``` 99 | 100 | ### Log Levels 101 | 102 | - **`logger.debug()`** - Detailed diagnostics (shown with `LOG_LEVEL=DEBUG`) 103 | - **`logger.info()`** - Status updates, progress messages 104 | - **`logger.warning()`** - Warnings (rate limits, missing data) 105 | - **`logger.error()`** - Errors, failures, exceptions 106 | 107 | ### When Print() is Acceptable 108 | 109 | `print()` is only acceptable for: 110 | - Interactive CLI prompts 111 | - Real-time progress indicators with `\r` (e.g., download progress bars) 112 | 113 | ## Testing 114 | 115 | Please test your changes manually using the example scripts: 116 | 117 | - `python examples/example.py` - Tests the full pipeline 118 | - `python examples/ui_example.py` - Tests the UI 119 | 120 | Ensure your code works with Python 3.10-3.13 before submitting. 
121 | 122 | **Testing with Different Log Levels:** 123 | ```bash 124 | # Test with debug logging 125 | LOG_LEVEL=DEBUG python examples/example.py 126 | 127 | # Test with warning level only 128 | LOG_LEVEL=WARNING python examples/example.py 129 | ``` 130 | 131 | ## Releases 132 | 133 | Releases should only be created by our core maintainers. 134 | 135 | ## Legal 136 | 137 | Any submission of work, including any modification of, or addition to, an existing work ("Contribution") to "Vulnhalla" shall be governed by and subject to the terms of the Apache License, Version 2.0 (the "License") and to the following complementary terms. In case of any conflict or inconsistency between the provisions of the License and the complementary terms, the complementary terms shall prevail. By submitting the Contribution, you represent and warrant that the Contribution is your original creation and you own all right, title and interest in the Contribution. You represent that you are legally entitled to grant the rights set out in the License and herein, without violation of, or conflict with, the rights of any other party. You represent that your Contribution includes complete details of any third-party license or other restriction associated with any part of your Contribution of which you are personally aware. 138 | -------------------------------------------------------------------------------- /src/utils/common_functions.py: -------------------------------------------------------------------------------- 1 | """ 2 | Common utility functions for Vulnhalla. 3 | 4 | This module provides reusable helpers for file and path handling, 5 | working with CodeQL database directories, and other small I/O utilities 6 | that are shared across multiple parts of the project. 
7 | """ 8 | 9 | import os 10 | import zipfile 11 | import yaml 12 | from typing import Any, Dict, List 13 | 14 | from src.utils.exceptions import VulnhallaError, CodeQLError 15 | 16 | 17 | def read_file(file_name: str) -> str: 18 | """ 19 | Read text from a file (UTF-8). 20 | 21 | Args: 22 | file_name (str): The path to the file to be read. 23 | 24 | Returns: 25 | str: The contents of the file, decoded as UTF-8. 26 | 27 | Raises: 28 | VulnhallaError: If file cannot be read (not found, permission denied, encoding error). 29 | """ 30 | try: 31 | with open(file_name, "r", encoding="utf-8") as f: 32 | return f.read() 33 | except FileNotFoundError as e: 34 | raise VulnhallaError(f"File not found: {file_name}") from e 35 | except PermissionError as e: 36 | raise VulnhallaError(f"Permission denied reading file: {file_name}") from e 37 | except UnicodeDecodeError as e: 38 | raise VulnhallaError(f"Failed to decode file as UTF-8: {file_name}") from e 39 | except OSError as e: 40 | raise VulnhallaError(f"OS error while reading file: {file_name}") from e 41 | 42 | 43 | def write_file_text(file_name: str, data: str) -> None: 44 | """ 45 | Write text data to a file (UTF-8). 46 | 47 | Args: 48 | file_name (str): The path to the file to be written. 49 | data (str): The string data to write to the file. 50 | 51 | Raises: 52 | VulnhallaError: If file cannot be written (permission denied, disk full, etc.). 53 | """ 54 | try: 55 | with open(file_name, "w", encoding="utf-8") as f: 56 | f.write(data) 57 | except PermissionError as e: 58 | raise VulnhallaError(f"Permission denied writing file: {file_name}") from e 59 | except OSError as e: 60 | raise VulnhallaError(f"OS error while writing file: {file_name}") from e 61 | 62 | 63 | def write_file_ascii(file_name: str, data: str) -> None: 64 | """ 65 | Write data to a file in ASCII mode (ignores errors). 66 | Useful for contexts similar to the original 'wb' approach 67 | where non-ASCII characters are simply dropped. 
def get_all_dbs(dbs_folder: str) -> List[str]:
    """
    Return a list of all CodeQL database paths under `dbs_folder`.

    A directory two levels below `dbs_folder` counts as a database when it
    contains a `codeql-database.yml` marker file.

    Args:
        dbs_folder (str): The folder containing CodeQL databases.

    Returns:
        List[str]: File-system paths pointing to valid CodeQL databases.

    Raises:
        CodeQLError: If the database folder cannot be accessed.
    """
    try:
        found: List[str] = []
        for top_entry in os.listdir(dbs_folder):
            top_path = os.path.join(dbs_folder, top_entry)
            if not os.path.isdir(top_path):
                continue
            for candidate in os.listdir(top_path):
                candidate_path = os.path.join(top_path, candidate)
                if os.path.exists(os.path.join(candidate_path, "codeql-database.yml")):
                    found.append(candidate_path)
        return found
    except PermissionError as exc:
        raise CodeQLError(f"Permission denied accessing database folder: {dbs_folder}") from exc
    except OSError as exc:
        raise CodeQLError(f"OS error while accessing database folder: {dbs_folder}") from exc


def read_file_lines_from_zip(zip_path: str, file_path_in_zip: str) -> str:
    """
    Read text from a single file within a ZIP archive (UTF-8).

    Args:
        zip_path (str): The path to the ZIP file.
        file_path_in_zip (str): The internal path within the ZIP to the file.

    Returns:
        str: The contents of the file (as UTF-8) located within the ZIP.

    Raises:
        CodeQLError: If the ZIP file cannot be read or the file is absent.
    """
    try:
        with zipfile.ZipFile(zip_path, "r") as archive:
            with archive.open(file_path_in_zip) as member:
                raw = member.read()
        return raw.decode("utf-8")
    except zipfile.BadZipFile as exc:
        raise CodeQLError(f"Invalid or corrupted ZIP file: {zip_path}") from exc
    except KeyError as exc:
        # ZipFile.open raises KeyError for a missing member name.
        raise CodeQLError(f"File '{file_path_in_zip}' not found in ZIP archive: {zip_path}") from exc
    except PermissionError as exc:
        raise CodeQLError(f"Permission denied reading ZIP file: {zip_path}") from exc
    except OSError as exc:
        raise CodeQLError(f"OS error while reading ZIP file: {zip_path}") from exc


def read_yml(file_path: str) -> Dict[str, Any]:
    """
    Read and parse a YAML file, returning its data as a Python dictionary.

    Args:
        file_path (str): The path to the YAML file.

    Returns:
        Dict[str, Any]: The YAML data as a dictionary.

    Raises:
        VulnhallaError: If the file cannot be read or YAML parsing fails.
    """
    try:
        with open(file_path, "r", encoding="utf-8") as handle:
            parsed = yaml.safe_load(handle)
        return parsed
    except FileNotFoundError as exc:
        raise VulnhallaError(f"YAML file not found: {file_path}") from exc
    except PermissionError as exc:
        raise VulnhallaError(f"Permission denied reading YAML file: {file_path}") from exc
    except yaml.YAMLError as exc:
        raise VulnhallaError(f"Failed to parse YAML file: {file_path}") from exc
    except OSError as exc:
        raise VulnhallaError(f"OS error while reading YAML file: {file_path}") from exc
15 | 16 | 17 | ## Scope 18 | 19 | This Code of Conduct applies to all members of the CyberArk community, including paid and unpaid agents, administrators, users, and customers of CyberArk. It applies in all CyberArk community venues, online and in person, including CyberArk Open Source project communities (such as public GitHub repositories, chat channels, social media, mailing lists, and public events) and in one-on-one communications pertaining to CyberArk affairs. 20 | 21 | This policy covers the usage of CyberArk hosted services, as well as the CyberArk website, CyberArk related events, and any other services offered by or on behalf of CyberArk (collectively, the "Service"). 22 | 23 | This Code of Conduct is in addition to, and does not in any way nullify or invalidate, any other terms or conditions related to use of the Service. 24 | 25 | 26 | ## Maintaining a Friendly, Harassment-Free Space 27 | 28 | We are committed to providing a friendly, safe and welcoming environment for all, regardless of gender identity, sexual orientation, ability, ethnicity, religion, age, physical appearance, body size, race, or similar personal characteristics. 29 | 30 | We ask that you please respect that people have differences of opinion regarding technical choices, and that every design or implementation choice carries a trade-off and numerous costs. There is seldom a single right answer. A difference of technology preferences is not a license to be rude. 31 | 32 | Harassing other users of the Service for any reason is never tolerated, whether via public or private media. Any spamming, trolling, flaming, baiting, or other attention-stealing behavior is not welcome, and will not be tolerated. 33 | 34 | Even if your intent is not to harass or offend others, be mindful of how your comments might be perceived by others in the community. 
- Personal insults, particularly those related to gender identity, sexual orientation, ability, ethnicity, religion, age, physical appearance, body size, race, or similar personal characteristics.
71 | 72 | We will not tolerate any form of retaliation towards users who report these issues to us. 73 | 74 | If you feel that you have been falsely or unfairly accused of violating this Code of Conduct by others in the community, you should notify the ReportAbuse@cyberark.com team so that we can address and resolve the accusation. 75 | 76 | As always, if you have an urgent security issue, contact product_security@cyberark.com and if you have concerns about a potential copyright violation, contact legal@cyberark.com. 77 | 78 | 79 | ## Consequences 80 | 81 | All content published to the Service, including user account credentials, is hosted at the sole discretion of the CyberArk administrators. If a community member engages in unacceptable behavior, the CyberArk administrators may take any action they deem appropriate, up to and including a temporary ban or permanent expulsion from the community without warning. In general, we will choose the course of action that we judge as being most in the interest of fostering a safe and friendly community. 82 | 83 | 84 | ## Contact Info 85 | 86 | Please contact ReportAbuse@cyberark.com if you need to report a problem or address a grievance related to an abuse report. 87 | 88 | You are also encouraged to contact us if you have questions about what constitutes appropriate and inappropriate content. We are happy to provide guidance to help you be a successful part of our community. Our technical community is available [here](https://cyberark-customers.force.com/s/). 89 | 90 | 91 | ## Credit and License 92 | 93 | 94 | This Code of Conduct borrows from the [npm Code of Conduct](https://www.npmjs.com/policies/conduct), Stumptown Syndicate [Citizen's Code of Conduct](http://citizencodeofconduct.org/), and the [Rust Project Code of Conduct](https://www.rust-lang.org/conduct.html). 95 | 96 | This document may be reused under a [Creative Commons Attribution-ShareAlike License](https://creativecommons.org/licenses/by-sa/4.0/). 
def get_model_name(provider: Optional[str], model: Optional[str]) -> str:
    """
    Build the model identifier string in the format LiteLLM expects.

    Args:
        provider: Provider name (e.g., "openai", "azure", "anthropic"), or None.
        model: Model name (e.g., "gpt-4o", "claude-3-opus"), or None.

    Returns:
        Model name in LiteLLM format (e.g., "gpt-4o" or "azure/gpt-4o").
    """
    if not model:
        # Nothing configured at all: fall back to the project default model.
        return "gpt-4o"
    if provider == "openai" or not provider:
        # OpenAI models (and models with no provider) are used verbatim.
        return model
    # Every other provider (including Azure) gets a "<provider>/" prefix,
    # added only when it is not already present.
    prefix = f"{provider}/"
    return model if model.startswith(prefix) else prefix + model
" 90 | f"Allowed providers: {', '.join(sorted(ALLOWED_LLM_PROVIDERS))}" 91 | ) 92 | 93 | # Get model name 94 | model = os.getenv("MODEL", "gpt-4o") 95 | 96 | # Get API key and provider-specific config based on provider 97 | api_key = None 98 | endpoint = None 99 | api_version = None 100 | 101 | if provider == "openai": 102 | api_key = os.getenv("OPENAI_API_KEY") 103 | 104 | elif provider == "azure": 105 | api_key = os.getenv("AZURE_OPENAI_API_KEY") or os.getenv("AZURE_API_KEY") 106 | endpoint = os.getenv("AZURE_OPENAI_ENDPOINT") or os.getenv("AZURE_API_BASE") 107 | api_version = os.getenv("AZURE_OPENAI_API_VERSION") or os.getenv("AZURE_API_VERSION", "2024-08-01-preview") 108 | 109 | elif provider == "anthropic": 110 | api_key = os.getenv("ANTHROPIC_API_KEY") 111 | 112 | elif provider == "gemini": 113 | api_key = os.getenv("GOOGLE_API_KEY") 114 | 115 | elif provider == "mistral": 116 | api_key = os.getenv("MISTRAL_API_KEY") 117 | 118 | elif provider == "codestral": 119 | # Codestral uses Mistral API key 120 | api_key = os.getenv("MISTRAL_API_KEY") 121 | 122 | elif provider == "groq": 123 | api_key = os.getenv("GROQ_API_KEY") 124 | 125 | elif provider == "openrouter": 126 | api_key = os.getenv("OPENROUTER_API_KEY") 127 | 128 | elif provider == "huggingface": 129 | api_key = os.getenv("HUGGINGFACE_API_KEY") 130 | 131 | elif provider == "cohere": 132 | api_key = os.getenv("COHERE_API_KEY") or os.getenv("CO_API_KEY") 133 | 134 | elif provider == "bedrock": 135 | # Bedrock uses AWS credentials 136 | api_key = os.getenv("AWS_ACCESS_KEY_ID") 137 | aws_secret = os.getenv("AWS_SECRET_ACCESS_KEY") 138 | aws_region = os.getenv("AWS_REGION_NAME", "us-east-1") 139 | # Store region in endpoint field for Bedrock 140 | endpoint = aws_region 141 | 142 | elif provider == "vertex_ai": 143 | # Vertex AI uses GCP credentials (service account JSON or GOOGLE_APPLICATION_CREDENTIALS) 144 | # No API key needed, but we set a placeholder to pass validation 145 | gcp_creds = 
os.getenv("GOOGLE_APPLICATION_CREDENTIALS") 146 | if not gcp_creds and not os.path.exists(os.path.expanduser("~/.config/gcloud/application_default_credentials.json")): 147 | raise ValueError( 148 | "GCP credentials not found. Set GOOGLE_APPLICATION_CREDENTIALS or run 'gcloud auth application-default login'" 149 | ) 150 | api_key = "vertex_ai_placeholder" 151 | 152 | elif provider == "ollama": 153 | # Ollama uses OLLAMA_BASE_URL (defaults to http://localhost:11434) 154 | endpoint = os.getenv("OLLAMA_BASE_URL", "http://localhost:11434") 155 | # Ollama doesn't require API key, but we set a placeholder to pass validation 156 | api_key = "ollama_placeholder" 157 | 158 | # Get optional parameters 159 | temperature = float(os.getenv("LLM_TEMPERATURE", "0.2")) 160 | top_p = float(os.getenv("LLM_TOP_P", "0.2")) 161 | 162 | config = { 163 | "provider": provider, 164 | "model": get_model_name(provider, model), 165 | "api_key": api_key, 166 | "temperature": temperature, 167 | "top_p": top_p 168 | } 169 | 170 | # Add provider-specific fields 171 | if endpoint: 172 | config["endpoint"] = endpoint 173 | if api_version: 174 | config["api_version"] = api_version 175 | 176 | # Special handling for Bedrock (store AWS region and secret) 177 | if provider == "bedrock": 178 | config["aws_secret_access_key"] = os.getenv("AWS_SECRET_ACCESS_KEY") 179 | config["aws_region"] = endpoint # Store region in endpoint field 180 | 181 | # Special handling for Vertex AI (store GCP project/location if provided) 182 | if provider == "vertex_ai": 183 | if os.getenv("GCP_PROJECT_ID"): 184 | config["gcp_project_id"] = os.getenv("GCP_PROJECT_ID") 185 | if os.getenv("GCP_LOCATION"): 186 | config["gcp_location"] = os.getenv("GCP_LOCATION") 187 | 188 | return config 189 | 190 | 191 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Vulnhalla Setup 
def check_dependencies_installed() -> bool:
    """
    Report whether all required third-party dependencies are importable.

    Returns:
        bool: True if every required package can be imported, False otherwise.
    """
    # Import-probe each requirement; __import__ keeps the list data-driven.
    required_modules = ("requests", "dotenv", "litellm", "yaml", "textual", "pySmartDL")
    try:
        for module_name in required_modules:
            __import__(module_name)
    except ImportError:
        return False
    return True
def _install_codeql_pack(codeql_cmd: str, pack_dir: Path, pack_name: str) -> None:
    """
    Run ``codeql pack install`` inside *pack_dir*, logging a warning on failure.

    Uses subprocess's ``cwd=`` parameter instead of os.chdir() so the process
    working directory is never mutated (exception-safe, no restore needed).

    Args:
        codeql_cmd: Path to the CodeQL CLI executable.
        pack_dir: Directory containing the qlpack.yml to install.
        pack_name: Human-readable pack name used in warning messages.
    """
    if not pack_dir.exists():
        return
    result = subprocess.run(
        [codeql_cmd, "pack", "install"],
        check=False, capture_output=True, text=True, cwd=str(pack_dir),
    )
    if result.returncode != 0:
        logger.warning("Failed to install %s pack: %s", pack_name, result.stderr)


def main():
    """Run the Vulnhalla setup process.

    This script installs Python dependencies (skipping the step when they are
    already importable), locates the CodeQL CLI (configured path or PATH),
    installs the bundled CodeQL query packs, optionally validates the CodeQL
    configuration, and prints next steps for running the analysis pipeline.
    """
    logger.info("Vulnhalla Setup")
    logger.info("=" * 50)

    # Prefer a project-local virtualenv's pip when one exists.
    venv_path = PROJECT_ROOT / "venv"
    use_venv = venv_path.exists()

    if use_venv:
        # Use virtual environment pip
        if os.name == 'nt':  # Windows
            pip_exe = [str(PROJECT_ROOT / "venv/Scripts/pip.exe")]
        else:  # Unix/macOS/Linux
            pip_exe = [str(PROJECT_ROOT / "venv/bin/pip")]
        logger.info("Using virtual environment...")
    else:
        # Use system pip
        pip_exe = [sys.executable, "-m", "pip"]
        logger.info("Installing to current Python environment...")

    if check_dependencies_installed():
        logger.info("✅ All dependencies are already installed! Skipping installation.")
    else:
        # Install dependencies
        logger.info("📦 Installing Python dependencies... This may take a moment ⏳")
        try:
            subprocess.run(pip_exe + ["install", "-q", "-r", str(PROJECT_ROOT / "requirements.txt")], check=True)
            logger.info("✅ Python dependencies installed successfully!")
        except subprocess.CalledProcessError:
            # pip already printed its own error output (not captured above).
            logger.error("\n❌ Setup failed. Please fix the missing dependencies and run setup.py again.")
            sys.exit(1)

    # Install CodeQL packs
    # Check for CodeQL in PATH or .env
    codeql_cmd = None

    try:
        from src.utils.config import get_codeql_path
        from src.utils.config_validator import find_codeql_executable

        codeql_path = get_codeql_path()
        logger.info("Checking CodeQL path: %s", codeql_path)

        # Use helper function to find executable
        codeql_cmd = find_codeql_executable()

        if codeql_cmd:
            if codeql_path == "codeql":
                logger.info("🔍 Checking if 'codeql' is in PATH...")
                logger.info("✅ Found in PATH: %s", codeql_cmd)
            else:
                logger.info("✅ Found CodeQL path: %s", codeql_cmd)
        else:
            # Provide detailed error messages
            if codeql_path and codeql_path != "codeql":
                # Custom path specified - strip quotes if present
                codeql_path_clean = codeql_path.strip('"').strip("'")
                logger.error("❌ Path does not exist: %s", codeql_path_clean)
                if os.name == 'nt':
                    logger.info("Also checked: %s.cmd", codeql_path_clean)
            else:
                logger.info("🔍 Checking if 'codeql' is in PATH...")
                logger.error("❌ 'codeql' not found in PATH")
    except Exception as e:
        # Fallback to checking PATH
        logger.error("❌ Error loading config: %s", e)
        logger.info("🔍 Falling back to PATH check...")
        codeql_cmd = shutil.which("codeql")
        if codeql_cmd:
            logger.info("✅ Found in PATH: %s", codeql_cmd)

    if codeql_cmd:
        logger.info("📦 Installing CodeQL packs... This may take a moment ⏳")
        # Tools and issues packs share the same install procedure.
        _install_codeql_pack(codeql_cmd, PROJECT_ROOT / "data/queries/cpp/tools", "tools")
        _install_codeql_pack(codeql_cmd, PROJECT_ROOT / "data/queries/cpp/issues", "issues")
    else:
        logger.error("❌ CodeQL CLI not found. Skipping CodeQL pack installation.")
        logger.info("🔗 Install CodeQL CLI from: https://github.com/github/codeql-cli-binaries/releases")
        logger.info("   After installation, either add CodeQL to your PATH or set CODEQL_PATH in your .env file.")
        logger.info("   Then run: python setup.py or install packages manually")
        return

    # Optional: Validate CodeQL configuration if .env file exists
    env_file = PROJECT_ROOT / ".env"
    if env_file.exists():
        logger.info("\n🔍 Validating CodeQL configuration...")
        try:
            from src.utils.config_validator import validate_codeql_path
            is_valid, error = validate_codeql_path()
            if is_valid:
                logger.info("✅ CodeQL configuration validated successfully!")
            else:
                logger.warning("⚠️ CodeQL configuration issue detected:")
                logger.warning("   %s", error.split(chr(10))[0])  # Print first line of error
                logger.warning("   Please fix this before running the pipeline.")
        except Exception as e:
            logger.warning("⚠️ Could not validate CodeQL configuration: %s", e)
            logger.info("   This is not critical - you can fix configuration later.")

    logger.info("🎉 Setup completed successfully! 🎉")
    logger.info("🔗 Next steps:")
    if not env_file.exists():
        logger.info("1. Create a .env file with all the required variables (see README.md)")
        logger.info("2. Run one of the following commands to start the pipeline:")
    else:
        logger.info("Run one of the following commands to start the pipeline:")
    # First bullet previously duplicated the bare command; show the repo argument.
    logger.info("  • python src/pipeline.py org/repo  # Analyze a specific repository")
    logger.info("  • python src/pipeline.py  # Analyze top 100 repositories")
    logger.info("  • python examples/example.py  # See a full pipeline run")

if __name__ == "__main__":
    main()
def _log_exception_cause(e: Exception) -> None:
    """
    Log an exception's underlying cause when it adds information.

    Inspects both the explicit ``cause`` attribute (set via a constructor)
    and the implicit ``__cause__`` (set via ``raise ... from e``). The cause
    is logged only when its text is not already embedded in the exception's
    own message, to avoid printing the same detail twice.
    """
    underlying = getattr(e, 'cause', None) or getattr(e, '__cause__', None)
    if underlying is None:
        return
    # Skip the extra line when the message already mentions the cause.
    if str(underlying) not in str(e):
        logger.error(" Cause: %s", underlying)
def analyze_pipeline(repo: Optional[str] = None, lang: str = "c", threads: int = 16, open_ui: bool = True) -> None:
    """
    Run the complete Vulnhalla pipeline: fetch, analyze, classify, and optionally open UI.

    Args:
        repo: Optional GitHub repository name (e.g., "redis/redis"). If None, fetches top repos.
        lang: Programming language code. Defaults to "c".
        threads: Number of threads for CodeQL operations. Defaults to 16.
        open_ui: Whether to open the UI after completion. Defaults to True.

    Note:
        This function catches and handles all exceptions internally, logging errors
        and exiting with code 1 on failure. It does not raise exceptions.
    """
    logger.info("🚀 Starting Vulnhalla Analysis Pipeline")
    logger.info("=" * 60)

    try:
        # Validate configuration before starting
        validate_and_exit_on_error()
    except (CodeQLConfigError, LLMConfigError, VulnhallaError) as e:
        # Format error message for display
        message = f"""
⚠️ Configuration Validation Failed
============================================================
{str(e)}
============================================================
Please fix the configuration errors above and try again.
See README.md for configuration reference.
"""
        logger.error(message)
        _log_exception_cause(e)
        # Exit code 1 signals failure to shells/CI callers.
        sys.exit(1)

    try:
        # Step 1: Fetch CodeQL databases
        logger.info("\n[1/4] Fetching CodeQL Databases")
        logger.info("-" * 60)
        if repo:
            logger.info("Fetching database for: %s", repo)
            fetch_codeql_dbs(lang=lang, threads=threads, single_repo=repo)
        else:
            # Multi-repo mode deliberately uses fewer threads (4) per fetch.
            logger.info("Fetching top repositories for language: %s", lang)
            fetch_codeql_dbs(lang=lang, max_repos=100, threads=4)
    except CodeQLConfigError as e:
        # Config errors are caught before generic CodeQLError (subclass first).
        logger.error("❌ Configuration error while fetching CodeQL databases: %s", e)
        _log_exception_cause(e)
        logger.error("   Please check your GitHub token and permissions.")
        sys.exit(1)
    except CodeQLError as e:
        logger.error("❌ Failed to fetch CodeQL databases: %s", e)
        _log_exception_cause(e)
        logger.error("   Please check file permissions, disk space, and GitHub API access.")
        sys.exit(1)

    try:
        # Step 2: Run CodeQL queries
        logger.info("\n[2/4] Running CodeQL Queries")
        logger.info("-" * 60)
        compile_and_run_codeql_queries(
            codeql_bin=get_codeql_path(),
            lang=lang,
            threads=threads,
            timeout=300
        )
    except CodeQLConfigError as e:
        logger.error("❌ Configuration error while running CodeQL queries: %s", e)
        _log_exception_cause(e)
        logger.error("   Please check your CODEQL_PATH configuration.")
        sys.exit(1)
    except CodeQLExecutionError as e:
        logger.error("❌ Failed to execute CodeQL queries: %s", e)
        _log_exception_cause(e)
        logger.error("   Please check your CodeQL installation and database files.")
        sys.exit(1)
    except CodeQLError as e:
        # Catch-all for any other CodeQL failure not matched above.
        logger.error("❌ CodeQL error: %s", e)
        _log_exception_cause(e)
        sys.exit(1)

    try:
        # Step 3: Classify results with LLM
        logger.info("\n[3/4] Classifying Results with LLM")
        logger.info("-" * 60)
        analyzer = IssueAnalyzer(lang=lang)
        analyzer.run()
    except LLMConfigError as e:
        logger.error("❌ LLM configuration error: %s", e)
        _log_exception_cause(e)
        logger.error("   Please check your LLM configuration and API credentials in .env file.")
        sys.exit(1)
    except LLMApiError as e:
        logger.error("❌ LLM API error: %s", e)
        _log_exception_cause(e)
        logger.error("   Please check your API key, network connection, and rate limits.")
        sys.exit(1)
    except LLMError as e:
        logger.error("❌ LLM error: %s", e)
        _log_exception_cause(e)
        sys.exit(1)
    except CodeQLError as e:
        # The classification step re-reads CodeQL artifacts, so CodeQL errors
        # can surface here too, not only in steps 1-2.
        logger.error("❌ CodeQL error while reading database files: %s", e)
        _log_exception_cause(e)
        logger.error("   This step reads CodeQL database files (YAML, ZIP, CSV) to prepare data for LLM analysis.")
        logger.error("   Please check your CodeQL databases and files are accessible.")
        sys.exit(1)
    except VulnhallaError as e:
        logger.error("❌ File system error while saving results: %s", e)
        _log_exception_cause(e)
        logger.error("   This step writes analysis results to disk and creates output directories.")
        logger.error("   Please check file permissions and disk space.")
        sys.exit(1)

    # Step 4: Open UI
    if open_ui:
        logger.info("\n[4/4] Opening UI")
        logger.info("-" * 60)
        logger.info("✅ Pipeline completed successfully!")
        logger.info("Opening results UI...")
        # ui_main() blocks until the user closes the Textual UI.
        ui_main()
    else:
        logger.info("\n✅ Pipeline completed successfully!")
        logger.info("View results with: python src/ui/ui_app.py")
def main_analyze() -> None:
    """
    CLI entry point for the complete analysis pipeline.
    Usage:
        vulnhalla-analyze                # Analyze top 100 repos
        vulnhalla-analyze redis/redis    # Analyze specific repo
    """
    # The only supported positional argument is an 'org/repo' slug.
    repo = sys.argv[1] if len(sys.argv) > 1 else None
    if repo is not None and "/" not in repo:
        logger.error("❌ Error: Repository must be in format 'org/repo'")
        logger.error("   Example: python src/pipeline.py redis/redis")
        logger.error("   Or run without arguments to analyze top repositories")
        sys.exit(1)
    analyze_pipeline(repo=repo)


if __name__ == '__main__':
    main_analyze()
def extract_line_number_from_location(issue: Issue) -> Optional[int]:
    """
    Pull a line number out of "Location: <file>:<line>" text attached to an issue.

    The raw_data prompt is searched first, then each final_data message in
    order; the first successfully parsed line number wins.

    Args:
        issue (Issue): Issue object containing raw_data and final_data.

    Returns:
        Optional[int]: Parsed line number, or None if none could be extracted.
    """
    # Gather candidate texts in the same priority order the data is stored.
    candidates = []
    if issue.raw_data and "prompt" in issue.raw_data:
        candidates.append(issue.raw_data["prompt"])
    for msg in (issue.final_data or []):
        if isinstance(msg, dict):
            content = msg.get("content", "")
            if content:
                candidates.append(content)

    for text in candidates:
        match = LOCATION_PATTERN.search(text)
        if match:
            try:
                return int(match.group(1))
            except (ValueError, IndexError):
                # Unparsable capture: fall through to the next candidate.
                continue
    return None
def extract_code_blocks_from_text(text: str) -> List[str]:
    """
    Extract Vulnhalla code blocks from text.

    A block starts with a "file: ..." header line followed by one or more
    numbered lines ("123: code"). A numbered line ending in a backslash pulls
    in the next line as a continuation regardless of its shape.

    Args:
        text (str): Text possibly containing code blocks.

    Returns:
        List[str]: Extracted code block strings, in order of appearance.
    """
    if not text:
        return []

    rows = text.split('\n')
    total = len(rows)
    collected = []
    idx = 0

    while idx < total:
        if not FILE_LINE_PATTERN.match(rows[idx]):
            idx += 1
            continue

        # Header found - gather the numbered lines that follow it.
        current = [rows[idx]]
        idx += 1
        while idx < total and NUMBERED_LINE_PATTERN.match(rows[idx]):
            row = rows[idx]
            current.append(row)
            idx += 1
            # A trailing backslash means the statement continues on the
            # next physical line, which is included verbatim.
            if row.rstrip().endswith('\\') and idx < total:
                current.append(rows[idx])
                idx += 1

        # A header with no numbered lines is not a real block.
        if len(current) > 1:
            joined = '\n'.join(current).strip()
            if joined:
                collected.append(joined)

    return collected


def extract_code_from_messages(final_data: Optional[List]) -> List[str]:
    """
    Extract all code blocks from final_data messages in chronological order.

    Args:
        final_data (Optional[List]): LLM conversation message dictionaries.

    Returns:
        List[str]: Extracted code block strings.
    """
    if not final_data:
        return []

    blocks = []
    for entry in final_data:
        if not isinstance(entry, dict):
            continue
        body = entry.get("content", "")
        if isinstance(body, str) and body:
            blocks.extend(extract_code_blocks_from_text(body))
    return blocks
def normalize_code_snippet(snippet: str) -> str:
    """
    Normalize a code snippet for deduplication.

    Line-number prefixes and per-line whitespace are stripped, and the
    "file:" header (if any) has its internal whitespace collapsed, so that
    two renderings of the same block compare equal.

    Args:
        snippet (str): Code snippet to normalize.

    Returns:
        str: Normalized key string (header + code, or code alone).
    """
    text = snippet.strip()
    if not text:
        return ""

    header = None
    m = re.match(r'(file:\s*[^\n]+)\n(.*)', text, re.DOTALL)
    if m:
        # Collapse runs of whitespace in the header so cosmetic differences
        # do not defeat deduplication.
        header = re.sub(r'\s+', ' ', m.group(1).strip())
        raw_lines = m.group(2).split('\n')
    else:
        raw_lines = text.split('\n')

    # Drop "NNN:" prefixes and surrounding whitespace; discard blank lines.
    stripped = (LINE_NUMBER_PATTERN.sub('', ln).strip() for ln in raw_lines)
    body = '\n'.join(ln for ln in stripped if ln)

    return body if header is None else f"{header}\n{body}"


def collect_all_code_snippets(issue: Issue) -> Tuple[str, List[str]]:
    """
    Collect all unique code snippets from final_data, deduplicated and in order.

    Args:
        issue (Issue): Issue object containing final_data.

    Returns:
        Tuple[str, List[str]]: (initial_code, additional_code_list) where
            initial_code is the first snippet ("" when none exist) and
            additional_code_list holds the remaining unique snippets.
    """
    found = extract_code_from_messages(issue.final_data)

    # Keep the first occurrence of each normalized block, preserving order.
    seen_keys = set()
    deduped = []
    for code in found:
        fingerprint = normalize_code_snippet(code)
        if fingerprint and fingerprint not in seen_keys:
            seen_keys.add(fingerprint)
            deduped.append(code)

    if not deduped:
        return ("", [])
    return (deduped[0], deduped[1:])
def extract_last_message(final_data: Optional[List]) -> Optional[str]:
    """
    Extract the last non-empty message content from final_data.

    Args:
        final_data (Optional[List]): LLM conversation message dictionaries.

    Returns:
        Optional[str]: Stripped content of the most recent non-empty
            message, or None when no such message exists.
    """
    # Walk newest-to-oldest; the first usable content wins.
    for entry in reversed(final_data or []):
        if not isinstance(entry, dict):
            continue
        text = entry.get("content", "")
        if isinstance(text, str) and text.strip():
            return text.strip()
    return None
def reset_logging() -> None:
    """
    Reset logging state.

    Removes every handler from the root logger and clears the module's
    initialization flag so that setup_logging() will run again.
    """
    global _logging_initialized
    _logging_initialized = False
    logging.getLogger().handlers.clear()
def suppress_third_party_loggers() -> None:
    """
    Suppress verbose logging from third-party libraries.

    Sets a common level (THIRD_PARTY_LOG_LEVEL env var, defaulting to ERROR)
    on the loggers of libraries known to be noisy: LiteLLM, urllib3, and
    requests.
    """
    # Unrecognized level names silently fall back to ERROR.
    wanted = os.getenv("THIRD_PARTY_LOG_LEVEL", "ERROR").upper()
    level = getattr(logging, wanted, logging.ERROR)

    for noisy in ("LiteLLM", "urllib3", "urllib3.connectionpool", "requests"):
        logging.getLogger(noisy).setLevel(level)
def setup_logging(
    log_level: Optional[str] = None,
    log_file: Optional[str] = None,
    log_format: Optional[str] = None,
    json_format: bool = False,
    simple_format: bool = False
) -> None:
    """
    Configure logging for the application.

    Args:
        log_level: Logging level (DEBUG, INFO, WARNING, ERROR).
            Defaults to environment variable LOG_LEVEL or INFO.
        log_file: Optional path to log file. If None, reads from LOG_FILE env var.
        log_format: Custom log format string. If None, uses default or JSON.
        json_format: If True, use JSON structured logging format.
        simple_format: If True, use simpler format without timestamps for console.
    """
    global _logging_initialized

    # Prevent duplicate initialization (many modules call this on import).
    if _logging_initialized:
        return

    # Get configuration from environment or parameters
    level_str = log_level or os.getenv("LOG_LEVEL", DEFAULT_LOG_LEVEL).upper()
    log_file_path = log_file or os.getenv("LOG_FILE")
    log_format_str = log_format or os.getenv("LOG_FORMAT", "default")
    # Console format control:
    # - Default: INFO messages are minimal (message only), WARNING/ERROR/CRITICAL use simple format (LEVEL - message)
    # - If LOG_VERBOSE_CONSOLE=true: WARNING/ERROR/CRITICAL use full format (timestamp - logger - level - message)
    # - INFO always remains minimal regardless of verbose mode
    use_verbose_console = os.getenv("LOG_VERBOSE_CONSOLE", "false").lower() == "true"
    # Legacy support: LOG_SIMPLE_FORMAT still works but is deprecated in favor of LOG_VERBOSE_CONSOLE
    # NOTE(review): this value is computed but never applied below — confirm
    # whether simple_format/LOG_SIMPLE_FORMAT should still influence output.
    use_simple_format = simple_format or os.getenv("LOG_SIMPLE_FORMAT", "false").lower() == "true"

    # Convert string level to logging constant (unknown names fall back to INFO).
    numeric_level = getattr(logging, level_str, logging.INFO)

    # Configure root logger
    root_logger = logging.getLogger()
    root_logger.setLevel(numeric_level)

    # Remove existing handlers to avoid duplicates
    root_logger.handlers.clear()

    # Console handler (always present)
    console_handler = logging.StreamHandler(sys.stdout)
    console_handler.setLevel(numeric_level)

    if json_format or log_format_str.lower() == "json":
        # JSON structured logging. json/datetime are always available in the
        # standard library, so no ImportError fallback is needed.
        import json
        from datetime import datetime, timezone

        class JSONFormatter(logging.Formatter):
            """Formatter that renders log records as JSON strings.

            Converts the LogRecord into a JSON object with a timestamp,
            logger name, level, and message, and can optionally include
            extra fields such as progress.
            """

            def format(self, record: logging.LogRecord) -> str:
                """Format a LogRecord as a JSON string.

                Args:
                    record: The log record to format.

                Returns:
                    str: A JSON representation of the log record.
                """
                log_entry = {
                    # Timezone-aware UTC timestamp; datetime.utcnow() is
                    # deprecated since Python 3.12 and returns a naive value.
                    "timestamp": datetime.now(timezone.utc).isoformat(),
                    "level": record.levelname,
                    "logger": record.name,
                    "message": record.getMessage(),
                }
                # Add extra fields if present
                if hasattr(record, "progress"):
                    log_entry["progress"] = record.progress
                return json.dumps(log_entry)

        console_handler.setFormatter(JSONFormatter())
    else:
        # Standard formatted logging with level-based formatting
        # Default behavior:
        #   - INFO: minimal format (message only)
        #   - WARNING/ERROR/CRITICAL: simple format (LEVEL - message)
        # If LOG_VERBOSE_CONSOLE=true:
        #   - INFO: still minimal (message only)
        #   - WARNING/ERROR/CRITICAL: full format (timestamp - logger - level - message)
        class LevelBasedFormatter(logging.Formatter):
            """Formatter that uses different formats depending on log level.

            INFO messages are rendered in a minimal format (message only),
            while WARNING/ERROR/CRITICAL messages use either a simple format
            (LEVEL - message) or a full format with timestamp and logger name.
            """
            def __init__(self, full_format: str, simple_format: str, datefmt: Optional[str] = None, verbose: bool = False) -> None:
                # Initialize with simple format as base (for WARNING/ERROR default behavior)
                super().__init__(simple_format, datefmt)
                self.full_format = full_format
                self.simple_format = simple_format
                self.verbose = verbose
                # The full formatter is only built when verbose mode needs it.
                self._full_formatter = logging.Formatter(full_format, datefmt) if verbose else None

            def format(self, record: logging.LogRecord) -> str:
                """Format a LogRecord using level-based formatting.

                Args:
                    record: The log record to format.

                Returns:
                    str: The formatted log message.
                """
                # For INFO level, always use minimal format (just the message)
                if record.levelno == logging.INFO:
                    return record.getMessage()
                # For WARNING, ERROR, CRITICAL
                else:
                    if self.verbose and self._full_formatter:
                        # Use full format with timestamp when verbose mode is enabled
                        return self._full_formatter.format(record)
                    else:
                        # Use simple format (LEVEL - message) by default
                        return super().format(record)

        formatter = LevelBasedFormatter(
            DEFAULT_LOG_FORMAT,
            DEFAULT_LOG_FORMAT_SIMPLE,
            datefmt=DEFAULT_DATE_FORMAT,
            verbose=use_verbose_console
        )
        console_handler.setFormatter(formatter)

    root_logger.addHandler(console_handler)

    # Suppress noisy third-party loggers
    suppress_third_party_loggers()

    # File handler (optional)
    if log_file_path:
        try:
            # Ensure log directory exists
            log_path = Path(log_file_path)
            log_path.parent.mkdir(parents=True, exist_ok=True)

            file_handler = logging.FileHandler(log_file_path, encoding="utf-8")
            file_handler.setLevel(logging.DEBUG)  # File always gets DEBUG level
            file_formatter = logging.Formatter(
                DEFAULT_LOG_FORMAT,
                datefmt=DEFAULT_DATE_FORMAT
            )
            file_handler.setFormatter(file_formatter)
            root_logger.addHandler(file_handler)
        except Exception as e:
            # If file logging fails, log to console and continue
            root_logger.warning("Failed to set up file logging: %s", e)

    _logging_initialized = True


def get_logger(name: str) -> logging.Logger:
    """
    Get a logger instance for a module.

    This is a convenience function that ensures logging is initialized
    and returns a logger with the given name.

    Args:
        name: Logger name (typically __name__ from the calling module)

    Returns:
        Logger instance
    """
    # Ensure logging is set up (idempotent)
    if not _logging_initialized:
        setup_logging()

    return logging.getLogger(name)
For the purposes of this definition, 13 | "control" means (i) the power, direct or indirect, to cause the 14 | direction or management of such entity, whether by contract or 15 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 16 | outstanding shares, or (iii) beneficial ownership of such entity. 17 | "You" (or "Your") shall mean an individual or Legal Entity 18 | exercising permissions granted by this License. 19 | "Source" form shall mean the preferred form for making modifications, 20 | including but not limited to software source code, documentation 21 | source, and configuration files. 22 | "Object" form shall mean any form resulting from mechanical 23 | transformation or translation of a Source form, including but 24 | not limited to compiled object code, generated documentation, 25 | and conversions to other media types. 26 | "Work" shall mean the work of authorship, whether in Source or 27 | Object form, made available under the License, as indicated by a 28 | copyright notice that is included in or attached to the work 29 | (an example is provided in the Appendix below). 30 | "Derivative Works" shall mean any work, whether in Source or Object 31 | form, that is based on (or derived from) the Work and for which the 32 | editorial revisions, annotations, elaborations, or other modifications 33 | represent, as a whole, an original work of authorship. For the purposes 34 | of this License, Derivative Works shall not include works that remain 35 | separable from, or merely link (or bind by name) to the interfaces of, 36 | the Work and Derivative Works thereof. 37 | "Contribution" shall mean any work of authorship, including 38 | the original version of the Work and any modifications or additions 39 | to that Work or Derivative Works thereof, that is intentionally 40 | submitted to Licensor for inclusion in the Work by the copyright owner 41 | or by an individual or Legal Entity authorized to submit on behalf of 42 | the copyright owner. 
For the purposes of this definition, "submitted" 43 | means any form of electronic, verbal, or written communication sent 44 | to the Licensor or its representatives, including but not limited to 45 | communication on electronic mailing lists, source code control systems, 46 | and issue tracking systems that are managed by, or on behalf of, the 47 | Licensor for the purpose of discussing and improving the Work, but 48 | excluding communication that is conspicuously marked or otherwise 49 | designated in writing by the copyright owner as "Not a Contribution." 50 | "Contributor" shall mean Licensor and any individual or Legal Entity 51 | on behalf of whom a Contribution has been received by Licensor and 52 | subsequently incorporated within the Work. 53 | 2. Grant of Copyright License. Subject to the terms and conditions of 54 | this License, each Contributor hereby grants to You a perpetual, 55 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 56 | copyright license to reproduce, prepare Derivative Works of, 57 | publicly display, publicly perform, sublicense, and distribute the 58 | Work and such Derivative Works in Source or Object form. 59 | 3. Grant of Patent License. Subject to the terms and conditions of 60 | this License, each Contributor hereby grants to You a perpetual, 61 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 62 | (except as stated in this section) patent license to make, have made, 63 | use, offer to sell, sell, import, and otherwise transfer the Work, 64 | where such license applies only to those patent claims licensable 65 | by such Contributor that are necessarily infringed by their 66 | Contribution(s) alone or by combination of their Contribution(s) 67 | with the Work to which such Contribution(s) was submitted. 
If You 68 | institute patent litigation against any entity (including a 69 | cross-claim or counterclaim in a lawsuit) alleging that the Work 70 | or a Contribution incorporated within the Work constitutes direct 71 | or contributory patent infringement, then any patent licenses 72 | granted to You under this License for that Work shall terminate 73 | as of the date such litigation is filed. 74 | 4. Redistribution. You may reproduce and distribute copies of the 75 | Work or Derivative Works thereof in any medium, with or without 76 | modifications, and in Source or Object form, provided that You 77 | meet the following conditions: 78 | (a) You must give any other recipients of the Work or 79 | Derivative Works a copy of this License; and 80 | (b) You must cause any modified files to carry prominent notices 81 | stating that You changed the files; and 82 | (c) You must retain, in the Source form of any Derivative Works 83 | that You distribute, all copyright, patent, trademark, and 84 | attribution notices from the Source form of the Work, 85 | excluding those notices that do not pertain to any part of 86 | the Derivative Works; and 87 | (d) If the Work includes a "NOTICE" text file as part of its 88 | distribution, then any Derivative Works that You distribute must 89 | include a readable copy of the attribution notices contained 90 | within such NOTICE file, excluding those notices that do not 91 | pertain to any part of the Derivative Works, in at least one 92 | of the following places: within a NOTICE text file distributed 93 | as part of the Derivative Works; within the Source form or 94 | documentation, if provided along with the Derivative Works; or, 95 | within a display generated by the Derivative Works, if and 96 | wherever such third-party notices normally appear. The contents 97 | of the NOTICE file are for informational purposes only and 98 | do not modify the License. 
You may add Your own attribution 99 | notices within Derivative Works that You distribute, alongside 100 | or as an addendum to the NOTICE text from the Work, provided 101 | that such additional attribution notices cannot be construed 102 | as modifying the License. 103 | You may add Your own copyright statement to Your modifications and 104 | may provide additional or different license terms and conditions 105 | for use, reproduction, or distribution of Your modifications, or 106 | for any such Derivative Works as a whole, provided Your use, 107 | reproduction, and distribution of the Work otherwise complies with 108 | the conditions stated in this License. 109 | 5. Submission of Contributions. Unless You explicitly state otherwise, 110 | any Contribution intentionally submitted for inclusion in the Work 111 | by You to the Licensor shall be under the terms and conditions of 112 | this License, without any additional terms or conditions. 113 | Notwithstanding the above, nothing herein shall supersede or modify 114 | the terms of any separate license agreement you may have executed 115 | with Licensor regarding such Contributions. 116 | 6. Trademarks. This License does not grant permission to use the trade 117 | names, trademarks, service marks, or product names of the Licensor, 118 | except as required for reasonable and customary use in describing the 119 | origin of the Work and reproducing the content of the NOTICE file. 120 | 7. Disclaimer of Warranty. Unless required by applicable law or 121 | agreed to in writing, Licensor provides the Work (and each 122 | Contributor provides its Contributions) on an "AS IS" BASIS, 123 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 124 | implied, including, without limitation, any warranties or conditions 125 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 126 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 127 | appropriateness of using or redistributing the Work and assume any 128 | risks associated with Your exercise of permissions under this License. 129 | 8. Limitation of Liability. In no event and under no legal theory, 130 | whether in tort (including negligence), contract, or otherwise, 131 | unless required by applicable law (such as deliberate and grossly 132 | negligent acts) or agreed to in writing, shall any Contributor be 133 | liable to You for damages, including any direct, indirect, special, 134 | incidental, or consequential damages of any character arising as a 135 | result of this License or out of the use or inability to use the 136 | Work (including but not limited to damages for loss of goodwill, 137 | work stoppage, computer failure or malfunction, or any and all 138 | other commercial damages or losses), even if such Contributor 139 | has been advised of the possibility of such damages. 140 | 9. Accepting Warranty or Additional Liability. While redistributing 141 | the Work or Derivative Works thereof, You may choose to offer, 142 | and charge a fee for, acceptance of support, warranty, indemnity, 143 | or other liability obligations and/or rights consistent with this 144 | License. However, in accepting such obligations, You may act only 145 | on Your own behalf and on Your sole responsibility, not on behalf 146 | of any other Contributor, and only if You agree to indemnify, 147 | defend, and hold each Contributor harmless for any liability 148 | incurred by, or claims asserted against, such Contributor by reason 149 | of your accepting any such warranty or additional liability. 150 | END OF TERMS AND CONDITIONS 151 | APPENDIX: How to apply the Apache License to your work. 152 | To apply the Apache License to your work, attach the following 153 | boilerplate notice, with the fields enclosed by brackets "[]" 154 | replaced with your own identifying information. (Don't include 155 | the brackets!) 
The text should be enclosed in the appropriate 156 | comment syntax for the file format. We also recommend that a 157 | file or class name and description of purpose be included on the 158 | same "printed page" as the copyright notice for easier 159 | identification within third-party archives. 160 | 161 | Copyright (c) 2025 CyberArk Software Ltd. All rights reserved. 162 | Licensed under the Apache License, Version 2.0 (the "License"); 163 | you may not use this file except in compliance with the License. 164 | You may obtain a copy of the License at 165 | http://www.apache.org/licenses/LICENSE-2.0 166 | Unless required by applicable law or agreed to in writing, software 167 | distributed under the License is distributed on an "AS IS" BASIS, 168 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 169 | See the License for the specific language governing permissions and 170 | limitations under the License. -------------------------------------------------------------------------------- /src/codeql/run_codeql_queries.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Compile and run CodeQL queries on CodeQL databases for a specific language. 4 | 5 | Requires that CodeQL is installed or available under the CODEQL path. 6 | By default, it compiles all .ql files under 'data/queries//tools' and 7 | 'data/queries//issues', then runs them on each CodeQL database located 8 | in 'output/databases/'. 
9 | 10 | Example: 11 | python src/codeql/run_codeql_queries.py 12 | """ 13 | 14 | import subprocess 15 | import os 16 | 17 | # Make sure your common_functions module is in your PYTHONPATH or same folder 18 | from src.utils.common_functions import get_all_dbs 19 | from src.utils.config import get_codeql_path 20 | from src.utils.logger import get_logger 21 | from src.utils.exceptions import CodeQLError, CodeQLConfigError, CodeQLExecutionError 22 | 23 | logger = get_logger(__name__) 24 | 25 | 26 | # Default locations/values 27 | DEFAULT_CODEQL = get_codeql_path() 28 | DEFAULT_LANG = "c" # Mapped to data/queries/cpp for some tasks 29 | 30 | 31 | def pre_compile_ql(file_name: str, threads: int, codeql_bin: str) -> None: 32 | """ 33 | Pre-compile a single .ql file using CodeQL. 34 | 35 | Args: 36 | file_name (str): The path to the .ql query file. 37 | threads (int): Number of threads to use during compilation. 38 | codeql_bin (str): Full path to the 'codeql' executable. 39 | 40 | Raises: 41 | CodeQLConfigError: If CodeQL executable not found. 42 | CodeQLExecutionError: If query compilation fails. 43 | """ 44 | if not os.path.exists(file_name + "x"): 45 | try: 46 | subprocess.run( 47 | [ 48 | codeql_bin, 49 | "query", 50 | "compile", 51 | file_name, 52 | f'--threads={threads}', 53 | "--precompile" 54 | ], 55 | check=True, 56 | text=True, 57 | stdout=subprocess.DEVNULL, 58 | stderr=subprocess.DEVNULL 59 | ) 60 | except FileNotFoundError as e: 61 | raise CodeQLConfigError( 62 | f"CodeQL executable not found: {codeql_bin}. " 63 | "Please check your CODEQL_PATH configuration." 64 | ) from e 65 | except subprocess.CalledProcessError as e: 66 | raise CodeQLExecutionError( 67 | f"Failed to compile query {file_name}: CodeQL returned exit code {e.returncode}" 68 | ) from e 69 | 70 | 71 | def compile_all_queries(queries_folder: str, threads: int, codeql_bin: str) -> None: 72 | """ 73 | Recursively pre-compile all .ql files in a folder. 
74 | 75 | Args: 76 | queries_folder (str): Directory containing .ql files (and possibly subdirectories). 77 | threads (int): Number of threads to use during compilation. 78 | codeql_bin (str): Full path to the 'codeql' executable. 79 | 80 | Raises: 81 | CodeQLConfigError: If CodeQL executable not found. 82 | CodeQLExecutionError: If query compilation fails. 83 | """ 84 | for subdir, dirs, files in os.walk(queries_folder): 85 | for file in files: 86 | if os.path.splitext(file)[1].lower() == ".ql": 87 | file_path = os.path.join(subdir, file) 88 | pre_compile_ql(file_path, threads, codeql_bin) 89 | 90 | 91 | def run_one_query( 92 | query_file: str, 93 | curr_db: str, 94 | output_bqrs: str, 95 | output_csv: str, 96 | threads: int, 97 | codeql_bin: str 98 | ) -> None: 99 | """ 100 | Execute a single CodeQL query on a specific database and export the results. 101 | 102 | Args: 103 | query_file (str): The path to the .ql file to run. 104 | curr_db (str): The path to the CodeQL database on which to run queries. 105 | output_bqrs (str): Where to write the intermediate BQRS output. 106 | output_csv (str): Where to write the CSV representation of the results. 107 | threads (int): Number of threads to use during query execution. 108 | codeql_bin (str): Full path to the 'codeql' executable. 109 | 110 | Raises: 111 | CodeQLConfigError: If CodeQL executable not found. 112 | CodeQLExecutionError: If query execution or BQRS decoding fails. 113 | """ 114 | # Run the query 115 | try: 116 | subprocess.run( 117 | [ 118 | codeql_bin, "query", "run", query_file, 119 | f'--database={curr_db}', 120 | f'--output={output_bqrs}', 121 | f'--threads={threads}' 122 | ], 123 | check=True, 124 | text=True, 125 | stdout=subprocess.DEVNULL, 126 | stderr=subprocess.DEVNULL 127 | ) 128 | except FileNotFoundError as e: 129 | raise CodeQLConfigError( 130 | f"CodeQL executable not found: {codeql_bin}. " 131 | "Please check your CODEQL_PATH configuration." 
132 | ) from e 133 | except subprocess.CalledProcessError as e: 134 | raise CodeQLExecutionError( 135 | f"Failed to run query {query_file} on database {curr_db}: " 136 | f"CodeQL returned exit code {e.returncode}" 137 | ) from e 138 | 139 | # Decode BQRS to CSV 140 | try: 141 | subprocess.run( 142 | [ 143 | codeql_bin, "bqrs", "decode", output_bqrs, 144 | '--format=csv', f'--output={output_csv}' 145 | ], 146 | check=True, 147 | text=True, 148 | stdout=subprocess.DEVNULL, 149 | stderr=subprocess.DEVNULL 150 | ) 151 | except subprocess.CalledProcessError as e: 152 | raise CodeQLExecutionError( 153 | f"Failed to decode BQRS file {output_bqrs} to CSV: " 154 | f"CodeQL returned exit code {e.returncode}" 155 | ) from e 156 | 157 | 158 | def run_queries_on_db( 159 | curr_db: str, 160 | tools_folder: str, 161 | queries_folder: str, 162 | threads: int, 163 | codeql_bin: str, 164 | timeout: int = 300 165 | ) -> None: 166 | """ 167 | Execute all tool queries in 'tools_folder' individually on a given database, 168 | then run a bulk 'database analyze' with all queries in 'queries_folder'. 169 | 170 | Args: 171 | curr_db (str): The path to the CodeQL database. 172 | tools_folder (str): Folder containing individual .ql files to run. 173 | queries_folder (str): Folder containing .ql queries for bulk analysis. 174 | threads (int): Number of threads to use during query execution. 175 | codeql_bin (str): Full path to the 'codeql' executable. 176 | timeout (int, optional): Timeout in seconds for the bulk 'database analyze'. 177 | Defaults to 300. 178 | 179 | Raises: 180 | CodeQLConfigError: If CodeQL executable not found. 181 | CodeQLExecutionError: If query execution or database analysis fails. 
182 | """ 183 | # 1) Run each .ql in tools_folder individually 184 | if os.path.isdir(tools_folder): 185 | for file in os.listdir(tools_folder): 186 | if os.path.splitext(file)[1].lower() == ".ql": 187 | run_one_query( 188 | os.path.join(tools_folder, file), 189 | curr_db, 190 | os.path.join(curr_db, os.path.splitext(file)[0] + ".bqrs"), 191 | os.path.join(curr_db, os.path.splitext(file)[0] + ".csv"), 192 | threads, 193 | codeql_bin 194 | ) 195 | else: 196 | logger.warning("Tools folder '%s' not found. Skipping individual queries.", tools_folder) 197 | 198 | # 2) Run the entire queries folder in one go (bulk analysis) 199 | if os.path.isdir(queries_folder): 200 | try: 201 | subprocess.run( 202 | [ 203 | codeql_bin, 204 | "database", 205 | "analyze", 206 | curr_db, 207 | queries_folder, 208 | f'--timeout={timeout}', 209 | '--format=csv', 210 | f'--output={os.path.join(curr_db, "issues.csv")}', 211 | f'--threads={threads}' 212 | ], 213 | check=True, 214 | text=True, 215 | stdout=subprocess.DEVNULL, 216 | stderr=subprocess.DEVNULL 217 | ) 218 | except FileNotFoundError as e: 219 | raise CodeQLConfigError( 220 | f"CodeQL executable not found: {codeql_bin}. " 221 | "Please check your CODEQL_PATH configuration." 222 | ) from e 223 | except subprocess.CalledProcessError as e: 224 | raise CodeQLExecutionError( 225 | f"Failed to analyze database {curr_db} with queries from {queries_folder}: " 226 | f"CodeQL returned exit code {e.returncode}" 227 | ) from e 228 | else: 229 | logger.warning("Queries folder '%s' not found. Skipping bulk analysis.", queries_folder) 230 | 231 | 232 | def compile_and_run_codeql_queries( 233 | codeql_bin: str = DEFAULT_CODEQL, 234 | lang: str = DEFAULT_LANG, 235 | threads: int = 16, 236 | timeout: int = 300 237 | ) -> None: 238 | """ 239 | Compile and run CodeQL queries on CodeQL databases for a specific language. 240 | 241 | 1. Pre-compile all .ql files in the tools and queries folders. 242 | 2. Enumerate all CodeQL DBs for the given language. 
243 | 3. Run each DB against both the 'tools' and 'issues' queries folders. 244 | 245 | Args: 246 | codeql_bin (str, optional): Full path to the 'codeql' executable. Defaults to DEFAULT_CODEQL. 247 | lang (str, optional): Language code. Defaults to 'c' (which maps to data/queries/cpp). 248 | threads (int, optional): Number of threads for compilation/execution. Defaults to 16. 249 | timeout (int, optional): Timeout in seconds for bulk analysis. Defaults to 300. 250 | 251 | Raises: 252 | CodeQLConfigError: If CodeQL executable not found (from compilation or query execution). 253 | CodeQLExecutionError: If query compilation or execution fails. 254 | """ 255 | # Setup paths 256 | queries_subfolder = "cpp" if lang == "c" else lang 257 | queries_folder = os.path.join("data/queries", queries_subfolder, "issues") 258 | tools_folder = os.path.join("data/queries", queries_subfolder, "tools") 259 | dbs_folder = os.path.join("output/databases", lang) 260 | 261 | # Step 1: Pre-compile all queries 262 | compile_all_queries(tools_folder, threads, codeql_bin) 263 | compile_all_queries(queries_folder, threads, codeql_bin) 264 | 265 | # Step 2: List databases and run queries 266 | logger.info("Running queries on each DB in %s", dbs_folder) 267 | 268 | # List what's in the folder for debugging 269 | try: 270 | contents = os.listdir(dbs_folder) 271 | if len(contents) == 0: 272 | logger.warning("Database folder '%s' is empty. No databases to process.", dbs_folder) 273 | return 274 | logger.debug("Found %d item(s) in database folder: %s", len(contents), contents) 275 | except OSError as e: 276 | logger.warning("Cannot access database folder '%s': %s. No databases to process.", dbs_folder, e) 277 | return 278 | 279 | dbs_path = get_all_dbs(dbs_folder) 280 | 281 | if len(dbs_path) == 0: 282 | logger.warning("No valid databases found in '%s'. 
Expected structure: ///codeql-database.yml", dbs_folder) 283 | logger.warning("Make sure databases were downloaded and extracted successfully.") 284 | return 285 | 286 | for curr_db in dbs_path: 287 | logger.info("Processing DB: %s", curr_db) 288 | 289 | # Check if database folder is empty 290 | if os.path.isdir(curr_db): 291 | try: 292 | if len(os.listdir(curr_db)) == 0: 293 | logger.warning("Database folder '%s' is empty. Skipping queries.", curr_db) 294 | continue 295 | except OSError: 296 | logger.warning("Cannot access database folder '%s'. Skipping.", curr_db) 297 | continue 298 | 299 | # If issues.csv was not generated yet, or FunctionTree.csv missing, run 300 | if (not os.path.exists(os.path.join(curr_db, "FunctionTree.csv")) or 301 | not os.path.exists(os.path.join(curr_db, "issues.csv"))): 302 | run_queries_on_db( 303 | curr_db, 304 | tools_folder, 305 | queries_folder, 306 | threads, 307 | codeql_bin, 308 | timeout 309 | ) 310 | else: 311 | logger.info("Output files already exist for this DB, skipping...") 312 | 313 | logger.info("All databases processed.") 314 | 315 | 316 | def main_cli() -> None: 317 | """ 318 | CLI entry point for running codeql queries with defaults. 319 | """ 320 | compile_and_run_codeql_queries( 321 | codeql_bin=DEFAULT_CODEQL, 322 | lang=DEFAULT_LANG, 323 | threads=16, 324 | timeout=300 325 | ) 326 | 327 | 328 | if __name__ == '__main__': 329 | # Initialize logging 330 | from src.utils.logger import setup_logging 331 | setup_logging() 332 | 333 | main_cli() 334 | -------------------------------------------------------------------------------- /src/utils/config_validator.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Configuration Validator Module 4 | 5 | Validates configuration at startup to catch errors early with clear messages. 
6 | """ 7 | 8 | import os 9 | from shlex import join 10 | import shutil 11 | from typing import Any, Dict, List, Optional, Tuple 12 | from src.utils.config import get_codeql_path 13 | from src.utils.llm_config import load_llm_config, ALLOWED_LLM_PROVIDERS 14 | from src.utils.logger import get_logger 15 | 16 | logger = get_logger(__name__) 17 | 18 | 19 | def is_placeholder_api_key(api_key: Optional[str]) -> bool: 20 | """ 21 | Check if an API key is a placeholder value. 22 | 23 | Checks for common placeholders: "your_api_key" (from .env.example) and "sk-..." 24 | 25 | Args: 26 | api_key: API key to check 27 | 28 | Returns: 29 | True if the API key appears to be a placeholder, False otherwise 30 | """ 31 | if not api_key: 32 | return True 33 | 34 | api_key_str = str(api_key).strip() 35 | # Strip quotes if present (from .env file) 36 | api_key_str = api_key_str.strip('"').strip("'") 37 | api_key_lower = api_key_str.lower() 38 | 39 | # Check for the placeholder used in .env.example 40 | if "your_api_key" in api_key_lower or api_key_lower == "your-api-key": 41 | return True 42 | 43 | # Check for "sk-..." placeholder pattern 44 | if api_key_str == "sk-...": 45 | return True 46 | 47 | return False 48 | 49 | 50 | def find_codeql_executable() -> Optional[str]: 51 | """ 52 | Find the actual CodeQL executable path to use. 53 | 54 | Returns: 55 | Path to CodeQL executable if found, None otherwise. 56 | On Windows, returns path with .cmd extension if needed. 
57 | """ 58 | try: 59 | codeql_path = get_codeql_path() 60 | 61 | # Strip quotes if present 62 | if codeql_path: 63 | codeql_path = codeql_path.strip('"').strip("'") 64 | 65 | # If default "codeql", check if it's in PATH 66 | if codeql_path == "codeql": 67 | return shutil.which("codeql") 68 | 69 | # Custom path provided - check if file exists 70 | if os.path.exists(codeql_path): 71 | return codeql_path 72 | 73 | # Check with extensions (Windows) 74 | if os.name == 'nt': 75 | # Check .cmd extension (CodeQL uses .cmd on Windows) 76 | if os.path.exists(codeql_path + ".cmd"): 77 | return codeql_path + ".cmd" 78 | # Also check .exe for compatibility 79 | if os.path.exists(codeql_path + ".exe"): 80 | return codeql_path + ".exe" 81 | 82 | return None 83 | except Exception: 84 | # Fallback to checking PATH 85 | return shutil.which("codeql") 86 | 87 | 88 | def validate_codeql_path() -> Tuple[bool, Optional[str]]: 89 | """ 90 | Validate that CodeQL executable exists. 91 | 92 | Returns: 93 | Tuple of (is_valid, error_message) 94 | - is_valid: True if CodeQL path is valid 95 | - error_message: Error message if invalid, None if valid 96 | """ 97 | codeql_path = get_codeql_path() 98 | 99 | # Check for placeholder value 100 | codeql_path_str = str(codeql_path).strip().lower() 101 | if "your_codeql_path" in codeql_path_str or codeql_path_str == "your-codeql-path": 102 | return False, ( 103 | "CODEQL_PATH appears to be a placeholder value.\n" 104 | "Please set CODEQL_PATH in your .env file to the actual path of the CodeQL executable.\n" 105 | "On Windows: C:\\path\\to\\codeql\\codeql.cmd\n" 106 | "On Linux/macOS: /path/to/codeql/codeql or add 'codeql' to your PATH" 107 | ) 108 | 109 | # If default "codeql", check if it's in PATH 110 | if codeql_path == "codeql": 111 | codeql_cmd = shutil.which("codeql") 112 | if not codeql_cmd: 113 | return False, ( 114 | "CodeQL not found in PATH. Please either:\n" 115 | " 1. Install CodeQL and add it to your PATH, or\n" 116 | " 2. 
Set CODEQL_PATH in your .env file to the full path of the CodeQL executable.\n" 117 | " On Windows: C:\\path\\to\\codeql\\codeql.cmd" 118 | ) 119 | return True, None 120 | 121 | # Custom path provided - check if file exists 122 | if not os.path.exists(codeql_path): 123 | # Check with .cmd extension (CodeQL uses .cmd on Windows) 124 | if os.name == 'nt': 125 | if os.path.exists(codeql_path + ".cmd"): 126 | return True, None 127 | 128 | return False, ( 129 | f"CodeQL executable not found at: {codeql_path}\n" 130 | "Please check that CODEQL_PATH in your .env file is correct.\n" 131 | "On Windows, the path must end with .cmd (e.g., C:\\path\\to\\codeql\\codeql.cmd)" 132 | ) 133 | 134 | return True, None 135 | 136 | 137 | def validate_llm_config_dict(config: Dict[str, Any]) -> bool: 138 | """ 139 | Validate LLM configuration dictionary. 140 | 141 | Args: 142 | config: Configuration dictionary 143 | 144 | Returns: 145 | True if valid, raises ValueError if invalid 146 | """ 147 | # Check required fields 148 | required_fields = ["provider", "model"] 149 | 150 | for field in required_fields: 151 | if field not in config: 152 | raise ValueError(f"Missing required configuration field: {field}") 153 | 154 | # Normalize aliases to canonical provider name 155 | provider = config["provider"] 156 | if provider == "google": 157 | provider = "gemini" 158 | config["provider"] = provider # Update config with normalized value 159 | 160 | # Validate provider is in allowed list 161 | if provider not in ALLOWED_LLM_PROVIDERS: 162 | raise ValueError( 163 | f"Provider '{provider}' is not supported. 
" 164 | f"Allowed providers: {', '.join(sorted(ALLOWED_LLM_PROVIDERS))}" 165 | ) 166 | 167 | # Validate provider specific requirements 168 | if provider == "azure": 169 | if "endpoint" not in config: 170 | raise ValueError("Azure provider requires 'endpoint' in configuration") 171 | if "api_key" not in config or not config["api_key"]: 172 | raise ValueError("Azure provider requires 'api_key' in configuration") 173 | if is_placeholder_api_key(config["api_key"]): 174 | raise ValueError("Azure provider requires a valid 'api_key'. Please set AZURE_OPENAI_API_KEY in your .env file with your actual API key.") 175 | 176 | elif provider == "bedrock": 177 | if "api_key" not in config or not config["api_key"]: 178 | raise ValueError("Bedrock provider requires 'api_key' (AWS_ACCESS_KEY_ID) in configuration") 179 | if is_placeholder_api_key(config["api_key"]): 180 | raise ValueError("Bedrock provider requires a valid 'api_key' (AWS_ACCESS_KEY_ID). Please set AWS_ACCESS_KEY_ID in your .env file with your actual AWS access key.") 181 | if "aws_secret_access_key" not in config or not config.get("aws_secret_access_key"): 182 | raise ValueError("Bedrock provider requires 'aws_secret_access_key' (AWS_SECRET_ACCESS_KEY) in configuration") 183 | if is_placeholder_api_key(config.get("aws_secret_access_key")): 184 | raise ValueError("Bedrock provider requires a valid 'aws_secret_access_key' (AWS_SECRET_ACCESS_KEY). 
Please set AWS_SECRET_ACCESS_KEY in your .env file with your actual AWS secret key.") 185 | if "endpoint" not in config or not config["endpoint"]: 186 | raise ValueError("Bedrock provider requires 'endpoint' (AWS_REGION_NAME) in configuration") 187 | 188 | elif provider == "ollama": 189 | # Ollama uses placeholder api_key 190 | if "endpoint" not in config: 191 | raise ValueError("Ollama provider requires 'endpoint' (OLLAMA_BASE_URL) in configuration") 192 | 193 | else: 194 | # All other providers require api_key 195 | if "api_key" not in config or not config["api_key"]: 196 | raise ValueError(f"{provider} provider requires 'api_key' in configuration") 197 | if is_placeholder_api_key(config["api_key"]): 198 | # Get the environment variable name for this provider 199 | env_var_map = { 200 | "openai": "OPENAI_API_KEY", 201 | "anthropic": "ANTHROPIC_API_KEY", 202 | "gemini": "GOOGLE_API_KEY", 203 | "mistral": "MISTRAL_API_KEY", 204 | "codestral": "MISTRAL_API_KEY", 205 | "groq": "GROQ_API_KEY", 206 | "openrouter": "OPENROUTER_API_KEY", 207 | "huggingface": "HUGGINGFACE_API_KEY", 208 | "cohere": "COHERE_API_KEY", 209 | } 210 | env_var = env_var_map.get(provider, "API_KEY") 211 | raise ValueError( 212 | f"{provider} provider requires a valid 'api_key'. " 213 | f"Please set {env_var} in your .env file with your actual API key. " 214 | f"Current value appears to be a placeholder." 215 | ) 216 | 217 | return True 218 | 219 | 220 | def validate_llm_config() -> Tuple[bool, Optional[str]]: 221 | """ 222 | Validate LLM configuration. 
223 | 224 | Returns: 225 | Tuple of (is_valid, error_message) 226 | - is_valid: True if LLM config is valid 227 | - error_message: Error message if invalid, None if valid 228 | """ 229 | try: 230 | config = load_llm_config() 231 | validate_llm_config_dict(config) 232 | 233 | return True, None 234 | except ValueError as e: 235 | return False, str(e) 236 | except Exception as e: 237 | return False, f"Error loading LLM configuration: {str(e)}" 238 | 239 | 240 | def validate_logging_config() -> Tuple[bool, Optional[str]]: 241 | """ 242 | Validate logging configuration from environment variables. 243 | 244 | Returns: 245 | Tuple of (is_valid, error_message) 246 | - is_valid: True if logging config is valid 247 | - error_message: Error message if invalid, None if valid 248 | """ 249 | import logging 250 | 251 | # Validate LOG_LEVEL 252 | log_level = os.getenv("LOG_LEVEL", "INFO").upper() 253 | valid_levels = {"DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"} 254 | if log_level not in valid_levels: 255 | return False, ( 256 | f"Invalid LOG_LEVEL: '{log_level}'. " 257 | f"Must be one of: {', '.join(sorted(valid_levels))}" 258 | ) 259 | 260 | # Validate LOG_FORMAT 261 | log_format = os.getenv("LOG_FORMAT", "default").lower() 262 | valid_formats = {"default", "json"} 263 | if log_format not in valid_formats: 264 | return False, ( 265 | f"Invalid LOG_FORMAT: '{log_format}'. " 266 | f"Must be one of: {', '.join(sorted(valid_formats))}" 267 | ) 268 | 269 | # Validate LOG_VERBOSE_CONSOLE 270 | verbose_console = os.getenv("LOG_VERBOSE_CONSOLE", "false").lower() 271 | if verbose_console not in {"true", "false"}: 272 | return False, ( 273 | f"Invalid LOG_VERBOSE_CONSOLE: '{verbose_console}'. 
" 274 | f"Must be 'true' or 'false'" 275 | ) 276 | 277 | # Validate THIRD_PARTY_LOG_LEVEL 278 | third_party_level = os.getenv("THIRD_PARTY_LOG_LEVEL", "ERROR").upper() 279 | if third_party_level not in valid_levels: 280 | return False, ( 281 | f"Invalid THIRD_PARTY_LOG_LEVEL: '{third_party_level}'. " 282 | f"Must be one of: {', '.join(sorted(valid_levels))}" 283 | ) 284 | 285 | # Validate LOG_FILE (if set, check if path is valid format) 286 | log_file = os.getenv("LOG_FILE") 287 | if log_file: 288 | # Check if path contains invalid characters (basic validation) 289 | from pathlib import Path 290 | try: 291 | # Try to create a Path object to validate format 292 | log_path = Path(log_file) 293 | # Check if parent directory can be determined (basic path validation) 294 | if log_path.is_absolute() or log_path.parent != Path("."): 295 | # Path seems valid, but we don't check if directory exists (it will be created) 296 | pass 297 | except (ValueError, OSError) as e: 298 | return False, ( 299 | f"Invalid LOG_FILE path: '{log_file}'. " 300 | f"Error: {str(e)}" 301 | ) 302 | 303 | return True, None 304 | 305 | 306 | def validate_all_config() -> Tuple[bool, List[str]]: 307 | """ 308 | Validate all configuration (CodeQL, LLM, and Logging). 
309 | 310 | Returns: 311 | Tuple of (is_valid, error_messages) 312 | - is_valid: True if all config is valid 313 | - error_messages: List of error messages (empty if valid) 314 | """ 315 | errors: List[str] = [] 316 | 317 | # Validate CodeQL path 318 | codeql_valid, codeql_error = validate_codeql_path() 319 | if not codeql_valid: 320 | errors.append(f"❌ CodeQL Configuration Error:\n{codeql_error}") 321 | 322 | # Validate LLM config 323 | llm_valid, llm_error = validate_llm_config() 324 | if not llm_valid: 325 | errors.append(f"❌ LLM Configuration Error:\n{llm_error}") 326 | 327 | # Validate Logging config 328 | logging_valid, logging_error = validate_logging_config() 329 | if not logging_valid: 330 | errors.append(f"❌ Logging Configuration Error:\n{logging_error}") 331 | 332 | is_valid = len(errors) == 0 333 | return is_valid, errors 334 | 335 | 336 | def validate_and_exit_on_error() -> None: 337 | """ 338 | Validate all configuration and exit with error message if invalid. 339 | 340 | This is the main function to call at startup. 

    Raises:
        LLMConfigError: If LLM configuration is invalid
        CodeQLConfigError: If CodeQL configuration is invalid
        VulnhallaError: If Logging configuration is invalid
    """
    # Imported locally — presumably to avoid a circular import at module
    # load time; TODO confirm against src.utils.exceptions.
    from src.utils.exceptions import LLMConfigError, CodeQLConfigError, VulnhallaError

    is_valid, errors = validate_all_config()

    if not is_valid:
        # All collected errors are reported together as one message block.
        errors_block = "\n\n".join(errors)

        # Classify errors by the marker text embedded in each message.
        has_llm_error = any("LLM" in error for error in errors)
        has_codeql_error = any("CodeQL" in error for error in errors)
        has_logging_error = any("Logging" in error for error in errors)

        # Priority: LLM > CodeQL > Logging
        # NOTE(review): the last two branches both raise VulnhallaError, so the
        # `elif has_logging_error` test is redundant but harmless.
        if has_llm_error:
            raise LLMConfigError(errors_block)
        elif has_codeql_error:
            raise CodeQLConfigError(errors_block)
        elif has_logging_error:
            raise VulnhallaError(errors_block)
        else:
            raise VulnhallaError(errors_block)
--------------------------------------------------------------------------------
/src/ui/results_loader.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
"""
Results loader for parsing issue results from output/results/ directory.
"""

import os
import json
import re
import sys
from pathlib import Path
from typing import Dict, List, Optional, Tuple

# Add project root to path for imports
PROJECT_ROOT = Path(__file__).parent.parent.parent
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

from src.ui.models import Issue
from src.utils.logger import get_logger
from src.utils.exceptions import VulnhallaError

# Module-level logger, configured by src.utils.logger.
logger = get_logger(__name__)


class ResultsLoader:
    """
    Loads and parses issue results from output/results/ directory.
    """

    def __init__(self, results_root: str = "output/results"):
        """
        Initialize the ResultsLoader.
33 | 34 | Args: 35 | results_root (str): Root directory containing analysis results. 36 | Defaults to "output/results". 37 | """ 38 | self.results_root = Path(results_root) 39 | 40 | def extract_status(self, content: str) -> str: 41 | """ 42 | Extract status code from LLM content. 43 | 44 | Args: 45 | content (str): The LLM message content to analyze. 46 | 47 | Returns: 48 | str: Status code - "true" (if 1337 found), "false" (if 1007 found), 49 | or "more" (otherwise). 50 | """ 51 | if not content: 52 | return "more" 53 | content_lower = content.lower() 54 | if "1337" in content_lower: 55 | return "true" 56 | elif "1007" in content_lower: 57 | return "false" 58 | return "more" 59 | 60 | def parse_final_json(self, path: Path) -> Optional[List[Dict]]: 61 | """ 62 | Parse _final.json file containing LLM messages. 63 | 64 | Handles both valid JSON and malformed Python list representations. 65 | 66 | Args: 67 | path (Path): Path to the _final.json file. 68 | 69 | Returns: 70 | Optional[List[Dict]]: List of message dictionaries, or None if parsing fails. 
        """
        try:
            with open(path, 'r', encoding='utf-8') as f:
                content = f.read()
        except FileNotFoundError as e:
            logger.error("File not found: %s", path)
            return None
        except PermissionError as e:
            logger.error("Permission denied reading file: %s", path)
            return None
        except OSError as e:
            logger.error("OS error reading file: %s", path)
            return None

        try:
            return json.loads(content)
        except json.JSONDecodeError:
            # Parse manually.
            # The file presumably holds Python's str() of a list of dicts
            # (single-quoted keys) rather than json.dumps output — TODO confirm
            # against the writer in the pipeline.
            messages = []
            for match in re.finditer(r"\{'role':", content):
                start = match.start()
                # Find the matching closing brace, tracking quote state so
                # braces inside string values are ignored.
                brace_count = 0
                end = start
                in_single_quote = False
                in_double_quote = False
                escape_next = False
                for i in range(start, len(content)):
                    char = content[i]
                    if escape_next:
                        escape_next = False
                        continue
                    if char == '\\':
                        escape_next = True
                        continue
                    # NOTE(review): `not escape_next` below is always true here,
                    # since escaped characters were consumed above.
                    if char == "'" and not escape_next and not in_double_quote:
                        in_single_quote = not in_single_quote
                        continue
                    if char == '"' and not escape_next and not in_single_quote:
                        in_double_quote = not in_double_quote
                        continue
                    if not in_single_quote and not in_double_quote:
                        if char == '{':
                            brace_count += 1
                        elif char == '}':
                            brace_count -= 1
                            if brace_count == 0:
                                end = i + 1
                                break

                dict_str = content[start:end]

                role_match = re.search(r"'role':\s*['\"]([^'\"]+)['\"]", dict_str)
                # Extract content field
                content_match = None

                # Determine which quote type used for content
                content_key_pos = dict_str.find("'content':")
                if content_key_pos >= 0:
                    # Find the quote character after 'content':
                    quote_start = content_key_pos + len("'content':")
                    # Skip whitespace
                    while quote_start < len(dict_str) and dict_str[quote_start] in ' \t\n':
                        quote_start += 1
                    if quote_start < len(dict_str):
                        quote_char = dict_str[quote_start]
                        if quote_char == '"':
                            content_pattern = r"'content':\s*\"((?:[^\"\\]|\\.)*)\""
                            content_match = re.search(content_pattern, dict_str, re.DOTALL)
                        elif quote_char == "'":
                            content_pattern = r"'content':\s*'((?:[^'\\]|\\.|'')*)'"
                            content_match = re.search(content_pattern, dict_str, re.DOTALL)

                # Fallback: try both quote styles if the targeted match failed.
                if not content_match:
                    content_pattern = r"'content':\s*'((?:[^'\\]|\\.|'')*)'"
                    content_match = re.search(content_pattern, dict_str, re.DOTALL)
                    if not content_match:
                        content_pattern = r"'content':\s*\"((?:[^\"\\]|\\.)*)\""
                        content_match = re.search(content_pattern, dict_str, re.DOTALL)

                if role_match and content_match:
                    content_str = content_match.group(1)
                    # Undo the repr-style escaping of the captured value.
                    content_str = content_str.replace('\\n', '\n').replace("\\'", "'").replace('\\"', '"').replace('\\\\', '\\')
                    messages.append({
                        'role': role_match.group(1),
                        'content': content_str
                    })
            return messages if messages else None
        except Exception:
            # NOTE(review): this handler belongs to the outer try, so it only
            # covers non-JSONDecodeError failures of json.loads — exceptions
            # raised inside the manual fallback above would propagate.
            return None

    def parse_raw_json(self, path: Path) -> Optional[Dict]:
        """
        Parse _raw.json file containing original CodeQL issue data.

        Args:
            path (Path): Path to the _raw.json file.

        Returns:
            Optional[Dict]: Parsed JSON data as a dictionary, or None if parsing fails.
        """
        try:
            with open(path, "r", encoding="utf-8") as f:
                # Raw files may contain literal newlines inside JSON string
                # values; escaping every newline repairs that (assumes the
                # JSON is otherwise written on a single line — TODO confirm
                # against the writer in the pipeline).
                return json.loads(f.read().replace("\n", "\\n"))
        except FileNotFoundError as e:
            logger.error("File not found: %s", path)
            return None
        except PermissionError as e:
            logger.error("Permission denied reading file: %s", path)
            return None
        except json.JSONDecodeError as e:
            logger.error("JSON error parsing %s: %s", path, e)
            return None
        except OSError as e:
            logger.error("OS error reading file: %s", path)
            return None

    @staticmethod
    def _extract_issue_name(raw_data: Dict, issue_type: str) -> str:
        """
        Extract issue name from raw_data.

        Args:
            raw_data (Dict): Raw JSON data containing issue information.
            issue_type (str): Fallback issue type if name cannot be extracted.

        Returns:
            str: Issue name extracted from prompt or function name, or issue_type as fallback.
        """
        # Default: the analyzed function's name, then the issue type.
        issue_name = raw_data.get("current_function", {}).get("function_name", issue_type)
        # A "Name: ..." line embedded in the prompt, when present, wins.
        if "prompt" in raw_data:
            name_match = re.search(r'Name:\s*([^\n]+)', raw_data["prompt"])
            if name_match:
                return name_match.group(1).strip()
        return issue_name

    @staticmethod
    def _extract_file_info(raw_data: Dict) -> tuple[str, int]:
        """
        Extract file basename and line number from raw_data.

        Args:
            raw_data (Dict): Raw JSON data containing function information.

        Returns:
            tuple[str, int]: Tuple of (file_basename, line_number).
                Falls back to ("unknown", 0) when fields are absent.
        """
        func = raw_data.get("current_function", {})
        file_path = func.get("file", "")
        return (os.path.basename(file_path) if file_path else "unknown", int(func.get("start_line", 0)))

    @staticmethod
    def _extract_repo_from_db_path(db_path: str) -> str:
        """
        Extract repository name (org/repo) from database path.

        Database path structure: output/databases/<lang>/<org>/<repo>
        We extract the repo name from the basename of db_path, and the org name from
        the parent directory.

        Args:
            db_path (str): The database path from raw_data (e.g., "output/databases/c/redis/cpp")

        Returns:
            str: Repository name in format "org/repo" (e.g., "redis/cpp")
        """
        if not db_path:
            return "unknown/unknown"

        try:
            # DB path structure: output/databases/<lang>/<org>/<repo>
            # Example: output/databases/c/redis/cpp
            repo_name = os.path.basename(db_path)
            parent_dir = os.path.dirname(db_path)
            org_name = os.path.basename(parent_dir)

            if org_name and repo_name:
                return f"{org_name}/{repo_name}"
            else:
                return "unknown/unknown"
        except Exception:
            # Malformed paths degrade to the sentinel rather than failing the load.
            return "unknown/unknown"

    def load_all_issues(self, lang: str) -> Tuple[List[Issue], List[str]]:
        """
        Scan output/results/<lang>/<issue_type>/ and load all issues.

        Args:
            lang (str): Language code to scan (e.g., "c").

        Returns:
            Tuple[List[Issue], List[str]]:
                - List of Issue objects loaded from all issue type directories.
                - List of error messages for files that failed to load.
        """
        issues = []
        errors = []
        lang_dir = self.results_root / lang

        # No results for this language yet — return empty, not an error.
        if not lang_dir.exists():
            return issues, errors

        # Scan each issue_type directory
        for issue_type_dir in lang_dir.iterdir():
            if not issue_type_dir.is_dir():
                continue

            issue_type = issue_type_dir.name

            # Find all _final.json files
            for final_file in issue_type_dir.glob("*_final.json"):
                # Extract issue ID from filename
                issue_id = final_file.stem.replace("_final", "")

                # Find corresponding _raw.json
                raw_file = final_file.parent / f"{issue_id}_raw.json"

                if not raw_file.exists():
                    errors.append(f"Missing raw file for issue {issue_id}: {raw_file}")
                    continue

                # Parse JSON files
                final_data = self.parse_final_json(final_file)
                raw_data = self.parse_raw_json(raw_file)

                if not final_data:
                    errors.append(f"Failed to parse final JSON: {final_file}")
                    continue

                if not raw_data:
                    errors.append(f"Failed to parse raw JSON: {raw_file}")
                    continue

                file_basename, start_line = self._extract_file_info(raw_data)
                issue_name = self._extract_issue_name(raw_data, issue_type)

                # Extract repo from db_path in raw_data
                db_path = raw_data.get("db_path", "")
                repo = self._extract_repo_from_db_path(db_path) if db_path else "unknown/unknown"

                # Extract status from final_data
                status = "more"
                # Try to find status in assistant messages, newest first.
                for msg in reversed(final_data):
                    if isinstance(msg, dict) and msg.get("role", "").lower() == "assistant":
                        content = msg.get("content", "")
                        if content:
                            status = self.extract_status(content)
                            if status != "more":
                                break
                # No status found in assistant messages, check all messages
                if status == "more":
                    for msg in reversed(final_data):
                        if isinstance(msg, dict) and "content" in
msg: 325 | status = self.extract_status(msg.get("content", "")) 326 | if status != "more": 327 | break 328 | 329 | issue = Issue( 330 | id=issue_id, 331 | name=issue_name, 332 | file=file_basename, 333 | line=start_line, 334 | status=status, 335 | issue_type=issue_type, 336 | lang=lang, 337 | repo=repo, 338 | raw_path=str(raw_file), 339 | final_path=str(final_file), 340 | raw_data=raw_data, 341 | final_data=final_data 342 | ) 343 | issues.append(issue) 344 | 345 | return issues, errors 346 | 347 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Vulnhalla 2 | # Automated CodeQL Analysis with LLM Classification 3 | 4 |
5 | Vulnhalla 6 |
7 | 8 | For a detailed overview of the research and motivation behind Vulnhalla, see the official CyberArk Threat Research blog post: 9 | 10 | **["Vulnhalla: Picking the True Vulnerabilities from the CodeQL Haystack"](https://www.cyberark.com/resources/threat-research-blog/vulnhalla-picking-the-true-vulnerabilities-from-the-codeql-haystack)** 11 | 12 | ### Vulnhalla automates the complete security analysis pipeline: 13 | 14 | 1. **Fetching repositories** of a given programming language from GitHub 15 | 2. **Downloading** their corresponding [CodeQL](https://github.com/github/codeql) databases (if available) 16 | 3. **Running CodeQL queries** on those databases to detect security or code-quality issues 17 | 4. **Post-processing** the results with an LLM (ChatGPT, Gemini, etc.) to classify and filter issues 18 | 19 | --- 20 | 21 | ## 🚀 Quick Start 22 | 23 | ### Step 1: Prerequisites 24 | 25 | Before starting, ensure you have: 26 | 27 | - **Python 3.10 – 3.13** (Python 3.11 or 3.12 recommended) 28 | - Python 3.14+ is not supported (this tool uses grpcio which is not supported by Python 3.14+) 29 | - Download from [python.org](https://www.python.org/downloads/) 30 | 31 | - **CodeQL CLI** 32 | - Download from [CodeQL CLI releases](https://github.com/github/codeql-cli-binaries/releases) 33 | - Make sure `codeql` is in your PATH, or you'll set the path in `.env` (see Step 2) 34 | 35 | - **(Optional) GitHub API token** 36 | - For higher rate limits when downloading databases 37 | - Get from [GitHub Settings > Tokens](https://github.com/settings/tokens) 38 | 39 | - **LLM API key** 40 | - OpenAI, Azure, or Gemini API key (depending on your provider) 41 | 42 | ### Step 2: Configure Environment 43 | 44 | All configuration is in a single file: `.env` 45 | 46 | 1. **Clone the repository:** 47 | ```bash 48 | git clone https://github.com/cyberark/Vulnhalla 49 | cd Vulnhalla 50 | ``` 51 | 52 | 2. 
**Copy `.env.example` to `.env`:** 53 | ```bash 54 | cp .env.example .env 55 | ``` 56 | 57 | 3. **Edit `.env` and fill in your values:** 58 | 59 | **Example for OpenAI:** 60 | ```env 61 | CODEQL_PATH=codeql 62 | GITHUB_TOKEN=ghp_your_token_here 63 | PROVIDER=openai 64 | MODEL=gpt-4o 65 | OPENAI_API_KEY=your-api-key-here 66 | LLM_TEMPERATURE=0.2 67 | LLM_TOP_P=0.2 68 | 69 | # Optional: Logging Configuration 70 | LOG_LEVEL=INFO # DEBUG, INFO, WARNING, ERROR 71 | LOG_FILE= # Optional: path to log file (e.g., logs/vulnhalla.log) 72 | LOG_FORMAT=default # default or json 73 | # LOG_VERBOSE_CONSOLE=false # If true, WARNING/ERROR use full format (timestamp - logger - level - message) 74 | ``` 75 | 76 | > **📖 For complete configuration reference:** See [Configuration Reference](#-configuration-reference) below for all supported providers (OpenAI, Azure, Gemini), required/optional variables, and detailed examples. 77 | 78 | **Optional:** Create a virtual environment: 79 | 80 | ```bash 81 | # (Optional) Create virtual environment 82 | python3 -m venv venv 83 | venv\Scripts\activate # On Windows 84 | # On MacOS/Linux: source venv/bin/activate 85 | ``` 86 | 87 | ### Step 3: setup 88 | 89 | **Option 1: Automated Setup (Recommended)** 90 | 91 | ```bash 92 | python setup.py 93 | ``` 94 | 95 | **Note:** Virtual environment is optional. If `venv/` exists, setup will use it. Otherwise, it installs to your current Python environment. 96 | 97 | The setup script will: 98 | - Install Python dependencies from `requirements.txt` 99 | - Initialize CodeQL packs 100 | 101 | **Option 2: Manual Setup** 102 | 103 | If you prefer to install manually: 104 | 105 | ### Install dependencies 106 | ```bash 107 | pip install -r requirements.txt 108 | ``` 109 | 110 | ### Initialize CodeQL packs 111 | ```bash 112 | cd data/queries/cpp/tools 113 | codeql pack install 114 | cd ../issues 115 | codeql pack install 116 | cd ../../../.. 
117 | ``` 118 | 119 | ### Step 4: Run the Pipeline 120 | 121 | **Option 1: Using the Unified Pipeline** 122 | 123 | Run the complete pipeline with a single command: 124 | 125 | ```bash 126 | # Analyze a specific repository 127 | python src/pipeline.py redis/redis 128 | 129 | # Analyze top 100 repositories 130 | python src/pipeline.py 131 | ``` 132 | 133 | This will automatically: 134 | 1. Fetch CodeQL databases 135 | 2. Run CodeQL queries on all downloaded databases 136 | 3. Analyze results with LLM and save to `output/results/` 137 | 4. Open the UI to browse results 138 | 139 | **Option 2: Using the Example Script** 140 | 141 | Run the end-to-end example: 142 | 143 | ```bash 144 | python examples/example.py 145 | ``` 146 | 147 | This will: 148 | 1. Fetch CodeQL databases for `videolan/vlc` and `redis/redis` 149 | 2. Run CodeQL queries on all downloaded databases 150 | 3. Analyze results with LLM and save to `output/results/` 151 | 152 | --- 153 | 154 | ## 🖥️ User Interface (UI) 155 | 156 | Vulnhalla includes a full-featured User Interface for browsing and exploring analysis results. 157 | 158 | ### Running the UI 159 | 160 | ```bash 161 | python src/ui/ui_app.py 162 | # or 163 | python examples/ui_example.py 164 | ``` 165 | 166 | ### UI Layout 167 | 168 | The UI displays a two-panel top area with a controls bar at the bottom: 169 | 170 | **Top Area (side-by-side, resizable):** 171 | 172 | - **Left Panel (Issues List):** 173 | - DataTable showing: **ID**, **Repo**, **Issue Name**, **File**, **LLM decision**, **Manual decision** 174 | - Issues count and sort indicator 175 | - Search input box at the bottom, updates as you type (case-insensitive). 
176 | 177 | - **Right Panel (Details):** 178 | - **LLM decision Section**: Shows the LLM's classification (True Positive, False Positive, or Needs More Data) 179 | - **Metadata Section**: Issue name, Repo, File, Line, Type, Function name 180 | - **Code Section**: 181 | - 📌 Initial Code Context (first code snippet the LLM saw) 182 | - 📥 Additional Code (code that the LLM requested during the conversation) - only shown if additional code exists 183 | - Vulnerable line highlighted in red 184 | - **Summary Section**: LLM final answer/decision 185 | - **Manual Decision Select**: Dropdown at the bottom to set manual verdict (True Positive, False Positive, Uncertain, or Not Set) 186 | 187 | **Bottom Controls Bar:** 188 | 189 | - Language: C (only language currently supported) 190 | - Filter by LLM decision dropdown: All, True Positive, False Positive, Needs more Info to decide 191 | - Action buttons: Refresh, Run Analysis 192 | - Key bindings help text 193 | 194 | ### Key Bindings 195 | 196 | - `↑`/`↓` - Navigate issue list (row-by-row) 197 | - `Tab` / `Shift+Tab` - Switch focus between panels 198 | - `Enter` - Show details for selected issue 199 | - `/` - Focus search input box (in left panel) 200 | - `Esc` - Clear search and return focus to issues table 201 | - `r` - Reload results from disk 202 | - `[` / `]` - Resize left/right panels (adjust split position) 203 | - `q` - Quit application 204 | 205 | ### Interactive Features 206 | 207 | #### Column Sorting 208 | 209 | - **Click any column header** to sort by that column 210 | - Default sorting: by Repo (ascending), then by ID (ascending) 211 | 212 | #### Resizable Panels 213 | 214 | - **Draggable divider** between Issues List and Details panels 215 | - **Mouse**: Click and drag the divider to resize 216 | - **Keyboard**: Use `[` to move divider left, `]` to move divider right 217 | - Split position is remembered during the session 218 | 219 | --- 220 | 221 | ## 📊 Output Structure 222 | 223 | After running the pipeline, 
results are organized in `output/results///`: 224 | 225 | ``` 226 | output/results/c/Copy_function_using_source_size/ 227 | ├── 1_raw.json # Original CodeQL issue data 228 | ├── 1_final.json # LLM conversation and classification 229 | ├── 2_raw.json 230 | ├── 2_final.json 231 | └── ... 232 | ``` 233 | 234 | Each `*_final.json` contains: 235 | - Full LLM conversation (system prompts, user messages, assistant responses, tool calls) 236 | - Final status code (1337 = vulnerable, 1007 = secure, 7331/3713 = needs more info) 237 | 238 | Each `*_raw.json` contains: 239 | - Original CodeQL issue data 240 | - Function context 241 | - Database path (includes org/repo information: `output/databases///`) 242 | - Issue location 243 | 244 | --- 245 | 246 | ## 🛠 Troubleshooting 247 | 248 | - **CodeQL CLI not found**: 249 | Set `CODEQL_PATH` in your `.env` file to the full path of your CodeQL executable. 250 | **On Windows**: The path must end with `.cmd` (e.g., `C:\path\to\codeql\codeql.cmd`). 251 | 252 | - **GitHub rate limits**: 253 | Set `GITHUB_TOKEN` in your `.env` file (get token from https://github.com/settings/tokens). 254 | 255 | - **LLM issues**: 256 | Check your API keys in `.env` file match your selected provider. 257 | 258 | - **Import errors in UI**: 259 | Make sure you're running from the project root directory, or use `python examples/ui_example.py` which handles path setup. 260 | 261 | --- 262 | 263 | ## ⚙️ Configuration Reference 264 | 265 | ### Environment Variables 266 | 267 | All configuration is managed through environment variables in your `.env` file. Here's a complete reference: 268 | 269 | #### Required Variables 270 | 271 | | Variable | Required For | Description | 272 | |----------|--------------|-------------| 273 | | `CODEQL_PATH` | All | Path to CodeQL executable. Defaults to `codeql` if CodeQL is in PATH. 
Use full path if not in PATH (e.g., `C:\path\to\codeql\codeql.cmd` on Windows) | 274 | | `PROVIDER` | All | LLM provider: `openai`, `azure`, or `gemini` | 275 | | `MODEL` | All | Model name (e.g., `gpt-4o`, `gpt-4-turbo`, `gemini-2.5-flash`) | 276 | 277 | #### Provider-Specific Required Variables 278 | 279 | **OpenAI:** 280 | | Variable | Description | 281 | |----------|-------------| 282 | | `OPENAI_API_KEY` | Your OpenAI API key from [platform.openai.com](https://platform.openai.com/api-keys) | 283 | 284 | **Azure OpenAI:** 285 | | Variable | Description | 286 | |----------|-------------| 287 | | `AZURE_OPENAI_API_KEY` or `AZURE_API_KEY` | Your Azure OpenAI API key | 288 | | `AZURE_OPENAI_ENDPOINT` or `AZURE_API_BASE` | Your Azure OpenAI endpoint URL (e.g., `https://your-resource.openai.azure.com`) | 289 | | `AZURE_OPENAI_API_VERSION` or `AZURE_API_VERSION` | API version (default: `2024-08-01-preview`) | 290 | 291 | **Gemini (Google):** 292 | | Variable | Description | 293 | |----------|-------------| 294 | | `GOOGLE_API_KEY` | Your Google API key from [Google AI Studio](https://makersuite.google.com/app/apikey) | 295 | 296 | #### Optional Variables 297 | 298 | | Variable | Default | Description | 299 | |----------|---------|-------------| 300 | | `GITHUB_TOKEN` | - | GitHub API token for higher rate limits. Get from [GitHub Settings > Tokens](https://github.com/settings/tokens) | 301 | | `LLM_TEMPERATURE` | `0.2` | LLM temperature (0.0-2.0). Lower = more deterministic. **Recommended: keep at 0.2** | 302 | | `LLM_TOP_P` | `0.2` | LLM top-p sampling (0.0-1.0). Lower = more focused. **Recommended: keep at 0.2** | 303 | | `LOG_LEVEL` | `INFO` | Logging level: `DEBUG`, `INFO`, `WARNING`, or `ERROR`. Controls verbosity of console output | 304 | | `LOG_FILE` | - | Optional path to log file (e.g., `logs/vulnhalla.log`). If set, logs are written to both console and file. 
File logging uses DEBUG level for detailed output | 305 | | `LOG_FORMAT` | `default` | Log format style: `default` (human-readable), or `json` (structured JSON format) | 306 | | `LOG_VERBOSE_CONSOLE` | `false` | If `true`, WARNING/ERROR/CRITICAL use full format (timestamp - logger - level - message). Default: WARNING/ERROR use simple format (LEVEL - message), INFO always minimal (message only) | 307 | | `THIRD_PARTY_LOG_LEVEL` | `ERROR` | Log level for third-party libraries (LiteLLM, urllib3, requests). Options: `DEBUG`, `INFO`, `WARNING`, `ERROR`. Default suppresses most third-party noise | 308 | 309 | > **⚠️ Important:** Do not increase `LLM_TEMPERATURE` or `LLM_TOP_P` unless you fully understand the impact. Lower values keep the model stable and deterministic, which is critical for security analysis. Higher values may cause the model to become inconsistent, creative, or hallucinate results. 310 | 311 | > **📝 Note:** For additional configuration examples, see the `.env.example` file in the project root. 312 | 313 | ### Configuration Validation 314 | 315 | Vulnhalla validates your configuration at startup. If required variables are missing or invalid, you'll see clear error messages indicating what needs to be fixed. 
316 | 317 | **Common validation errors:** 318 | - Missing API key for selected provider 319 | - Invalid provider name (must be `openai`, `azure`, or `gemini`) 320 | - Missing Azure endpoint (required for Azure provider) 321 | - Invalid CodeQL path (if `CODEQL_PATH` is set but file doesn't exist) 322 | 323 | --- 324 | 325 | ## 📝 Status Codes 326 | 327 | The LLM uses the following status codes: 328 | 329 | - **1337**: Security vulnerability found (True Positive) 330 | - **1007**: Code is secure, no vulnerability (False Positive) 331 | - **7331**: More code/information needed to validate security 332 | - **3713**: Likely not a security problem, but more info needed (used with 7331) 333 | 334 | The UI maps these to: 335 | - `1337` → "True Positive" 336 | - `1007` → "False Positive" 337 | - `7331` or `3713` → "Needs More Data" 338 | 339 | --- 340 | 341 | ## 🔧 Development 342 | 343 | ### Running Tests 344 | 345 | The project includes basic test infrastructure using pytest: 346 | 347 | ```bash 348 | # Run all tests 349 | pytest 350 | 351 | # Run with verbose output 352 | pytest -v 353 | ``` 354 | 355 | The test suite includes smoke tests to verify the test infrastructure is set up correctly. 356 | 357 | ### Project Dependencies 358 | 359 | See `requirements.txt` for Python dependencies: 360 | - `requests` - HTTP requests for GitHub API 361 | - `pySmartDL` - Smart download manager for CodeQL databases 362 | - `litellm` - Unified LLM interface supporting multiple providers 363 | - `python-dotenv` - Environment variable management 364 | - `PyYAML` - YAML parsing for CodeQL pack files 365 | - `textual` - Terminal UI framework 366 | - `pytest` - Testing framework 367 | 368 | ### CodeQL Queries 369 | 370 | CodeQL queries are organized in `data/queries//`: 371 | - `issues/` - Security issue detection queries 372 | - `tools/` - Helper queries (function trees, classes, global variables, macros) 373 | 374 | Each directory contains a `qlpack.yml` file defining the CodeQL pack. 
375 | 376 | --- 377 | 378 | ## 📄 License 379 | 380 | Copyright (c) 2025 CyberArk Software Ltd. All rights reserved. 381 | 382 | This repository is licensed under the Apache License, Version 2.0 - see [LICENSE.txt](LICENSE.txt) for more details. 383 | 384 | --- 385 | 386 | ## 🤝 Contributing 387 | 388 | 389 | We welcome contributions of all kinds to this repository. For instructions on how to get started and descriptions of our development workflows, please see our [contributing guide](https://github.com/cyberark/Vulnhalla/blob/main/CONTRIBUTING.md). 390 | 391 | --- 392 | ### Code of Conduct 393 | 394 | Please read and follow our [Code of Conduct](CODE_OF_CONDUCT.md). We are committed to providing a welcoming and inclusive environment for all contributors. 395 | 396 | --- 397 | 398 | ## 📧 Contact 399 | 400 | Feel free to contact us via GitHub issues if you have any feature requests or project issues. 401 | -------------------------------------------------------------------------------- /NOTICES.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2025 CyberArk Software Ltd. 
All rights reserved 2 | 3 | Vulnhalla is using the following open source components: 4 | 5 | 1) Requests (https://github.com/psf/requests/blob/main/LICENSE) : Apache-2.0 License 6 | Copyright 2019 Kenneth Reitz 7 | 8 | 2) python-dotenv (https://github.com/theskumar/python-dotenv/blob/main/LICENSE) : BSD-3-Clause License 9 | Copyright (c) 2014, Saurabh Kumar (python-dotenv), 2013, Ted Tieken (django-dotenv-rw), 2013, Jacob Kaplan-Moss (django-dotenv) 10 | 11 | 3) litellm (https://github.com/BerriAI/litellm/blob/main/LICENSE) : MIT License 12 | Copyright (c) 2023 Berri AI 13 | 14 | 4) pyYaml (https://pyyaml.org/) : MIT License 15 | Copyright (c) 2017-2021 Ingy döt Net 16 | Copyright (c) 2006-2016 Kirill Simonov 17 | 18 | 5) textual (https://github.com/Textualize/textual/blob/main/LICENSE) : MIT License 19 | Copyright (c) 2021 Will McGugan 20 | 21 | 6) pySmartDL (http://pypi.python.org/pypi/pySmartDL/) : Public Domain 22 | 23 | ===================================================================== 24 | 1) 25 | 26 | Apache License 27 | Version 2.0, January 2004 28 | http://www.apache.org/licenses/ 29 | 30 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 31 | 32 | 1. Definitions. 33 | 34 | "License" shall mean the terms and conditions for use, reproduction, 35 | and distribution as defined by Sections 1 through 9 of this document. 36 | 37 | "Licensor" shall mean the copyright owner or entity authorized by 38 | the copyright owner that is granting the License. 39 | 40 | "Legal Entity" shall mean the union of the acting entity and all 41 | other entities that control, are controlled by, or are under common 42 | control with that entity. For the purposes of this definition, 43 | "control" means (i) the power, direct or indirect, to cause the 44 | direction or management of such entity, whether by contract or 45 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 46 | outstanding shares, or (iii) beneficial ownership of such entity. 
47 | 48 | "You" (or "Your") shall mean an individual or Legal Entity 49 | exercising permissions granted by this License. 50 | 51 | "Source" form shall mean the preferred form for making modifications, 52 | including but not limited to software source code, documentation 53 | source, and configuration files. 54 | 55 | "Object" form shall mean any form resulting from mechanical 56 | transformation or translation of a Source form, including but 57 | not limited to compiled object code, generated documentation, 58 | and conversions to other media types. 59 | 60 | "Work" shall mean the work of authorship, whether in Source or 61 | Object form, made available under the License, as indicated by a 62 | copyright notice that is included in or attached to the work 63 | (an example is provided in the Appendix below). 64 | 65 | "Derivative Works" shall mean any work, whether in Source or Object 66 | form, that is based on (or derived from) the Work and for which the 67 | editorial revisions, annotations, elaborations, or other modifications 68 | represent, as a whole, an original work of authorship. For the purposes 69 | of this License, Derivative Works shall not include works that remain 70 | separable from, or merely link (or bind by name) to the interfaces of, 71 | the Work and Derivative Works thereof. 72 | 73 | "Contribution" shall mean any work of authorship, including 74 | the original version of the Work and any modifications or additions 75 | to that Work or Derivative Works thereof, that is intentionally 76 | submitted to Licensor for inclusion in the Work by the copyright owner 77 | or by an individual or Legal Entity authorized to submit on behalf of 78 | the copyright owner. 
For the purposes of this definition, "submitted" 79 | means any form of electronic, verbal, or written communication sent 80 | to the Licensor or its representatives, including but not limited to 81 | communication on electronic mailing lists, source code control systems, 82 | and issue tracking systems that are managed by, or on behalf of, the 83 | Licensor for the purpose of discussing and improving the Work, but 84 | excluding communication that is conspicuously marked or otherwise 85 | designated in writing by the copyright owner as "Not a Contribution." 86 | 87 | "Contributor" shall mean Licensor and any individual or Legal Entity 88 | on behalf of whom a Contribution has been received by Licensor and 89 | subsequently incorporated within the Work. 90 | 91 | 2. Grant of Copyright License. Subject to the terms and conditions of 92 | this License, each Contributor hereby grants to You a perpetual, 93 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 94 | copyright license to reproduce, prepare Derivative Works of, 95 | publicly display, publicly perform, sublicense, and distribute the 96 | Work and such Derivative Works in Source or Object form. 97 | 98 | 3. Grant of Patent License. Subject to the terms and conditions of 99 | this License, each Contributor hereby grants to You a perpetual, 100 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 101 | (except as stated in this section) patent license to make, have made, 102 | use, offer to sell, sell, import, and otherwise transfer the Work, 103 | where such license applies only to those patent claims licensable 104 | by such Contributor that are necessarily infringed by their 105 | Contribution(s) alone or by combination of their Contribution(s) 106 | with the Work to which such Contribution(s) was submitted. 
If You 107 | institute patent litigation against any entity (including a 108 | cross-claim or counterclaim in a lawsuit) alleging that the Work 109 | or a Contribution incorporated within the Work constitutes direct 110 | or contributory patent infringement, then any patent licenses 111 | granted to You under this License for that Work shall terminate 112 | as of the date such litigation is filed. 113 | 114 | 4. Redistribution. You may reproduce and distribute copies of the 115 | Work or Derivative Works thereof in any medium, with or without 116 | modifications, and in Source or Object form, provided that You 117 | meet the following conditions: 118 | 119 | (a) You must give any other recipients of the Work or 120 | Derivative Works a copy of this License; and 121 | 122 | (b) You must cause any modified files to carry prominent notices 123 | stating that You changed the files; and 124 | 125 | (c) You must retain, in the Source form of any Derivative Works 126 | that You distribute, all copyright, patent, trademark, and 127 | attribution notices from the Source form of the Work, 128 | excluding those notices that do not pertain to any part of 129 | the Derivative Works; and 130 | 131 | (d) If the Work includes a "NOTICE" text file as part of its 132 | distribution, then any Derivative Works that You distribute must 133 | include a readable copy of the attribution notices contained 134 | within such NOTICE file, excluding those notices that do not 135 | pertain to any part of the Derivative Works, in at least one 136 | of the following places: within a NOTICE text file distributed 137 | as part of the Derivative Works; within the Source form or 138 | documentation, if provided along with the Derivative Works; or, 139 | within a display generated by the Derivative Works, if and 140 | wherever such third-party notices normally appear. The contents 141 | of the NOTICE file are for informational purposes only and 142 | do not modify the License. 
You may add Your own attribution 143 | notices within Derivative Works that You distribute, alongside 144 | or as an addendum to the NOTICE text from the Work, provided 145 | that such additional attribution notices cannot be construed 146 | as modifying the License. 147 | 148 | You may add Your own copyright statement to Your modifications and 149 | may provide additional or different license terms and conditions 150 | for use, reproduction, or distribution of Your modifications, or 151 | for any such Derivative Works as a whole, provided Your use, 152 | reproduction, and distribution of the Work otherwise complies with 153 | the conditions stated in this License. 154 | 155 | 5. Submission of Contributions. Unless You explicitly state otherwise, 156 | any Contribution intentionally submitted for inclusion in the Work 157 | by You to the Licensor shall be under the terms and conditions of 158 | this License, without any additional terms or conditions. 159 | Notwithstanding the above, nothing herein shall supersede or modify 160 | the terms of any separate license agreement you may have executed 161 | with Licensor regarding such Contributions. 162 | 163 | 6. Trademarks. This License does not grant permission to use the trade 164 | names, trademarks, service marks, or product names of the Licensor, 165 | except as required for reasonable and customary use in describing the 166 | origin of the Work and reproducing the content of the NOTICE file. 167 | 168 | 7. Disclaimer of Warranty. Unless required by applicable law or 169 | agreed to in writing, Licensor provides the Work (and each 170 | Contributor provides its Contributions) on an "AS IS" BASIS, 171 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 172 | implied, including, without limitation, any warranties or conditions 173 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 174 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 175 | appropriateness of using or redistributing the Work and assume any 176 | risks associated with Your exercise of permissions under this License. 177 | 178 | 8. Limitation of Liability. In no event and under no legal theory, 179 | whether in tort (including negligence), contract, or otherwise, 180 | unless required by applicable law (such as deliberate and grossly 181 | negligent acts) or agreed to in writing, shall any Contributor be 182 | liable to You for damages, including any direct, indirect, special, 183 | incidental, or consequential damages of any character arising as a 184 | result of this License or out of the use or inability to use the 185 | Work (including but not limited to damages for loss of goodwill, 186 | work stoppage, computer failure or malfunction, or any and all 187 | other commercial damages or losses), even if such Contributor 188 | has been advised of the possibility of such damages. 189 | 190 | 9. Accepting Warranty or Additional Liability. While redistributing 191 | the Work or Derivative Works thereof, You may choose to offer, 192 | and charge a fee for, acceptance of support, warranty, indemnity, 193 | or other liability obligations and/or rights consistent with this 194 | License. However, in accepting such obligations, You may act only 195 | on Your own behalf and on Your sole responsibility, not on behalf 196 | of any other Contributor, and only if You agree to indemnify, 197 | defend, and hold each Contributor harmless for any liability 198 | incurred by, or claims asserted against, such Contributor by reason 199 | of your accepting any such warranty or additional liability. 
200 | 201 | ===================================================================== 202 | 2) BSD-3-Clause License 203 | 204 | Copyright (c) 2014, Saurabh Kumar (python-dotenv), 2013, Ted Tieken (django-dotenv-rw), 2013, Jacob Kaplan-Moss (django-dotenv) 205 | Redistribution and use in source and binary forms, with or without modification, 206 | are permitted provided that the following conditions are met: 207 | - Redistributions of source code must retain the above copyright notice, 208 | this list of conditions and the following disclaimer. 209 | - Redistributions in binary form must reproduce the above copyright notice, 210 | this list of conditions and the following disclaimer in the documentation 211 | and/or other materials provided with the distribution. 212 | - Neither the name of django-dotenv nor the names of its contributors 213 | may be used to endorse or promote products derived from this software 214 | without specific prior written permission. 215 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 216 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 217 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 218 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 219 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 220 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 221 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 222 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 223 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 224 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 225 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
226 | 227 | ===================================================================== 228 | 3) MIT License 229 | 230 | Copyright (c) 2023 Berri AI 231 | 232 | Permission is hereby granted, free of charge, to any person obtaining a copy 233 | of this software and associated documentation files (the "Software"), to deal 234 | in the Software without restriction, including without limitation the rights 235 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 236 | copies of the Software, and to permit persons to whom the Software is 237 | furnished to do so, subject to the following conditions: 238 | 239 | The above copyright notice and this permission notice shall be included in all 240 | copies or substantial portions of the Software. 241 | 242 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 243 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 244 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 245 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 246 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 247 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 248 | SOFTWARE. 
249 | 250 | ===================================================================== 251 | 4) MIT License 252 | 253 | Copyright (c) 2017-2021 Ingy döt Net 254 | Copyright (c) 2006-2016 Kirill Simonov 255 | 256 | Permission is hereby granted, free of charge, to any person obtaining a copy of 257 | this software and associated documentation files (the "Software"), to deal in 258 | the Software without restriction, including without limitation the rights to 259 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 260 | of the Software, and to permit persons to whom the Software is furnished to do 261 | so, subject to the following conditions: 262 | 263 | The above copyright notice and this permission notice shall be included in all 264 | copies or substantial portions of the Software. 265 | 266 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 267 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 268 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 269 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 270 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 271 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 272 | SOFTWARE. 
273 | 274 | ===================================================================== 275 | 5) MIT License 276 | 277 | Copyright (c) 2021 Will McGugan 278 | 279 | Permission is hereby granted, free of charge, to any person obtaining a copy 280 | of this software and associated documentation files (the "Software"), to deal 281 | in the Software without restriction, including without limitation the rights 282 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 283 | copies of the Software, and to permit persons to whom the Software is 284 | furnished to do so, subject to the following conditions: 285 | 286 | The above copyright notice and this permission notice shall be included in all 287 | copies or substantial portions of the Software. 288 | 289 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 290 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 291 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 292 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 293 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 294 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 295 | SOFTWARE. -------------------------------------------------------------------------------- /src/vulnhalla.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Core analysis engine for Vulnhalla. 4 | 5 | This module coordinates the aggregation of raw CodeQL findings and their 6 | classification by an LLM. It loads issues from CodeQL result files, 7 | groups them by issue type, runs LLM-based analysis to decide whether 8 | each finding is a true positive, false positive, or needs more data, 9 | and writes structured result files for further inspection (e.g. in the UI). 
"""

import os
import csv
import re
import json
from typing import Any, Callable, Dict, List, Optional, Tuple

# Import from common
from src.utils.common_functions import (
    get_all_dbs,
    read_file_lines_from_zip,
    read_file as read_file_utf8,
    write_file_ascii,
    read_yml
)

# Script that holds your GPT logic
from src.llm.llm_analyzer import LLMAnalyzer
from src.utils.config_validator import validate_and_exit_on_error
from src.utils.logger import get_logger
from src.utils.exceptions import VulnhallaError, CodeQLError

logger = get_logger(__name__)


class IssueAnalyzer:
    """
    Analyzes all issues in CodeQL databases, fetches relevant code snippets,
    and forwards them to an LLM (via llm_analyzer) for triage.
    """

    def __init__(self, lang: str = "c", config: Optional[Dict[str, Any]] = None) -> None:
        """
        Initialize the IssueAnalyzer with default parameters.

        Args:
            lang (str, optional): The language code. Defaults to 'c'.
            config (Dict, optional): Full LLM configuration dictionary. If not
                provided, configuration is validated/loaded from the .env file
                when run() is invoked.
        """
        self.lang = lang
        # db_path / code_path describe the database currently being analyzed;
        # they are (re)assigned per issue inside process_issue_type().
        self.db_path: Optional[str] = None
        self.code_path: Optional[str] = None
        self.config = config

    # ----------------------------------------------------------------------
    # 1. CSV Parsing and Data Gathering
    # ----------------------------------------------------------------------

    def parse_issues_csv(self, file_name: str) -> List[Dict[str, str]]:
        """
        Reads the issues.csv file produced by CodeQL (with a custom or default
        set of columns) and returns a list of dicts.

        Args:
            file_name (str): The path to 'issues.csv'.

        Returns:
            List[Dict[str, str]]: A list of issue objects parsed from CSV rows.

        Raises:
            CodeQLError: If file cannot be read (not found, permission denied, etc.).
        """
        # Column names are supplied explicitly, so every row in the file —
        # including the first — is consumed as data (the CSV has no header row).
        field_names = [
            "name", "help", "type", "message",
            "file", "start_line", "start_offset",
            "end_line", "end_offset"
        ]
        issues = []
        try:
            with open(file_name, "r", encoding="utf-8") as f:
                csv_reader = csv.DictReader(f, fieldnames=field_names)
                for row in csv_reader:
                    issues.append(row)
        except FileNotFoundError as e:
            raise CodeQLError(f"Issues CSV file not found: {file_name}") from e
        except PermissionError as e:
            raise CodeQLError(f"Permission denied reading issues CSV: {file_name}") from e
        except OSError as e:
            raise CodeQLError(f"OS error while reading issues CSV: {file_name}") from e
        return issues

    def collect_issues_from_databases(self, dbs_folder: str) -> Dict[str, List[Dict[str, str]]]:
        """
        Searches through all CodeQL databases in `dbs_folder`, collects issues
        from each DB, and groups them by issue name.

        Args:
            dbs_folder (str): The folder containing the language-specific databases.

        Returns:
            Dict[str, List[Dict[str, str]]]: All issues, grouped by issue name.
                Each issue dict is augmented with a "db_path" key pointing at
                the database it came from.

        Raises:
            CodeQLError: If database folder cannot be accessed or issues cannot be read.
        """
        issues_statistics: Dict[str, List[Dict[str, str]]] = {}
        # get_all_dbs() raises CodeQLError on errors
        dbs_path = get_all_dbs(dbs_folder)
        for curr_db in dbs_path:
            logger.info("Processing DB: %s", curr_db)
            function_tree_csv = os.path.join(curr_db, "FunctionTree.csv")
            issues_file = os.path.join(curr_db, "issues.csv")

            # Both files are produced by run_codeql_queries.py; a DB missing
            # either one is skipped (with an error log) rather than aborting.
            if os.path.exists(function_tree_csv) and os.path.exists(issues_file):
                # parse_issues_csv() raises CodeQLError on errors
                issues = self.parse_issues_csv(issues_file)
                for issue in issues:
                    if issue["name"] not in issues_statistics:
                        issues_statistics[issue["name"]] = []
                    issue["db_path"] = curr_db
                    issues_statistics[issue["name"]].append(issue)
            else:
                logger.error("Error: Execute run_codeql_queries.py first!")
                continue

        return issues_statistics

    # ----------------------------------------------------------------------
    # 2. Function and Snippet Extraction
    # ----------------------------------------------------------------------

    def find_function_by_line(self, function_tree_file: str, file_path: str, line: int) -> Optional[Dict[str, str]]:
        """
        Finds the most specific (smallest) function in the function tree file that includes the given file and line number.

        Args:
            function_tree_file (str): Path to the 'FunctionTree.csv' file.
            file_path (str): Partial or full file path to match in the CSV rows.
            line (int): The line number to check within function range.

        Returns:
            Optional[Dict[str, str]]: The best matching function dictionary, or None if not found.

        Raises:
            CodeQLError: If function tree file cannot be read (not found, permission denied, etc.).
        """
        keys = ["function_name", "file", "start_line", "function_id", "end_line", "caller_id"]
        best_function = None
        smallest_range = float('inf')

        try:
            with open(function_tree_file, "r", encoding="utf-8") as f:
                for row in f:
                    # Cheap substring pre-filter before the regex split.
                    if file_path in row:
                        # Split on commas outside double-quoted fields: the
                        # lookahead matches only when an even number of quotes
                        # remains ahead of the comma.
                        fields = re.split(r',(?=(?:[^"]*"[^"]*")*[^"]*$)', row.strip())
                        if len(fields) != len(keys):
                            continue  # Skip malformed rows

                        function = dict(zip(keys, fields))
                        try:
                            start_line = int(function["start_line"])
                            end_line = int(function["end_line"])
                        except ValueError:
                            continue  # Skip if lines aren't integers

                        # Keep the function with the narrowest line range that
                        # still contains `line` (i.e. the innermost match).
                        if start_line <= line <= end_line:
                            if file_path in function["file"]:
                                size = end_line - start_line
                                if size < smallest_range:
                                    best_function = function
                                    smallest_range = size
        except FileNotFoundError as e:
            raise CodeQLError(f"Function tree file not found: {function_tree_file}") from e
        except PermissionError as e:
            raise CodeQLError(f"Permission denied reading function tree file: {function_tree_file}") from e
        except OSError as e:
            raise CodeQLError(f"OS error while reading function tree file: {function_tree_file}") from e

        return best_function

    def extract_function_code(self, code_file: List[str], function_dict: Dict[str, str]) -> str:
        """
        Produces lines of the function's code from a list of lines.

        Args:
            code_file (List[str]): A list of lines for the entire file.
            function_dict (Dict[str, str]): The dictionary describing the function.

        Returns:
            str: A snippet string of code for the function, one line per source
                line, each prefixed with a line number and with tabs replaced
                by spaces. Empty string if function_dict is falsy.
        """
        if not function_dict:
            return ""
        start_line = int(function_dict["start_line"]) - 1
        end_line = int(function_dict["end_line"])
        snippet_lines = code_file[start_line:end_line]
        # NOTE(review): start_line is zero-based here, so the printed prefix is
        # one less than the 1-based source line number. build_prompt_by_template
        # applies the same -1 to its location string, so this appears
        # deliberate — confirm the prompt templates expect zero-based numbers.
        snippet = "\n".join(
            f"{start_line + i}: {s.replace(chr(9), ' ')}"
            for i, s in enumerate(snippet_lines)
        )
        return snippet

    # ----------------------------------------------------------------------
    # 3. Text Replacement & Prompt Building
    # ----------------------------------------------------------------------

    def create_bracket_reference_replacer(
        self,
        db_path: str,
        code_path: str
    ) -> Callable[[re.Match], str]:
        """
        Creates and returns a 'replacement' callback function that can be used with
        `re.sub` to transform bracketed references (like [[var|"file://path:line:..."]])
        into a more readable snippet inline with line references.

        Args:
            db_path (str): Path to the current CodeQL database.
            code_path (str): Base path to the code. May differ on Windows vs. Linux.

        Returns:
            Callable[[re.Match], str]: A function that can be used with `re.sub`.

        Note:
            The returned callback function may raise `CodeQLError` if ZIP file cannot be read.
        """
        def replacement(match):
            # Group layout comes from the bracket_pattern regex built in
            # process_issue_type(): variable, scheme, path, line, start, end.
            variable = match.group(1)
            path_type = match.group(2)
            file_path = match.group(3)
            line_number = match.group(4)
            start_offset = match.group(5)
            end_offset = match.group(6)

            # Read snippet from the code
            if path_type == "relative://":
                full_path = code_path + file_path
            else:
                # Handle 'file://' or something else by removing the leading slash
                full_path = file_path[1:] if file_path.startswith("/") else file_path

            code_text = read_file_lines_from_zip(
                os.path.join(db_path, "src.zip"),
                full_path
            )
            code_lines = code_text.split("\n")
            # Line number and start offset are 1-based in the reference, hence
            # the -1 adjustments when slicing.
            snippet = code_lines[int(line_number) - 1][int(start_offset) - 1:int(end_offset)]

            file_name = os.path.split(file_path)[1]
            return f"{variable} '{snippet}' ({file_name}:{int(line_number)})"

        return replacement

    def build_prompt_by_template(
        self,
        issue: Dict[str, str],
        message: str,
        snippet: str,
        code: str
    ) -> str:
        """
        Builds the final 'prompt' template to feed into an LLM, combining
        the code snippet, code content, and a set of hints.

        Args:
            issue (Dict[str, str]): The issue dictionary from parse_issues_csv.
            message (str): The processed "message" text to embed.
            snippet (str): The direct snippet from the code for the particular highlight.
            code (str): Additional code context (e.g. entire function).

        Returns:
            str: A final prompt string with the template + hints + snippet + code.

        Raises:
            VulnhallaError: If template files cannot be read (not found, permission denied, etc.).
        """
        # If language is 'c', many queries are stored under 'cpp'
        lang_folder = "cpp" if self.lang == "c" else self.lang

        # Try to read an existing template specific to the issue name
        hints_path = os.path.join("data/templates", lang_folder, issue["name"] + ".template")
        if not os.path.exists(hints_path):
            hints_path = os.path.join("data/templates", lang_folder, "general.template")

        hints = read_file_utf8(hints_path)

        # Read the larger general template
        template_path = os.path.join("data/templates", lang_folder, "template.template")
        template = read_file_utf8(template_path)

        # NOTE(review): start_line - 1 matches the zero-based numbering that
        # extract_function_code prints — confirm both stay in sync.
        location = "look at {file_line} with '{snippet}'".format(
            file_line=os.path.split(issue["file"])[1] + ":" + str(int(issue["start_line"]) - 1),
            snippet=snippet
        )

        # Special case for "Use of object after its lifetime has ended"
        if issue["name"] == "Use of object after its lifetime has ended":
            message = message.replace("here", f"here ({location})", 1)

        prompt = template.format(
            name=issue["name"],
            description=issue["help"],
            message=message,
            location=location,
            hints=hints,
            code=code
        )
        return prompt

    # ----------------------------------------------------------------------
    # 4. Saving LLM Results
    # ----------------------------------------------------------------------

    def ensure_directories_exist(self, dirs: List[str]) -> None:
        """
        Creates all directories in the given list if they do not already exist.

        Args:
            dirs (List[str]): A list of directory paths to create if missing.

        Raises:
            VulnhallaError: If directory creation fails (permission denied, etc.).
        """
        for d in dirs:
            if not os.path.exists(d):
                try:
                    os.makedirs(d, exist_ok=True)
                except PermissionError as e:
                    raise VulnhallaError(f"Permission denied creating directory: {d}") from e
                except OSError as e:
                    raise VulnhallaError(f"OS error creating directory: {d}") from e


    # ----------------------------------------------------------------------
    # 5. Main Analysis Routine
    # ----------------------------------------------------------------------

    def save_raw_input_data(
        self,
        prompt: str,
        function_tree_file: str,
        current_function: Dict[str, str],
        results_folder: str,
        issue_id: int
    ) -> None:
        """
        Saves the raw input data (prompt, function tree info, etc.) to a JSON file before
        sending it to the LLM.

        Args:
            prompt (str): The final prompt text sent to the LLM.
            function_tree_file (str): Path to 'FunctionTree.csv'.
            current_function (Dict[str, str]): The currently found function dict.
            results_folder (str): Folder path where we store the result files.
            issue_id (int): The numeric ID of the current issue.

        Raises:
            VulnhallaError: If file cannot be written (permission denied, etc.).
        """
        raw_data = json.dumps({
            "function_tree_file": function_tree_file,
            "current_function": current_function,
            "db_path": self.db_path,
            "code_path": self.code_path,
            "prompt": prompt
        }, ensure_ascii=False)

        raw_output_file = os.path.join(results_folder, f"{issue_id}_raw.json")
        write_file_ascii(raw_output_file, raw_data)

    def format_llm_messages(self, messages: List[str]) -> str:
        """
        Converts the list of messages returned by the LLM into a JSON-ish string to
        store as output.

        Args:
            messages (List[str]): The messages from the LLM.

        Returns:
            str: A string representation of LLM messages (somewhat JSON-formatted;
                multi-line messages are triple-quoted, single-line ones repr()'d,
                and escaped \\n / \\t sequences are expanded for readability —
                the result is NOT strict JSON).
        """
        gpt_result = "[\n " + ",\n ".join(
            f"'''{item}'''" if "\n" in item else repr(item) for item in messages).replace("\\n", "\n ").replace(
            "\\t", " ") + "\n]"
        return gpt_result

    def determine_issue_status(self, llm_content: str) -> str:
        """
        Checks the content returned by the LLM to see if it includes certain
        status codes that classify the issue as 'true' or 'false' or 'more'.

        Args:
            llm_content (str): The text content from the LLM's final response.

        Returns:
            str: "true" if content has '1337', "false" if content has '1007',
                otherwise "more".
        """
        # The sentinel codes are presumably requested from the LLM by the
        # prompt template (template.template) — verify they stay in sync.
        if "1337" in llm_content:
            return "true"
        elif "1007" in llm_content:
            return "false"
        else:
            return "more"

    def append_extra_functions(
        self,
        extra_lines: List[Tuple[str, str, str]],
        function_tree_file: str,
        src_zip_path: str,
        code: str,
        current_function: Dict[str, str]
    ) -> Tuple[str, List[Dict[str, str]]]:
        """
        Searches for additional functions (via bracket references) outside the current one
        and appends their code to the main snippet.

        Args:
            extra_lines (List[Tuple[str, str, str]]): All matches of additional
                references, as (path_type, file_ref, line_ref) tuples.
            function_tree_file (str): Path to 'FunctionTree.csv'.
            src_zip_path (str): Path to the DB's src.zip file.
            code (str): The existing code snippet.
            current_function (Dict[str, str]): The currently found function dict.

        Returns:
            Tuple[str, List[Dict[str, str]]]: The extended code snippet
                (possibly including multiple functions) and the list of all
                functions involved, starting with `current_function`.

        Raises:
            CodeQLError: If function tree file or ZIP file cannot be read.
        """
        functions = [current_function]
        for another_func_ref in extra_lines:
            path_type, file_ref, line_ref = another_func_ref
            file_ref = file_ref.strip()

            # Resolve the reference the same way the bracket replacer does:
            # relative:// paths are prefixed with code_path, others lose their
            # leading slash.
            if path_type == "relative://":
                file_ref = self.code_path + file_ref
            else:
                file_ref = file_ref[1:] if file_ref.startswith("/") else file_ref

            # If it's within the same function's line range, skip
            start_line_func = int(current_function["start_line"])
            end_line_func = int(current_function["end_line"])
            if start_line_func <= int(line_ref) <= end_line_func:
                continue

            # Attempt to find the new function
            new_function = self.find_function_by_line(function_tree_file, "/" + file_ref, int(line_ref))
            if new_function and new_function not in functions:
                functions.append(new_function)
                code_file2 = read_file_lines_from_zip(src_zip_path, file_ref).split("\n")
                code += (
                    "\n\nfile: " + file_ref + "\n" +
                    self.extract_function_code(code_file2, new_function)
                )

        return code, functions

    def process_issue_type(
        self,
        issue_type: str,
        issues_of_type: List[Dict[str, str]],
        llm_analyzer: LLMAnalyzer
    ) -> None:
        """
        Processes all issues of a single type. Builds file/folder paths, runs
        analysis, calls the LLM, and saves results.

        Args:
            issue_type (str): The name of the issue type.
            issues_of_type (List[Dict[str, str]]): All issues belonging to that type.
            llm_analyzer (LLMAnalyzer): The LLM analyzer instance to use for queries.

        Raises:
            CodeQLError: If database files cannot be read (YAML, ZIP, CSV, etc.).
            VulnhallaError: If result files cannot be written.
            LLMError: If LLM analysis fails.
        """
        # One results folder per issue type; spaces/slashes sanitized for the filesystem.
        results_folder = os.path.join("output/results", self.lang, issue_type.replace(" ", "_").replace("/", "-"))
        self.ensure_directories_exist([results_folder])

        issue_id = 0
        real_issues = []
        false_issues = []
        more_data = []

        logger.info("Found %d issues of type %s", len(issues_of_type), issue_type)
        logger.info("")
        for issue in issues_of_type:
            issue_id += 1
            self.db_path = issue["db_path"]
            db_yml_path = os.path.join(self.db_path, "codeql-database.yml")
            db_yml = read_yml(db_yml_path)
            self.code_path = db_yml["sourceLocationPrefix"]

            # Adjust Windows / Linux path references.
            # A drive letter ('C:\...') becomes 'C_/...'; a POSIX path just
            # loses its leading slash — assumed to match the layout inside
            # the DB's src.zip (TODO confirm).
            if ":" in self.code_path:
                self.code_path = self.code_path.replace(":", "_").replace("\\", "/")
            else:
                self.code_path = self.code_path[1:]

            function_tree_file = os.path.join(self.db_path, "FunctionTree.csv")
            src_zip_path = os.path.join(self.db_path, "src.zip")

            full_file_path = self.code_path + issue["file"]
            code_file_contents = read_file_lines_from_zip(src_zip_path, full_file_path).split("\n")

            current_function = self.find_function_by_line(
                function_tree_file,
                "/" + self.code_path + issue["file"],
                int(issue["start_line"])
            )
            if not current_function:
                logger.warning("issue %s: Can't find the function or function is too big!", issue_id)
                continue

            # CSV line/offset values are 1-based; Python indexing is 0-based.
            snippet = code_file_contents[int(issue["start_line"]) - 1][
                int(issue["start_offset"]) - 1:int(issue["end_offset"])
            ]

            code = (
                "file: " + self.code_path + issue["file"] + "\n" +
                self.extract_function_code(code_file_contents, current_function)
            )

            # Replace bracket references in the issue message
            bracket_pattern = r'\[\["(.*?)"\|"((?:relative://|file://))?(/.*?):(\d+):(\d+):\d+:(\d+)"\]\]'
            transform_func = self.create_bracket_reference_replacer(self.db_path, self.code_path)
            message = re.sub(bracket_pattern, transform_func, issue["message"])

            # Also check for lines referencing other code blocks
            extra_lines_pattern = r'\[\[".*?"\|"((?:relative://|file://)?)(/.*?):(\d+):\d+:\d+:\d+"\]\]'
            extra_lines = re.findall(extra_lines_pattern, issue["message"])
            functions = [current_function]

            if extra_lines:
                code, functions = self.append_extra_functions(
                    extra_lines, function_tree_file, src_zip_path, code, current_function
                )

            prompt = self.build_prompt_by_template(issue, message, snippet, code)

            # Save raw input to the LLM
            self.save_raw_input_data(prompt, function_tree_file, current_function, results_folder, issue_id)

            # Send to LLM
            messages, content = llm_analyzer.run_llm_security_analysis(
                prompt,
                function_tree_file,
                current_function,
                functions,
                self.db_path
            )
            gpt_result = self.format_llm_messages(messages)
            final_file = os.path.join(results_folder, f"{issue_id}_final.json")
            write_file_ascii(final_file, gpt_result)

            # Check status code in LLM content
            status = self.determine_issue_status(content)
            if status == "true":
                real_issues.append(issue_id)
                status = "True Positive"
            elif status == "false":
                false_issues.append(issue_id)
                status = "False Positive"
            else:
                more_data.append(issue_id)
                status = "LLM needs More Data"

            # Log issue status
            logger.info("Issue ID: %s, LLM decision: → %s", issue_id, status)

        # Per-type summary; empty info() calls act as blank separator lines.
        logger.info("")
        logger.info("Issue type: %s", issue_type)
        logger.info("Total issues: %d", len(issues_of_type))
        logger.info("True Positive: %d", len(real_issues))
        logger.info("False Positive: %d", len(false_issues))
        logger.info("LLM needs More Data: %d", len(more_data))
        logger.info("")

    def run(self) -> None:
        """
        Main analysis routine:

        1. Initializes the LLM.
        2. Finds all CodeQL DBs for the given language.
        3. Parses each DB's issues.csv, aggregates them by issue type.
        4. Asks the LLM for each issue's snippet context, saving final results
           in various directory structures.

        Raises:
            CodeQLError: If database files cannot be accessed or read.
            VulnhallaError: If directory creation or file writing fails.
            LLMError: If LLM initialization or analysis fails.
        """
        # Validate configuration before starting (only when no explicit config
        # was supplied to the constructor).
        if self.config is None:
            validate_and_exit_on_error()

        llm_analyzer = LLMAnalyzer()
        llm_analyzer.init_llm_client(config=self.config)

        dbs_folder = os.path.join("output/databases", self.lang)

        # Gather issues from all DBs
        issues_statistics = self.collect_issues_from_databases(dbs_folder)

        total_issues = 0
        for issue_type in issues_statistics:
            total_issues += len(issues_statistics[issue_type])
        logger.info("Total issues found: %d", total_issues)
        logger.info("")

        # Process all issues, type by type
        for issue_type in issues_statistics.keys():
            self.process_issue_type(issue_type, issues_statistics[issue_type], llm_analyzer)

if __name__ == '__main__':
    # Initialize logging
    from src.utils.logger import setup_logging
    setup_logging()

    # Loads configuration from .env file
    # Or use: analyzer = IssueAnalyzer(lang="c", config={...})
    analyzer = IssueAnalyzer(lang="c")
    analyzer.run()