├── data ├── templates │ └── cpp │ │ ├── general.template │ │ ├── template.template │ │ └── Copy function using source size.template └── queries │ └── cpp │ ├── issues │ ├── qlpack.yml │ └── Copy function using source size.ql │ └── tools │ ├── qlpack.yml │ ├── Macros.ql │ ├── GlobalVars.ql │ ├── FunctionTree.ql │ └── Classes.ql ├── requirements.txt ├── images └── vulnhalla_logo.png ├── pytest.ini ├── tests ├── conftest.py └── test_smoke.py ├── examples ├── ui_example.py └── example.py ├── SECURITY.md ├── src ├── utils │ ├── exceptions │ │ ├── llm.py │ │ ├── codeql.py │ │ ├── __init__.py │ │ └── base.py │ ├── config.py │ ├── common_functions.py │ ├── llm_config.py │ ├── logger.py │ └── config_validator.py ├── ui │ ├── components │ │ ├── issues_list_panel.py │ │ ├── details_panel.py │ │ ├── controls_bar.py │ │ └── splitter_divider.py │ ├── models.py │ ├── issue_parser.py │ └── results_loader.py ├── pipeline.py ├── codeql │ └── run_codeql_queries.py └── vulnhalla.py ├── .gitignore ├── .env.example ├── CONTRIBUTING.md ├── CODE_OF_CONDUCT.md ├── setup.py ├── LICENSE.txt ├── README.md └── NOTICES.txt /data/templates/cpp/general.template: -------------------------------------------------------------------------------- 1 | 1. Is this a real security issue that you can exploit? 
-------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | requests 2 | python-dotenv 3 | litellm 4 | PyYAML 5 | textual 6 | pySmartDL 7 | pytest -------------------------------------------------------------------------------- /images/vulnhalla_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cyberark/Vulnhalla/HEAD/images/vulnhalla_logo.png -------------------------------------------------------------------------------- /data/queries/cpp/issues/qlpack.yml: -------------------------------------------------------------------------------- 1 | name: vulnhalla-cpp 2 | version: 0.0.0 3 | dependencies: 4 | codeql/cpp-all: "*" 5 | -------------------------------------------------------------------------------- /data/queries/cpp/tools/qlpack.yml: -------------------------------------------------------------------------------- 1 | name: vulnhalla-cpp 2 | version: 0.0.0 3 | dependencies: 4 | codeql/cpp-all: "*" 5 | -------------------------------------------------------------------------------- /data/queries/cpp/tools/Macros.ql: -------------------------------------------------------------------------------- 1 | import cpp 2 | 3 | from Macro m 4 | select m.getName() as macro_name, "#define " + m.getHead() + " " + m.getBody() as body 5 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | testpaths = tests 3 | python_files = test_*.py 4 | python_classes = Test* 5 | python_functions = test_* 6 | addopts = 7 | -v 8 | --strict-markers 9 | --tb=short -------------------------------------------------------------------------------- /data/templates/cpp/template.template: 
-------------------------------------------------------------------------------- 1 | ### Issue Overview 2 | Name: {name} 3 | Description: {description} 4 | Message: {message} 5 | Location: {location} 6 | 7 | ### Hints for Validation 8 | {hints} 9 | 10 | ### Code 11 | {code} -------------------------------------------------------------------------------- /data/queries/cpp/tools/GlobalVars.ql: -------------------------------------------------------------------------------- 1 | import cpp 2 | 3 | from GlobalOrNamespaceVariable g 4 | select g.getName() as global_var_name, g.getLocation().getFile() as file, g.getLocation().getStartLine() as start_line, g.getLocation().getEndLine() as end_line 5 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | """Shared pytest fixtures and configuration.""" 2 | 3 | import sys 4 | from pathlib import Path 5 | 6 | # Add project root to Python path for imports 7 | PROJECT_ROOT = Path(__file__).parent.parent 8 | if str(PROJECT_ROOT) not in sys.path: 9 | sys.path.insert(0, str(PROJECT_ROOT)) 10 | -------------------------------------------------------------------------------- /examples/ui_example.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Entry point for running the Vulnhalla UI. 
4 | 5 | Usage: 6 | python examples/ui_example.py 7 | """ 8 | 9 | import sys 10 | from pathlib import Path 11 | 12 | # Add project root to Python path 13 | PROJECT_ROOT = Path(__file__).parent.parent 14 | sys.path.insert(0, str(PROJECT_ROOT)) 15 | 16 | from src.ui.ui_app import main 17 | 18 | if __name__ == "__main__": 19 | main() 20 | 21 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | # Security Policies and Procedures 2 | 3 | CyberArk takes product security very seriously. If you believe you have found a vulnerability in one of our products, we ask that you follow responsible disclosure guidelines and contact product_security@cyberark.com and work with us toward a quick resolution to protect our customers. 4 | 5 | Refer to [CyberArk's Security Vulnerability Policy](https://www.cyberark.com/cyberark-security-vulnerability-policy.pdf) for more details. -------------------------------------------------------------------------------- /src/utils/exceptions/llm.py: -------------------------------------------------------------------------------- 1 | """LLM-related exceptions.""" 2 | 3 | from src.utils.exceptions.base import VulnhallaError 4 | 5 | 6 | class LLMError(VulnhallaError): 7 | """Base class for all LLM-related errors.""" 8 | pass 9 | 10 | 11 | class LLMConfigError(LLMError): 12 | """LLM configuration errors (missing keys, invalid provider, etc.).""" 13 | pass 14 | 15 | 16 | class LLMApiError(LLMError): 17 | """LLM API call failures (timeouts, rate limits, 5xx, etc.).""" 18 | pass 19 | 20 | 21 | -------------------------------------------------------------------------------- /tests/test_smoke.py: -------------------------------------------------------------------------------- 1 | """Smoke tests to verify basic test infrastructure works.""" 2 | 3 | def test_pytest_runs(): 4 | """Test that pytest can discover and run tests.""" 5 | 
assert True 6 | 7 | 8 | def test_can_import_src(): 9 | """Test that we can import the main project modules.""" 10 | 11 | from src.utils.common_functions import read_file 12 | from src.vulnhalla import IssueAnalyzer 13 | from src.llm.llm_analyzer import LLMAnalyzer 14 | assert read_file and IssueAnalyzer and LLMAnalyzer 15 | -------------------------------------------------------------------------------- /src/utils/exceptions/codeql.py: -------------------------------------------------------------------------------- 1 | """CodeQL-related exceptions.""" 2 | 3 | from src.utils.exceptions.base import VulnhallaError 4 | 5 | 6 | class CodeQLError(VulnhallaError): 7 | """Base class for all CodeQL-related errors.""" 8 | pass 9 | 10 | 11 | class CodeQLConfigError(CodeQLError): 12 | """CodeQL configuration errors (path, executable, packs, etc.).""" 13 | pass 14 | 15 | 16 | class CodeQLExecutionError(CodeQLError): 17 | """CodeQL query/database execution/decoding errors.""" 18 | pass 19 | 20 | 21 | -------------------------------------------------------------------------------- /src/utils/exceptions/__init__.py: -------------------------------------------------------------------------------- 1 | """Vulnhalla exception hierarchy.""" 2 | 3 | from src.utils.exceptions.base import VulnhallaError 4 | from src.utils.exceptions.codeql import ( 5 | CodeQLError, 6 | CodeQLConfigError, 7 | CodeQLExecutionError, 8 | ) 9 | from src.utils.exceptions.llm import ( 10 | LLMError, 11 | LLMConfigError, 12 | LLMApiError, 13 | ) 14 | 15 | __all__ = [ 16 | "VulnhallaError", 17 | "CodeQLError", 18 | "CodeQLConfigError", 19 | "CodeQLExecutionError", 20 | "LLMError", 21 | "LLMConfigError", 22 | "LLMApiError", 23 | ] 24 | 25 | 26 | -------------------------------------------------------------------------------- /data/queries/cpp/tools/FunctionTree.ql: -------------------------------------------------------------------------------- 1 | import cpp 2 | 3 | string get_caller(Function c){ 4 | if 
exists(FunctionCall d | c.getACallToThisFunction() = d) 5 | then result = c.getACallToThisFunction().getEnclosingFunction().getLocation().getFile() + ":" + c.getACallToThisFunction().getEnclosingFunction().getLocation().getStartLine() 6 | else result = "" 7 | } 8 | 9 | 10 | from Function f 11 | select f.getName() as function_name, f.getLocation().getFile() as file, f.getLocation().getStartLine() as start_line, file + ":" + start_line as function_id, f.getBlock().getLocation().getEndLine() as end_line, get_caller(f) as caller_id 12 | 13 | -------------------------------------------------------------------------------- /src/utils/exceptions/base.py: -------------------------------------------------------------------------------- 1 | """Base exception class for all Vulnhalla-specific errors.""" 2 | 3 | 4 | class VulnhallaError(Exception): 5 | """ 6 | Base exception for all Vulnhalla-specific errors. 7 | 8 | Args: 9 | message: Human-readable error message. 10 | cause: Optional underlying exception that caused this error. 11 | """ 12 | def __init__(self, message: str, cause: Exception | None = None) -> None: 13 | super().__init__(message) 14 | self.cause = cause 15 | if cause is not None: 16 | # Enables chained traceback: VulnhallaError <- cause 17 | self.__cause__ = cause 18 | 19 | -------------------------------------------------------------------------------- /data/templates/cpp/Copy function using source size.template: -------------------------------------------------------------------------------- 1 | This static analysis checks if we are using source size in copy functions. This analysis does not check if there is a correlation between source and destination. This is your job! 2 | 1. What size are we using in the copy? Are we really using the source size and not the destination? 3 | 2. Does the source buffer point inside the destination buffer? 4 | 3. What is the size of the source buffer and what is the size of the destination? 
Is destination size derived from source? Answer this question only if source is not pointer inside dest! 5 | 4. Can the source buffer be bigger than the destination? (yes/no) 6 | Use the tools to get all data needed. 7 | If source is smaller than destination, there is no issue of buffer overflow! 8 | Only if source is bigger than destination it's a problem of buffer overflow! -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Python build artifacts 2 | __pycache__/ 3 | *.pyc 4 | *.pyo 5 | *.pyd 6 | *.py[cod] 7 | *$py.class 8 | 9 | # Virtual environments 10 | venv/ 11 | env/ 12 | ENV/ 13 | .venv/ 14 | 15 | # Environment variables (contains API keys) 16 | .env 17 | .env.local 18 | .env.*.local 19 | 20 | # Data directories (runtime-generated) 21 | output/databases/ 22 | output/results/ 23 | output/zip_dbs/ 24 | data/queries/**/*.qlx 25 | data/queries/**/codeql-pack.lock.yml 26 | 27 | # IDE / OS files 28 | .vscode/ 29 | .idea/ 30 | .DS_Store 31 | Thumbs.db 32 | *.swp 33 | *.swo 34 | *~ 35 | 36 | # Temp / log files 37 | *.log 38 | *.tmp 39 | *.bak 40 | 41 | # Distribution / packaging 42 | .Python 43 | build/ 44 | develop-eggs/ 45 | dist/ 46 | downloads/ 47 | eggs/ 48 | .eggs/ 49 | lib/ 50 | lib64/ 51 | parts/ 52 | sdist/ 53 | var/ 54 | wheels/ 55 | *.egg-info/ 56 | .installed.cfg 57 | *.egg 58 | MANIFEST 59 | 60 | # Jupyter Notebook 61 | .ipynb_checkpoints 62 | 63 | # pytest 64 | .pytest_cache/ 65 | .coverage 66 | htmlcov/ 67 | 68 | # mypy 69 | .mypy_cache/ 70 | .dmypy.json 71 | dmypy.json 72 | -------------------------------------------------------------------------------- /src/ui/components/issues_list_panel.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Issues list panel component for Vulnhalla UI. 
4 | """ 5 | 6 | from textual.containers import Container, Vertical 7 | from textual.widgets import Label, DataTable, Static, Input 8 | from textual.app import ComposeResult 9 | 10 | 11 | class IssuesListPanel(Container): 12 | """ 13 | Left-hand panel showing list of issues in a DataTable. 14 | """ 15 | 16 | def compose(self) -> ComposeResult: 17 | """Compose the issues list panel layout. 18 | 19 | Builds the left-hand panel that displays the table of issues, 20 | including columns such as ID, repository, file and decisions, 21 | along with the search input. 22 | """ 23 | with Vertical(): 24 | yield Label("Issues", classes="panel-title") 25 | table = DataTable(id="issues-table") 26 | table.cursor_type = "row" 27 | yield table 28 | yield Static("", id="issues-count") 29 | yield Input(placeholder="Search by issue name, file, repo, LLM decision, or manual decision...", id="issues-search") 30 | 31 | -------------------------------------------------------------------------------- /src/utils/config.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Application Configuration Module 4 | 5 | Loads general application configuration from .env file or environment variables. 6 | Handles CodeQL path, GitHub token, and other non-LLM settings. 7 | """ 8 | 9 | import os 10 | from typing import Optional 11 | from dotenv import load_dotenv 12 | 13 | # Load .env file if it exists, otherwise try .env.example 14 | if os.path.exists(".env"): 15 | load_dotenv(".env") 16 | elif os.path.exists(".env.example"): 17 | load_dotenv(".env.example") 18 | 19 | 20 | def get_codeql_path() -> str: 21 | """ 22 | Get CodeQL executable path from .env file or environment variables. 23 | 24 | Returns: 25 | Path to CodeQL executable. Defaults to "codeql" if not set. 
26 | """ 27 | path = os.getenv("CODEQL_PATH", "codeql") 28 | # Strip quotes and Python raw string prefix if present 29 | if path and path != "codeql": 30 | path = path.strip('"').strip("'") 31 | # Remove 'r' prefix if present (Python raw string syntax, not valid in .env) 32 | if path.startswith("r\"") or path.startswith("r'"): 33 | path = path[2:] 34 | path = path.strip('"').strip("'") 35 | return path 36 | 37 | 38 | def get_github_token() -> Optional[str]: 39 | """ 40 | Get GitHub API token from .env file or environment variables. 41 | 42 | Returns: 43 | GitHub token string if set, None otherwise. 44 | """ 45 | return os.getenv("GITHUB_TOKEN") 46 | 47 | -------------------------------------------------------------------------------- /src/ui/components/details_panel.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Details panel component for Vulnhalla UI. 4 | """ 5 | 6 | from textual.containers import Container, Vertical, ScrollableContainer 7 | from textual.widgets import Static, Label, Select 8 | from textual.app import ComposeResult 9 | 10 | 11 | class DetailsPanel(Container): 12 | """ 13 | Right panel showing issue details with scrollable content and manual decision selector. 14 | """ 15 | 16 | def compose(self) -> ComposeResult: 17 | """Compose the issue details panel layout. 18 | 19 | Builds the right-hand panel that shows LLM decisions, metadata, 20 | code snippets and the manual decision selector for the selected issue. 
21 | """ 22 | with Vertical(): 23 | # Scrollable content area 24 | scrollable_content = ScrollableContainer(id="details-scrollable") 25 | with scrollable_content: 26 | yield Static("Select an issue to view details", id="details-content", markup=True) 27 | # Manual decision controls 28 | with Vertical(id="manual-decision-container"): 29 | yield Label("Enter your manual decision:", id="manual-decision-label") 30 | yield Select( 31 | [ 32 | ("Not Set", None), 33 | ("True Positive", "True Positive"), 34 | ("False Positive", "False Positive"), 35 | ("Uncertain", "Uncertain"), 36 | ], 37 | value=None, 38 | id="manual-decision-select", 39 | prompt="Not Set" 40 | ) 41 | 42 | -------------------------------------------------------------------------------- /src/ui/components/controls_bar.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Controls bar component for Vulnhalla UI. 4 | """ 5 | 6 | from textual.containers import Container, Horizontal, Vertical 7 | from textual.widgets import Static, Label, Select, Button 8 | from textual.app import ComposeResult 9 | 10 | 11 | class ControlsBar(Container): 12 | """ 13 | Bottom horizontal bar with controls, filters, and actions. 14 | """ 15 | 16 | def compose(self) -> ComposeResult: 17 | """Compose the controls bar layout. 18 | 19 | Creates the bottom bar with language information, filters, 20 | action buttons and keyboard shortcut help text. 
21 | """ 22 | with Vertical(): 23 | # Language label only 24 | with Horizontal(): 25 | yield Static("Language: C (only language currently supported)", classes="control-label") 26 | # Filter and buttons 27 | with Horizontal(): 28 | yield Label("Filter by llm decision:", classes="control-label") 29 | yield Select( 30 | [("All", "all"), ("True Positive", "true"), ("False Positive", "false"), ("Needs more Info to decide", "more")], 31 | value="all", 32 | id="filter-select" 33 | ) 34 | yield Button("Refresh", id="refresh-btn") 35 | yield Button("Run Analysis", id="run-analysis-btn") 36 | yield Static("") 37 | # Key Bindings help text 38 | with Horizontal(): 39 | yield Label("Key Bindings:", classes="control-label") 40 | yield Static("↑/↓: Navigate | Tab: Switch focus | Enter: Show details | /: Search | [: Resize left | ]: Resize right | r: Reload | q: Quit", classes="help-text") 41 | 42 | -------------------------------------------------------------------------------- /data/queries/cpp/tools/Classes.ql: -------------------------------------------------------------------------------- 1 | import cpp 2 | 3 | private predicate isNamespaceEntity(NameQualifyingElement n) { n instanceof Namespace } 4 | private predicate isUserTypeEntity(NameQualifyingElement n) { n instanceof UserType } 5 | private predicate isClassEntity(NameQualifyingElement n) { n instanceof Class } 6 | 7 | 8 | private int getEndLine(NameQualifyingElement n) { 9 | exists (Namespace c | c = n and result = max(c.getADeclaration().getLocation().getEndLine())) 10 | or 11 | // Anonymus structs. 
This will get the real end line 12 | exists (Class c | c = n and c.getName() = "(unnamed class/struct/union)" and exists (TypedefType u | u.getUnderlyingType() instanceof Class and u.getUnderlyingType() = c and result = u.getLocation().getStartLine())) 13 | or 14 | exists (Class c | c = n and result = max(c.getAMember().getLocation().getEndLine())) 15 | or 16 | exists (UserType u | u = n and result = max(u.getADeclaration().getLocation().getEndLine())) 17 | } 18 | 19 | private string getType(NameQualifyingElement c) { 20 | isNamespaceEntity(c) and result = "NameSapce" 21 | or 22 | isClassEntity(c) and result = "Class" 23 | or 24 | isUserTypeEntity(c) and result = "UserType" 25 | } 26 | 27 | private string getName(NameQualifyingElement n) { 28 | // Anonymous structs 29 | n.getName() = "(unnamed class/struct/union)" and exists (TypedefType u | u.getUnderlyingType() instanceof Class and u.getUnderlyingType() = n and result = u.getName()) 30 | or 31 | result = n.getName() 32 | } 33 | 34 | private string getSimpleName(NameQualifyingElement n) { 35 | isNamespaceEntity(n) and result = "" 36 | or 37 | // Anonymous structs 38 | n.getName() = "(unnamed class/struct/union)" and exists (TypedefType u | u.getUnderlyingType() instanceof Class and u.getUnderlyingType() = n and result = u.getSimpleName()) 39 | or 40 | exists (UserType u | u = n and result = u.getSimpleName()) 41 | } 42 | 43 | from NameQualifyingElement c 44 | where isNamespaceEntity(c) or isUserTypeEntity(c) 45 | select getType(c) as type, getName(c) as name, c.getLocation().getFile() as file, c.getLocation().getStartLine() as start_line, getEndLine(c) as end_line, getSimpleName(c) as simple_name 46 | -------------------------------------------------------------------------------- /examples/example.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | example.py 5 | ---------- 6 | Example usage of Vulnhalla - demonstrates a full pipeline run: 
7 | 1) Fetch CodeQL databases (from fetch_repos.py), 8 | 2) Run CodeQL queries (from run_codeql_queries.py), 9 | 3) Analyze results with LLM (from vulnhalla.py). 10 | """ 11 | 12 | import sys 13 | from pathlib import Path 14 | 15 | # Add project root to Python path 16 | PROJECT_ROOT = Path(__file__).parent.parent 17 | sys.path.insert(0, str(PROJECT_ROOT)) 18 | 19 | from src.codeql.fetch_repos import fetch_codeql_dbs 20 | from src.codeql.run_codeql_queries import compile_and_run_codeql_queries, DEFAULT_LANG 21 | from src.vulnhalla import IssueAnalyzer 22 | from src.utils.config import get_codeql_path 23 | from src.utils.config_validator import validate_and_exit_on_error 24 | from src.utils.logger import setup_logging, get_logger 25 | from src.ui.ui_app import main as ui_main 26 | 27 | logger = get_logger(__name__) 28 | 29 | 30 | def main(): 31 | """Run an end-to-end example of the Vulnhalla pipeline. 32 | 33 | This function fetches CodeQL databases for two demo 34 | repositories, runs CodeQL queries, classifies the findings 35 | using the configured LLM provider, writes the results 36 | to the output directory, and opens the results UI. 37 | """ 38 | # Initialize logging 39 | setup_logging() 40 | logger.info("Starting Vulnhalla pipeline... This may take a few minutes.") 41 | logger.info("") 42 | 43 | # Validate configuration before starting 44 | validate_and_exit_on_error() 45 | 46 | # 1) Fetch CodeQL database 47 | logger.info("[1/3] Fetching CodeQL DBs") 48 | fetch_codeql_dbs( 49 | lang="c", # Or use fetch_repos.LANG if set 50 | threads=4, # Higher threads may exceed GitHub rate limits. Add a GitHub token if you need higher throughput. 
51 | 52 | single_repo="videolan/vlc" 53 | ) 54 | fetch_codeql_dbs(lang="c", threads=16, single_repo="redis/redis") 55 | 56 | # 2) Run CodeQL queries on all downloaded databases 57 | logger.info("\n[2/3] Running CodeQL Queries") 58 | compile_and_run_codeql_queries( 59 | codeql_bin=get_codeql_path(), 60 | lang="c", 61 | threads=16, 62 | timeout=300 63 | ) 64 | 65 | # 3) Build/Analyze CodeQL results 66 | logger.info("\n[3/3] Building and Analyzing Results") 67 | # Load configuration from .env file (create .env from .env.example) 68 | # Or use: analyzer = IssueAnalyzer(lang="c", api_key="your-api-key") 69 | analyzer = IssueAnalyzer(lang="c") 70 | analyzer.run() 71 | 72 | logger.info("\n✅ Pipeline completed successfully!") 73 | logger.info("Opening results UI...") 74 | ui_main() 75 | 76 | if __name__ == "__main__": 77 | main() 78 | -------------------------------------------------------------------------------- /src/ui/components/splitter_divider.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Splitter divider component for Vulnhalla UI. 4 | """ 5 | 6 | from textual.widget import Widget 7 | 8 | 9 | class SplitterDivider(Widget): 10 | """ 11 | Draggable divider widget between panels for resizing. 12 | """ 13 | 14 | DEFAULT_CSS = """ 15 | SplitterDivider { 16 | width: 1; 17 | background: transparent; 18 | color: $surface-lighten-2; 19 | } 20 | SplitterDivider:hover { 21 | background: $primary; 22 | color: $primary; 23 | } 24 | """ 25 | 26 | def __init__(self, app_instance=None): 27 | """ 28 | Initialize the SplitterDivider. 29 | 30 | Args: 31 | app_instance: Reference to the VulnhallaUI app instance for updating split position. 32 | """ 33 | super().__init__() 34 | self.app_instance = app_instance 35 | self.dragging = False 36 | 37 | def render(self): 38 | """ 39 | Render the divider as a thin vertical line. 40 | 41 | Returns: 42 | str: Single vertical line character "│". 
43 | """ 44 | return "│" 45 | 46 | def on_mouse_down(self, event) -> None: 47 | """ 48 | Start dragging when mouse is pressed. 49 | 50 | Args: 51 | event: Mouse down event. 52 | """ 53 | self.dragging = True 54 | self.capture_mouse() 55 | 56 | def on_mouse_move(self, event) -> None: 57 | """ 58 | Update split position while dragging. 59 | 60 | Args: 61 | event: Mouse move event containing position information. 62 | """ 63 | if self.dragging and self.app_instance: 64 | parent = self.parent 65 | if parent and parent.region: 66 | try: 67 | # Get mouse position relative to parent container 68 | mouse_x = event.screen_x - parent.region.x 69 | parent_width = parent.size.width 70 | if parent_width > 0: 71 | new_position = max(0.2, min(0.8, mouse_x / parent_width)) 72 | self.app_instance.split_position = new_position 73 | self.app_instance._update_split_position() 74 | except (AttributeError, TypeError): 75 | # Fallback: use delta if available 76 | if hasattr(event, 'delta_x') and event.delta_x != 0: 77 | parent_width = parent.size.width 78 | if parent_width > 0: 79 | delta = event.delta_x / parent_width 80 | new_position = max(0.2, min(0.8, self.app_instance.split_position + delta)) 81 | self.app_instance.split_position = new_position 82 | self.app_instance._update_split_position() 83 | 84 | def on_mouse_up(self, event) -> None: 85 | """ 86 | Stop dragging when mouse is released. 87 | 88 | Args: 89 | event: Mouse up event. 
90 | """ 91 | if self.dragging: 92 | self.dragging = False 93 | self.release_mouse() 94 | 95 | -------------------------------------------------------------------------------- /.env.example: -------------------------------------------------------------------------------- 1 | # CodeQL Configuration 2 | 3 | # Path to CodeQL executable (required) 4 | # Examples: 5 | # Linux/macOS: /usr/local/bin/codeql or /path/to/codeql 6 | # Windows: IMPORTANT - Path MUST end with .cmd 7 | # Example: C:\path\to\codeql\codeql.cmd 8 | # Use forward slashes or escaped backslashes: C:/path/to/codeql/codeql.cmd 9 | # Or use raw string format: r"C:\path\to\codeql\codeql.cmd" 10 | CODEQL_PATH="your_codeql_path" 11 | 12 | # GitHub Configuration (optional, for higher rate limits) 13 | # Get token from: https://github.com/settings/tokens 14 | # GITHUB_TOKEN=ghp_your_token_here 15 | 16 | # LLM Configuration 17 | # Copy this file to .env and fill in your API keys 18 | 19 | # Provider selection (required) 20 | # Allowed providers: openai, azure, gemini 21 | 22 | # Model name (required, provider-specific) 23 | # Examples by provider: 24 | # OpenAI: gpt-4o, gpt-4o-mini, gpt-4-turbo 25 | # Azure: gpt-4o, gpt-4 26 | # Google AI Studio: gemini-2.5-flash, gemini-2.0-flash 27 | 28 | # Optional: Override default LLM parameters 29 | # NOTE: 30 | # Do NOT increase these values unless you fully understand the impact. 31 | # Lower values keep the model stable and deterministic — critical for security analysis. 32 | # Higher values may cause the model to become inconsistent, creative, or hallucinate results. 33 | # Recommended: leave these at their default values. 
34 | # LLM_TEMPERATURE=0.2 35 | # LLM_TOP_P=0.2 36 | 37 | # ============================================================================ 38 | # Provider-Specific Configuration 39 | # ============================================================================ 40 | # Uncomment and fill in the section for your chosen provider 41 | 42 | # ---------------------------------------------------------------------------- 43 | # OpenAI 44 | # ---------------------------------------------------------------------------- 45 | PROVIDER=openai 46 | MODEL=gpt-4o 47 | OPENAI_API_KEY="your_api_key" 48 | 49 | # ---------------------------------------------------------------------------- 50 | # Azure OpenAI 51 | # ---------------------------------------------------------------------------- 52 | # AZURE_OPENAI_API_KEY="your_api_key" 53 | # AZURE_OPENAI_ENDPOINT="https://your-name.openai.azure.com/" 54 | # AZURE_OPENAI_API_VERSION="2024-08-01-preview" 55 | # PROVIDER=azure 56 | # MODEL=gpt-4o 57 | 58 | # ---------------------------------------------------------------------------- 59 | # Google AI Studio 60 | # ---------------------------------------------------------------------------- 61 | # GOOGLE_API_KEY="your_api_key" 62 | # PROVIDER=gemini 63 | # MODEL=gemini-2.5-flash 64 | 65 | # Logging Configuration 66 | 67 | # DEBUG, INFO, WARNING, ERROR 68 | LOG_LEVEL=INFO 69 | 70 | # Optional: path to log file (e.g., logs/vulnhalla.log) 71 | # If empty or commented out, no file logging is used 72 | # LOG_FILE=logs/vulnhalla.log 73 | LOG_FILE= 74 | 75 | # default or json 76 | LOG_FORMAT=default 77 | 78 | # Console format control: 79 | # - Default: INFO messages are minimal (message only) 80 | # WARNING/ERROR/CRITICAL use simple format (LEVEL - message) 81 | # - If LOG_VERBOSE_CONSOLE=true: 82 | # WARNING/ERROR/CRITICAL use full format 83 | # (timestamp - logger - level - message) 84 | # - INFO always remains minimal regardless of verbose mode 85 | LOG_VERBOSE_CONSOLE=false 86 | 87 | # Control 
third-party library logging verbosity (LiteLLM, urllib3, requests). Default: ERROR 88 | THIRD_PARTY_LOG_LEVEL=ERROR -------------------------------------------------------------------------------- /data/queries/cpp/issues/Copy function using source size.ql: -------------------------------------------------------------------------------- 1 | /** 2 | * @name Copy function using source size 3 | * @description Calling a copy operation with a size derived from the source 4 | * buffer instead of the destination buffer may result in a buffer overflow. 5 | * @kind path-problem 6 | * @id cpp/overflow-destination 7 | * @problem.severity warning 8 | * @security-severity 9.3 9 | * @precision low 10 | * @tags reliability 11 | * security 12 | * external/cwe/cwe-119 13 | * external/cwe/cwe-131 14 | */ 15 | 16 | import cpp 17 | import semmle.code.cpp.ir.dataflow.TaintTracking 18 | import semmle.code.cpp.controlflow.IRGuards 19 | import semmle.code.cpp.security.FlowSources 20 | import OverflowDestination::PathGraph 21 | 22 | /** 23 | * Holds if `fc` is a call to a copy operation where the size argument contains 24 | * a reference to the source argument. 
For example: 25 | * ``` 26 | * memcpy(dest, src, sizeof(src)); 27 | * ``` 28 | */ 29 | predicate sourceSized(FunctionCall fc, Expr src) { 30 | fc.getTarget().hasGlobalOrStdName(["strncpy", "strncat", "memcpy", "memmove"]) and 31 | exists(Expr dest, Expr size, Variable v | 32 | fc.getArgument(0) = dest and 33 | fc.getArgument(1).getFullyConverted() = src and 34 | fc.getArgument(2) = size and 35 | src = v.getAnAccess().getFullyConverted() and 36 | size.getAChild+() = v.getAnAccess() and 37 | // exception: `dest` is also referenced in the size argument 38 | not exists(Variable other | 39 | dest = other.getAnAccess() and size.getAChild+() = other.getAnAccess() 40 | ) and 41 | // exception: `src` and `dest` are both arrays of the same type and size 42 | not exists(ArrayType srctype, ArrayType desttype | 43 | dest.getType().getUnderlyingType() = desttype and 44 | src.getType().getUnderlyingType() = srctype and 45 | desttype.getBaseType().getUnderlyingType() = srctype.getBaseType().getUnderlyingType() and 46 | desttype.getArraySize() = srctype.getArraySize() 47 | ) 48 | ) 49 | } 50 | 51 | predicate readsVariable(LoadInstruction load, Variable var) { 52 | load.getSourceAddress().(VariableAddressInstruction).getAstVariable() = var 53 | } 54 | 55 | predicate hasUpperBoundsCheck(Variable var) { 56 | exists(RelationalOperation oper, VariableAccess access | 57 | oper.getAnOperand() = access and 58 | access.getTarget() = var and 59 | // Comparing to 0 is not an upper bound check 60 | not oper.getAnOperand().getValue() = "0" 61 | ) 62 | } 63 | 64 | predicate nodeIsBarrierEqualityCandidate(DataFlow::Node node, Operand access, Variable checkedVar) { 65 | readsVariable(node.asInstruction(), checkedVar) and 66 | any(IRGuardCondition guard).ensuresEq(access, _, _, node.asInstruction().getBlock(), true) 67 | } 68 | 69 | module OverflowDestinationConfig implements DataFlow::ConfigSig { 70 | predicate isSource(DataFlow::Node source) { source instanceof FlowSource } 71 | 72 | predicate 
#!/usr/bin/env python3
"""
Data models for Vulnhalla UI.
"""

from typing import Callable, Dict, List, Optional, Tuple
from dataclasses import dataclass


@dataclass
class Issue:
    """
    One analyzed issue from the CodeQL analysis results.

    Attributes:
        id (str): Issue identifier extracted from the filename (e.g., "1", "2").
        name (str): Issue name or type.
        file (str): File path basename.
        line (int): Line number where the issue occurs.
        status (str): LLM classification status ("true", "false", or "more").
        issue_type (str): Issue type directory name.
        lang (str): Language code.
        repo (str): Repository name in "org/repo" format (e.g., "redis/redis").
        raw_path (str): Path to the _raw.json file.
        final_path (str): Path to the _final.json file.
        raw_data (Optional[Dict]): Parsed raw JSON data, when loaded.
        final_data (Optional[List]): Parsed final JSON containing LLM messages.
        manual_decision (Optional[str]): Manual verdict set by the user
            ("True Positive", "False Positive", "Uncertain", or None for "Not Set").
    """
    id: str
    name: str
    file: str
    line: int
    status: str
    issue_type: str
    lang: str
    repo: str
    raw_path: str
    final_path: str
    raw_data: Optional[Dict] = None
    final_data: Optional[List] = None
    manual_decision: Optional[str] = None


# Rank used when sorting by LLM decision: confirmed first, then false, then "more".
STATUS_ORDER: Dict[str, int] = {"true": 0, "false": 1, "more": 2}

# Rank used when sorting by manual decision; an unset decision sorts last.
MANUAL_DECISION_ORDER: Dict[Optional[str], int] = {
    "True Positive": 0,
    "False Positive": 1,
    "Uncertain": 2,
    "Not Set": 3,
    None: 3
}

# Internal status value -> human-readable display label.
STATUS_DISPLAY_MAP: Dict[str, str] = {
    "true": "True Positive",
    "false": "False Positive",
    "more": "Needs More Data"
}


def format_status_display(status: str) -> str:
    """
    Map an internal status value to its display label.

    Args:
        status (str): Internal status value ("true", "false", or "more").

    Returns:
        str: Display text; unknown values are returned unchanged.
    """
    label = STATUS_DISPLAY_MAP.get(status)
    return status if label is None else label


def format_manual_decision(manual_decision: Optional[str]) -> str:
    """
    Map a manual decision value to its display label.

    Args:
        manual_decision (Optional[str]): Manual decision value or None.

    Returns:
        str: The decision text, or "Not Set" when no decision was recorded.
    """
    return manual_decision or "Not Set"


def get_default_sort_key(issue: "Issue") -> Tuple[str, float]:
    """
    Default sort key for an issue: repository first, then numeric ID.

    Args:
        issue (Issue): Issue to compute the sort key for.

    Returns:
        Tuple[str, float]: (lower-cased repo, numeric ID or +inf when the ID
        is not purely numeric).
    """
    numeric_id = int(issue.id) if issue.id.isdigit() else float("inf")
    return (issue.repo.lower(), numeric_id)
from typing import Any  # typing.Any, not the builtin any() function


def get_sort_key_for_column(column: str) -> Optional[Callable[["Issue"], Any]]:
    """
    Get the sort key function for a given column name.

    Args:
        column (str): Column name to sort by.

    Returns:
        Optional[Callable[[Issue], Any]]: Sort key function, or None if the
        column is not supported for sorting.
    """
    # Bug fix: these annotations previously used the builtin `any` (a function)
    # where the type `typing.Any` was intended; type checkers reject that form.
    sort_keys: Dict[str, Callable[["Issue"], Any]] = {
        "ID": lambda issue: int(issue.id) if issue.id.isdigit() else float('inf'),
        "Repo": lambda issue: issue.repo.lower(),
        "Issue name": lambda issue: issue.name.lower(),
        "File": lambda issue: issue.file.lower(),
        "LLM decision": lambda issue: STATUS_ORDER.get(issue.status, 99),
        "Manual decision": lambda issue: MANUAL_DECISION_ORDER.get(issue.manual_decision, 3),
    }
    return sort_keys.get(column)
From here, your pull request will be reviewed and, once approved, merged into the project.
Congratulations, you're a contributor! 71 | 72 | ### Reporting Issues 73 | 74 | Before reporting issues, please: 75 | - Check existing issues to avoid duplicates 76 | - Include Python version, OS, and error messages 77 | - Provide steps to reproduce the issue 78 | 79 | ## Logging Guidelines 80 | 81 | Vulnhalla uses centralized logging. Always use `get_logger(__name__)` instead of `print()` for application messages. 82 | 83 | ### Basic Usage 84 | 85 | ```python 86 | from src.utils.logger import get_logger 87 | 88 | logger = get_logger(__name__) 89 | 90 | # ✅ Good 91 | logger.info("Processing database: %s", db_path) 92 | logger.warning("Rate limit approaching: %d requests remaining", remaining) 93 | logger.error("Failed to process: %s", error_message) 94 | logger.debug("Debug information: %s", debug_data) 95 | 96 | # ❌ Bad 97 | print("Processing database:", db_path) # Don't use print() 98 | ``` 99 | 100 | ### Log Levels 101 | 102 | - **`logger.debug()`** - Detailed diagnostics (shown with `LOG_LEVEL=DEBUG`) 103 | - **`logger.info()`** - Status updates, progress messages 104 | - **`logger.warning()`** - Warnings (rate limits, missing data) 105 | - **`logger.error()`** - Errors, failures, exceptions 106 | 107 | ### When Print() is Acceptable 108 | 109 | `print()` is only acceptable for: 110 | - Interactive CLI prompts 111 | - Real-time progress indicators with `\r` (e.g., download progress bars) 112 | 113 | ## Testing 114 | 115 | Please test your changes manually using the example scripts: 116 | 117 | - `python examples/example.py` - Tests the full pipeline 118 | - `python examples/ui_example.py` - Tests the UI 119 | 120 | Ensure your code works with Python 3.10-3.13 before submitting. 
121 | 122 | **Testing with Different Log Levels:** 123 | ```bash 124 | # Test with debug logging 125 | LOG_LEVEL=DEBUG python examples/example.py 126 | 127 | # Test with warning level only 128 | LOG_LEVEL=WARNING python examples/example.py 129 | ``` 130 | 131 | ## Releases 132 | 133 | Releases should only be created by our core maintainers. 134 | 135 | ## Legal 136 | 137 | Any submission of work, including any modification of, or addition to, an existing work ("Contribution") to "Vulnhalla" shall be governed by and subject to the terms of the Apache License, Version 2.0 (the "License") and to the following complementary terms. In case of any conflict or inconsistency between the provisions of the License and the complementary terms, the complementary terms shall prevail. By submitting the Contribution, you represent and warrant that the Contribution is your original creation and you own all right, title and interest in the Contribution. You represent that you are legally entitled to grant the rights set out in the License and herein, without violation of, or conflict with, the rights of any other party. You represent that your Contribution includes complete details of any third-party license or other restriction associated with any part of your Contribution of which you are personally aware. 138 | -------------------------------------------------------------------------------- /src/utils/common_functions.py: -------------------------------------------------------------------------------- 1 | """ 2 | Common utility functions for Vulnhalla. 3 | 4 | This module provides reusable helpers for file and path handling, 5 | working with CodeQL database directories, and other small I/O utilities 6 | that are shared across multiple parts of the project. 
7 | """ 8 | 9 | import os 10 | import zipfile 11 | import yaml 12 | from typing import Any, Dict, List 13 | 14 | from src.utils.exceptions import VulnhallaError, CodeQLError 15 | 16 | 17 | def read_file(file_name: str) -> str: 18 | """ 19 | Read text from a file (UTF-8). 20 | 21 | Args: 22 | file_name (str): The path to the file to be read. 23 | 24 | Returns: 25 | str: The contents of the file, decoded as UTF-8. 26 | 27 | Raises: 28 | VulnhallaError: If file cannot be read (not found, permission denied, encoding error). 29 | """ 30 | try: 31 | with open(file_name, "r", encoding="utf-8") as f: 32 | return f.read() 33 | except FileNotFoundError as e: 34 | raise VulnhallaError(f"File not found: {file_name}") from e 35 | except PermissionError as e: 36 | raise VulnhallaError(f"Permission denied reading file: {file_name}") from e 37 | except UnicodeDecodeError as e: 38 | raise VulnhallaError(f"Failed to decode file as UTF-8: {file_name}") from e 39 | except OSError as e: 40 | raise VulnhallaError(f"OS error while reading file: {file_name}") from e 41 | 42 | 43 | def write_file_text(file_name: str, data: str) -> None: 44 | """ 45 | Write text data to a file (UTF-8). 46 | 47 | Args: 48 | file_name (str): The path to the file to be written. 49 | data (str): The string data to write to the file. 50 | 51 | Raises: 52 | VulnhallaError: If file cannot be written (permission denied, disk full, etc.). 53 | """ 54 | try: 55 | with open(file_name, "w", encoding="utf-8") as f: 56 | f.write(data) 57 | except PermissionError as e: 58 | raise VulnhallaError(f"Permission denied writing file: {file_name}") from e 59 | except OSError as e: 60 | raise VulnhallaError(f"OS error while writing file: {file_name}") from e 61 | 62 | 63 | def write_file_ascii(file_name: str, data: str) -> None: 64 | """ 65 | Write data to a file in ASCII mode (ignores errors). 66 | Useful for contexts similar to the original 'wb' approach 67 | where non-ASCII characters are simply dropped. 
def get_all_dbs(dbs_folder: str) -> List[str]:
    """
    Return a list of all CodeQL database paths under `dbs_folder`.

    A directory two levels below `dbs_folder` counts as a database when it
    contains a `codeql-database.yml` marker file.

    Args:
        dbs_folder (str): The folder containing CodeQL databases.

    Returns:
        List[str]: File-system paths pointing to valid CodeQL databases.

    Raises:
        CodeQLError: If the database folder cannot be accessed.
    """
    try:
        found: List[str] = []
        for top_entry in os.listdir(dbs_folder):
            top_path = os.path.join(dbs_folder, top_entry)
            if not os.path.isdir(top_path):
                continue
            for candidate in os.listdir(top_path):
                candidate_path = os.path.join(top_path, candidate)
                if os.path.exists(os.path.join(candidate_path, "codeql-database.yml")):
                    found.append(candidate_path)
        return found
    except PermissionError as exc:
        raise CodeQLError(f"Permission denied accessing database folder: {dbs_folder}") from exc
    except OSError as exc:
        raise CodeQLError(f"OS error while accessing database folder: {dbs_folder}") from exc


def read_file_lines_from_zip(zip_path: str, file_path_in_zip: str) -> str:
    """
    Read text from a single file within a ZIP archive (UTF-8).

    Args:
        zip_path (str): The path to the ZIP file.
        file_path_in_zip (str): The internal path within the ZIP to the file.

    Returns:
        str: The contents of the file (as UTF-8) located within the ZIP.

    Raises:
        CodeQLError: If the ZIP file cannot be read or the file is absent.
    """
    try:
        with zipfile.ZipFile(zip_path, "r") as archive:
            with archive.open(file_path_in_zip) as member:
                raw = member.read()
        return raw.decode("utf-8")
    except zipfile.BadZipFile as exc:
        raise CodeQLError(f"Invalid or corrupted ZIP file: {zip_path}") from exc
    except KeyError as exc:
        # ZipFile.open raises KeyError for a missing member name.
        raise CodeQLError(f"File '{file_path_in_zip}' not found in ZIP archive: {zip_path}") from exc
    except PermissionError as exc:
        raise CodeQLError(f"Permission denied reading ZIP file: {zip_path}") from exc
    except OSError as exc:
        raise CodeQLError(f"OS error while reading ZIP file: {zip_path}") from exc


def read_yml(file_path: str) -> Dict[str, Any]:
    """
    Read and parse a YAML file, returning its data as a Python dictionary.

    Args:
        file_path (str): The path to the YAML file.

    Returns:
        Dict[str, Any]: The YAML data as a dictionary.

    Raises:
        VulnhallaError: If the file cannot be read or YAML parsing fails.
    """
    try:
        with open(file_path, "r", encoding="utf-8") as handle:
            parsed = yaml.safe_load(handle)
        return parsed
    except FileNotFoundError as exc:
        raise VulnhallaError(f"YAML file not found: {file_path}") from exc
    except PermissionError as exc:
        raise VulnhallaError(f"Permission denied reading YAML file: {file_path}") from exc
    except yaml.YAMLError as exc:
        raise VulnhallaError(f"Failed to parse YAML file: {file_path}") from exc
    except OSError as exc:
        raise VulnhallaError(f"OS error while reading YAML file: {file_path}") from exc
15 | 16 | 17 | ## Scope 18 | 19 | This Code of Conduct applies to all members of the CyberArk community, including paid and unpaid agents, administrators, users, and customers of CyberArk. It applies in all CyberArk community venues, online and in person, including CyberArk Open Source project communities (such as public GitHub repositories, chat channels, social media, mailing lists, and public events) and in one-on-one communications pertaining to CyberArk affairs. 20 | 21 | This policy covers the usage of CyberArk hosted services, as well as the CyberArk website, CyberArk related events, and any other services offered by or on behalf of CyberArk (collectively, the "Service"). 22 | 23 | This Code of Conduct is in addition to, and does not in any way nullify or invalidate, any other terms or conditions related to use of the Service. 24 | 25 | 26 | ## Maintaining a Friendly, Harassment-Free Space 27 | 28 | We are committed to providing a friendly, safe and welcoming environment for all, regardless of gender identity, sexual orientation, ability, ethnicity, religion, age, physical appearance, body size, race, or similar personal characteristics. 29 | 30 | We ask that you please respect that people have differences of opinion regarding technical choices, and that every design or implementation choice carries a trade-off and numerous costs. There is seldom a single right answer. A difference of technology preferences is not a license to be rude. 31 | 32 | Harassing other users of the Service for any reason is never tolerated, whether via public or private media. Any spamming, trolling, flaming, baiting, or other attention-stealing behavior is not welcome, and will not be tolerated. 33 | 34 | Even if your intent is not to harass or offend others, be mindful of how your comments might be perceived by others in the community. 
- Personal insults, particularly those related to gender identity, sexual orientation, ability, ethnicity, religion, age, physical appearance, body size, race, or similar personal characteristics.
71 | 72 | We will not tolerate any form of retaliation towards users who report these issues to us. 73 | 74 | If you feel that you have been falsely or unfairly accused of violating this Code of Conduct by others in the community, you should notify the ReportAbuse@cyberark.com team so that we can address and resolve the accusation. 75 | 76 | As always, if you have an urgent security issue, contact product_security@cyberark.com and if you have concerns about a potential copyright violation, contact legal@cyberark.com. 77 | 78 | 79 | ## Consequences 80 | 81 | All content published to the Service, including user account credentials, is hosted at the sole discretion of the CyberArk administrators. If a community member engages in unacceptable behavior, the CyberArk administrators may take any action they deem appropriate, up to and including a temporary ban or permanent expulsion from the community without warning. In general, we will choose the course of action that we judge as being most in the interest of fostering a safe and friendly community. 82 | 83 | 84 | ## Contact Info 85 | 86 | Please contact ReportAbuse@cyberark.com if you need to report a problem or address a grievance related to an abuse report. 87 | 88 | You are also encouraged to contact us if you have questions about what constitutes appropriate and inappropriate content. We are happy to provide guidance to help you be a successful part of our community. Our technical community is available [here](https://cyberark-customers.force.com/s/). 89 | 90 | 91 | ## Credit and License 92 | 93 | 94 | This Code of Conduct borrows from the [npm Code of Conduct](https://www.npmjs.com/policies/conduct), Stumptown Syndicate [Citizen's Code of Conduct](http://citizencodeofconduct.org/), and the [Rust Project Code of Conduct](https://www.rust-lang.org/conduct.html). 95 | 96 | This document may be reused under a [Creative Commons Attribution-ShareAlike License](https://creativecommons.org/licenses/by-sa/4.0/). 
def get_model_name(provider: Optional[str], model: Optional[str]) -> str:
    """
    Build the model identifier string in the format LiteLLM expects.

    Args:
        provider: Provider name (e.g., "openai", "azure", "anthropic"), or None.
        model: Model name (e.g., "gpt-4o", "claude-3-opus"), or None.

    Returns:
        Model name in LiteLLM format (e.g., "gpt-4o" or "azure/gpt-4o").
    """
    if not model:
        # Nothing configured at all: fall back to the project default model.
        return "gpt-4o"
    if provider == "openai" or not provider:
        # OpenAI models (and models with no provider) are used verbatim.
        return model
    # Every other provider (including Azure) gets a "<provider>/" prefix,
    # added only when it is not already present.
    prefix = f"{provider}/"
    return model if model.startswith(prefix) else prefix + model
" 90 | f"Allowed providers: {', '.join(sorted(ALLOWED_LLM_PROVIDERS))}" 91 | ) 92 | 93 | # Get model name 94 | model = os.getenv("MODEL", "gpt-4o") 95 | 96 | # Get API key and provider-specific config based on provider 97 | api_key = None 98 | endpoint = None 99 | api_version = None 100 | 101 | if provider == "openai": 102 | api_key = os.getenv("OPENAI_API_KEY") 103 | 104 | elif provider == "azure": 105 | api_key = os.getenv("AZURE_OPENAI_API_KEY") or os.getenv("AZURE_API_KEY") 106 | endpoint = os.getenv("AZURE_OPENAI_ENDPOINT") or os.getenv("AZURE_API_BASE") 107 | api_version = os.getenv("AZURE_OPENAI_API_VERSION") or os.getenv("AZURE_API_VERSION", "2024-08-01-preview") 108 | 109 | elif provider == "anthropic": 110 | api_key = os.getenv("ANTHROPIC_API_KEY") 111 | 112 | elif provider == "gemini": 113 | api_key = os.getenv("GOOGLE_API_KEY") 114 | 115 | elif provider == "mistral": 116 | api_key = os.getenv("MISTRAL_API_KEY") 117 | 118 | elif provider == "codestral": 119 | # Codestral uses Mistral API key 120 | api_key = os.getenv("MISTRAL_API_KEY") 121 | 122 | elif provider == "groq": 123 | api_key = os.getenv("GROQ_API_KEY") 124 | 125 | elif provider == "openrouter": 126 | api_key = os.getenv("OPENROUTER_API_KEY") 127 | 128 | elif provider == "huggingface": 129 | api_key = os.getenv("HUGGINGFACE_API_KEY") 130 | 131 | elif provider == "cohere": 132 | api_key = os.getenv("COHERE_API_KEY") or os.getenv("CO_API_KEY") 133 | 134 | elif provider == "bedrock": 135 | # Bedrock uses AWS credentials 136 | api_key = os.getenv("AWS_ACCESS_KEY_ID") 137 | aws_secret = os.getenv("AWS_SECRET_ACCESS_KEY") 138 | aws_region = os.getenv("AWS_REGION_NAME", "us-east-1") 139 | # Store region in endpoint field for Bedrock 140 | endpoint = aws_region 141 | 142 | elif provider == "vertex_ai": 143 | # Vertex AI uses GCP credentials (service account JSON or GOOGLE_APPLICATION_CREDENTIALS) 144 | # No API key needed, but we set a placeholder to pass validation 145 | gcp_creds = 
os.getenv("GOOGLE_APPLICATION_CREDENTIALS") 146 | if not gcp_creds and not os.path.exists(os.path.expanduser("~/.config/gcloud/application_default_credentials.json")): 147 | raise ValueError( 148 | "GCP credentials not found. Set GOOGLE_APPLICATION_CREDENTIALS or run 'gcloud auth application-default login'" 149 | ) 150 | api_key = "vertex_ai_placeholder" 151 | 152 | elif provider == "ollama": 153 | # Ollama uses OLLAMA_BASE_URL (defaults to http://localhost:11434) 154 | endpoint = os.getenv("OLLAMA_BASE_URL", "http://localhost:11434") 155 | # Ollama doesn't require API key, but we set a placeholder to pass validation 156 | api_key = "ollama_placeholder" 157 | 158 | # Get optional parameters 159 | temperature = float(os.getenv("LLM_TEMPERATURE", "0.2")) 160 | top_p = float(os.getenv("LLM_TOP_P", "0.2")) 161 | 162 | config = { 163 | "provider": provider, 164 | "model": get_model_name(provider, model), 165 | "api_key": api_key, 166 | "temperature": temperature, 167 | "top_p": top_p 168 | } 169 | 170 | # Add provider-specific fields 171 | if endpoint: 172 | config["endpoint"] = endpoint 173 | if api_version: 174 | config["api_version"] = api_version 175 | 176 | # Special handling for Bedrock (store AWS region and secret) 177 | if provider == "bedrock": 178 | config["aws_secret_access_key"] = os.getenv("AWS_SECRET_ACCESS_KEY") 179 | config["aws_region"] = endpoint # Store region in endpoint field 180 | 181 | # Special handling for Vertex AI (store GCP project/location if provided) 182 | if provider == "vertex_ai": 183 | if os.getenv("GCP_PROJECT_ID"): 184 | config["gcp_project_id"] = os.getenv("GCP_PROJECT_ID") 185 | if os.getenv("GCP_LOCATION"): 186 | config["gcp_location"] = os.getenv("GCP_LOCATION") 187 | 188 | return config 189 | 190 | 191 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Vulnhalla Setup 
def check_dependencies_installed() -> bool:
    """
    Report whether all required third-party dependencies are importable.

    Returns:
        bool: True if every required package can be imported, False otherwise.
    """
    # Import-probe each requirement; __import__ keeps the list data-driven.
    required_modules = ("requests", "dotenv", "litellm", "yaml", "textual", "pySmartDL")
    try:
        for module_name in required_modules:
            __import__(module_name)
    except ImportError:
        return False
    return True
def _install_codeql_pack(codeql_cmd: str, pack_dir: Path, pack_name: str) -> None:
    """
    Run ``codeql pack install`` inside *pack_dir*, logging a warning on failure.

    Uses subprocess's ``cwd=`` parameter instead of os.chdir() so the process
    working directory is never mutated (exception-safe, no restore needed).

    Args:
        codeql_cmd: Path to the CodeQL CLI executable.
        pack_dir: Directory containing the qlpack.yml to install.
        pack_name: Human-readable pack name used in warning messages.
    """
    if not pack_dir.exists():
        return
    result = subprocess.run(
        [codeql_cmd, "pack", "install"],
        check=False, capture_output=True, text=True, cwd=str(pack_dir),
    )
    if result.returncode != 0:
        logger.warning("Failed to install %s pack: %s", pack_name, result.stderr)


def main():
    """Run the Vulnhalla setup process.

    This script installs Python dependencies (skipping the step when they are
    already importable), locates the CodeQL CLI (configured path or PATH),
    installs the bundled CodeQL query packs, optionally validates the CodeQL
    configuration, and prints next steps for running the analysis pipeline.
    """
    logger.info("Vulnhalla Setup")
    logger.info("=" * 50)

    # Prefer a project-local virtualenv's pip when one exists.
    venv_path = PROJECT_ROOT / "venv"
    use_venv = venv_path.exists()

    if use_venv:
        # Use virtual environment pip
        if os.name == 'nt':  # Windows
            pip_exe = [str(PROJECT_ROOT / "venv/Scripts/pip.exe")]
        else:  # Unix/macOS/Linux
            pip_exe = [str(PROJECT_ROOT / "venv/bin/pip")]
        logger.info("Using virtual environment...")
    else:
        # Use system pip
        pip_exe = [sys.executable, "-m", "pip"]
        logger.info("Installing to current Python environment...")

    if check_dependencies_installed():
        logger.info("✅ All dependencies are already installed! Skipping installation.")
    else:
        # Install dependencies
        logger.info("📦 Installing Python dependencies... This may take a moment ⏳")
        try:
            subprocess.run(pip_exe + ["install", "-q", "-r", str(PROJECT_ROOT / "requirements.txt")], check=True)
            logger.info("✅ Python dependencies installed successfully!")
        except subprocess.CalledProcessError:
            # pip already printed its own error output (not captured above).
            logger.error("\n❌ Setup failed. Please fix the missing dependencies and run setup.py again.")
            sys.exit(1)

    # Install CodeQL packs
    # Check for CodeQL in PATH or .env
    codeql_cmd = None

    try:
        from src.utils.config import get_codeql_path
        from src.utils.config_validator import find_codeql_executable

        codeql_path = get_codeql_path()
        logger.info("Checking CodeQL path: %s", codeql_path)

        # Use helper function to find executable
        codeql_cmd = find_codeql_executable()

        if codeql_cmd:
            if codeql_path == "codeql":
                logger.info("🔍 Checking if 'codeql' is in PATH...")
                logger.info("✅ Found in PATH: %s", codeql_cmd)
            else:
                logger.info("✅ Found CodeQL path: %s", codeql_cmd)
        else:
            # Provide detailed error messages
            if codeql_path and codeql_path != "codeql":
                # Custom path specified - strip quotes if present
                codeql_path_clean = codeql_path.strip('"').strip("'")
                logger.error("❌ Path does not exist: %s", codeql_path_clean)
                if os.name == 'nt':
                    logger.info("Also checked: %s.cmd", codeql_path_clean)
            else:
                logger.info("🔍 Checking if 'codeql' is in PATH...")
                logger.error("❌ 'codeql' not found in PATH")
    except Exception as e:
        # Fallback to checking PATH
        logger.error("❌ Error loading config: %s", e)
        logger.info("🔍 Falling back to PATH check...")
        codeql_cmd = shutil.which("codeql")
        if codeql_cmd:
            logger.info("✅ Found in PATH: %s", codeql_cmd)

    if codeql_cmd:
        logger.info("📦 Installing CodeQL packs... This may take a moment ⏳")
        # Tools and issues packs share the same install procedure.
        _install_codeql_pack(codeql_cmd, PROJECT_ROOT / "data/queries/cpp/tools", "tools")
        _install_codeql_pack(codeql_cmd, PROJECT_ROOT / "data/queries/cpp/issues", "issues")
    else:
        logger.error("❌ CodeQL CLI not found. Skipping CodeQL pack installation.")
        logger.info("🔗 Install CodeQL CLI from: https://github.com/github/codeql-cli-binaries/releases")
        logger.info("   After installation, either add CodeQL to your PATH or set CODEQL_PATH in your .env file.")
        logger.info("   Then run: python setup.py or install packages manually")
        return

    # Optional: Validate CodeQL configuration if .env file exists
    env_file = PROJECT_ROOT / ".env"
    if env_file.exists():
        logger.info("\n🔍 Validating CodeQL configuration...")
        try:
            from src.utils.config_validator import validate_codeql_path
            is_valid, error = validate_codeql_path()
            if is_valid:
                logger.info("✅ CodeQL configuration validated successfully!")
            else:
                logger.warning("⚠️ CodeQL configuration issue detected:")
                logger.warning("   %s", error.split(chr(10))[0])  # Print first line of error
                logger.warning("   Please fix this before running the pipeline.")
        except Exception as e:
            logger.warning("⚠️ Could not validate CodeQL configuration: %s", e)
            logger.info("   This is not critical - you can fix configuration later.")

    logger.info("🎉 Setup completed successfully! 🎉")
    logger.info("🔗 Next steps:")
    if not env_file.exists():
        logger.info("1. Create a .env file with all the required variables (see README.md)")
        logger.info("2. Run one of the following commands to start the pipeline:")
    else:
        logger.info("Run one of the following commands to start the pipeline:")
    # First bullet previously duplicated the bare command; show the repo argument.
    logger.info("  • python src/pipeline.py org/repo  # Analyze a specific repository")
    logger.info("  • python src/pipeline.py  # Analyze top 100 repositories")
    logger.info("  • python examples/example.py  # See a full pipeline run")

if __name__ == "__main__":
    main()
def _log_exception_cause(e: Exception) -> None:
    """
    Log an exception's underlying cause when it adds information.

    Inspects both the explicit ``cause`` attribute (set via a constructor)
    and the implicit ``__cause__`` (set via ``raise ... from e``). The cause
    is logged only when its text is not already embedded in the exception's
    own message, to avoid printing the same detail twice.
    """
    underlying = getattr(e, 'cause', None) or getattr(e, '__cause__', None)
    if underlying is None:
        return
    # Skip the extra line when the message already mentions the cause.
    if str(underlying) not in str(e):
        logger.error(" Cause: %s", underlying)
def analyze_pipeline(repo: Optional[str] = None, lang: str = "c", threads: int = 16, open_ui: bool = True) -> None:
    """
    Run the complete Vulnhalla pipeline: fetch, analyze, classify, and optionally open UI.

    Args:
        repo: Optional GitHub repository name (e.g., "redis/redis"). If None, fetches top repos.
        lang: Programming language code. Defaults to "c".
        threads: Number of threads for CodeQL operations. Defaults to 16.
        open_ui: Whether to open the UI after completion. Defaults to True.

    Note:
        This function catches and handles all exceptions internally, logging errors
        and exiting with code 1 on failure. It does not raise exceptions.
    """
    logger.info("🚀 Starting Vulnhalla Analysis Pipeline")
    logger.info("=" * 60)

    try:
        # Validate configuration before starting
        validate_and_exit_on_error()
    except (CodeQLConfigError, LLMConfigError, VulnhallaError) as e:
        # Format error message for display
        message = f"""
⚠️ Configuration Validation Failed
============================================================
{str(e)}
============================================================
Please fix the configuration errors above and try again.
See README.md for configuration reference.
"""
        logger.error(message)
        _log_exception_cause(e)
        # Exit code 1 signals failure to shells/CI callers.
        sys.exit(1)

    try:
        # Step 1: Fetch CodeQL databases
        logger.info("\n[1/4] Fetching CodeQL Databases")
        logger.info("-" * 60)
        if repo:
            logger.info("Fetching database for: %s", repo)
            fetch_codeql_dbs(lang=lang, threads=threads, single_repo=repo)
        else:
            # Multi-repo mode deliberately uses fewer threads (4) per fetch.
            logger.info("Fetching top repositories for language: %s", lang)
            fetch_codeql_dbs(lang=lang, max_repos=100, threads=4)
    except CodeQLConfigError as e:
        # Config errors are caught before generic CodeQLError (subclass first).
        logger.error("❌ Configuration error while fetching CodeQL databases: %s", e)
        _log_exception_cause(e)
        logger.error("   Please check your GitHub token and permissions.")
        sys.exit(1)
    except CodeQLError as e:
        logger.error("❌ Failed to fetch CodeQL databases: %s", e)
        _log_exception_cause(e)
        logger.error("   Please check file permissions, disk space, and GitHub API access.")
        sys.exit(1)

    try:
        # Step 2: Run CodeQL queries
        logger.info("\n[2/4] Running CodeQL Queries")
        logger.info("-" * 60)
        compile_and_run_codeql_queries(
            codeql_bin=get_codeql_path(),
            lang=lang,
            threads=threads,
            timeout=300
        )
    except CodeQLConfigError as e:
        logger.error("❌ Configuration error while running CodeQL queries: %s", e)
        _log_exception_cause(e)
        logger.error("   Please check your CODEQL_PATH configuration.")
        sys.exit(1)
    except CodeQLExecutionError as e:
        logger.error("❌ Failed to execute CodeQL queries: %s", e)
        _log_exception_cause(e)
        logger.error("   Please check your CodeQL installation and database files.")
        sys.exit(1)
    except CodeQLError as e:
        # Catch-all for any other CodeQL failure not matched above.
        logger.error("❌ CodeQL error: %s", e)
        _log_exception_cause(e)
        sys.exit(1)

    try:
        # Step 3: Classify results with LLM
        logger.info("\n[3/4] Classifying Results with LLM")
        logger.info("-" * 60)
        analyzer = IssueAnalyzer(lang=lang)
        analyzer.run()
    except LLMConfigError as e:
        logger.error("❌ LLM configuration error: %s", e)
        _log_exception_cause(e)
        logger.error("   Please check your LLM configuration and API credentials in .env file.")
        sys.exit(1)
    except LLMApiError as e:
        logger.error("❌ LLM API error: %s", e)
        _log_exception_cause(e)
        logger.error("   Please check your API key, network connection, and rate limits.")
        sys.exit(1)
    except LLMError as e:
        logger.error("❌ LLM error: %s", e)
        _log_exception_cause(e)
        sys.exit(1)
    except CodeQLError as e:
        # The classification step re-reads CodeQL artifacts, so CodeQL errors
        # can surface here too, not only in steps 1-2.
        logger.error("❌ CodeQL error while reading database files: %s", e)
        _log_exception_cause(e)
        logger.error("   This step reads CodeQL database files (YAML, ZIP, CSV) to prepare data for LLM analysis.")
        logger.error("   Please check your CodeQL databases and files are accessible.")
        sys.exit(1)
    except VulnhallaError as e:
        logger.error("❌ File system error while saving results: %s", e)
        _log_exception_cause(e)
        logger.error("   This step writes analysis results to disk and creates output directories.")
        logger.error("   Please check file permissions and disk space.")
        sys.exit(1)

    # Step 4: Open UI
    if open_ui:
        logger.info("\n[4/4] Opening UI")
        logger.info("-" * 60)
        logger.info("✅ Pipeline completed successfully!")
        logger.info("Opening results UI...")
        # ui_main() blocks until the user closes the Textual UI.
        ui_main()
    else:
        logger.info("\n✅ Pipeline completed successfully!")
        logger.info("View results with: python src/ui/ui_app.py")
def main_analyze() -> None:
    """
    CLI entry point for the complete analysis pipeline.
    Usage:
        vulnhalla-analyze                # Analyze top 100 repos
        vulnhalla-analyze redis/redis    # Analyze specific repo
    """
    # The only supported positional argument is an 'org/repo' slug.
    repo = sys.argv[1] if len(sys.argv) > 1 else None
    if repo is not None and "/" not in repo:
        logger.error("❌ Error: Repository must be in format 'org/repo'")
        logger.error("   Example: python src/pipeline.py redis/redis")
        logger.error("   Or run without arguments to analyze top repositories")
        sys.exit(1)
    analyze_pipeline(repo=repo)


if __name__ == '__main__':
    main_analyze()
def extract_line_number_from_location(issue: Issue) -> Optional[int]:
    """
    Pull a line number out of "Location: <file>:<line>" text attached to an issue.

    The raw_data prompt is searched first, then each final_data message in
    order; the first successfully parsed line number wins.

    Args:
        issue (Issue): Issue object containing raw_data and final_data.

    Returns:
        Optional[int]: Parsed line number, or None if none could be extracted.
    """
    # Gather candidate texts in the same priority order the data is stored.
    candidates = []
    if issue.raw_data and "prompt" in issue.raw_data:
        candidates.append(issue.raw_data["prompt"])
    for msg in (issue.final_data or []):
        if isinstance(msg, dict):
            content = msg.get("content", "")
            if content:
                candidates.append(content)

    for text in candidates:
        match = LOCATION_PATTERN.search(text)
        if match:
            try:
                return int(match.group(1))
            except (ValueError, IndexError):
                # Unparsable capture: fall through to the next candidate.
                continue
    return None
def extract_code_blocks_from_text(text: str) -> List[str]:
    """
    Extract Vulnhalla code blocks from text.

    A block starts with a "file: ..." header line followed by one or more
    numbered lines ("123: code"). A numbered line ending in a backslash pulls
    in the next line as a continuation regardless of its shape.

    Args:
        text (str): Text possibly containing code blocks.

    Returns:
        List[str]: Extracted code block strings, in order of appearance.
    """
    if not text:
        return []

    rows = text.split('\n')
    total = len(rows)
    collected = []
    idx = 0

    while idx < total:
        if not FILE_LINE_PATTERN.match(rows[idx]):
            idx += 1
            continue

        # Header found - gather the numbered lines that follow it.
        current = [rows[idx]]
        idx += 1
        while idx < total and NUMBERED_LINE_PATTERN.match(rows[idx]):
            row = rows[idx]
            current.append(row)
            idx += 1
            # A trailing backslash means the statement continues on the
            # next physical line, which is included verbatim.
            if row.rstrip().endswith('\\') and idx < total:
                current.append(rows[idx])
                idx += 1

        # A header with no numbered lines is not a real block.
        if len(current) > 1:
            joined = '\n'.join(current).strip()
            if joined:
                collected.append(joined)

    return collected


def extract_code_from_messages(final_data: Optional[List]) -> List[str]:
    """
    Extract all code blocks from final_data messages in chronological order.

    Args:
        final_data (Optional[List]): LLM conversation message dictionaries.

    Returns:
        List[str]: Extracted code block strings.
    """
    if not final_data:
        return []

    blocks = []
    for entry in final_data:
        if not isinstance(entry, dict):
            continue
        body = entry.get("content", "")
        if isinstance(body, str) and body:
            blocks.extend(extract_code_blocks_from_text(body))
    return blocks
def normalize_code_snippet(snippet: str) -> str:
    """
    Normalize a code snippet for deduplication.

    Line-number prefixes and per-line whitespace are stripped, and the
    "file:" header (if any) has its internal whitespace collapsed, so that
    two renderings of the same block compare equal.

    Args:
        snippet (str): Code snippet to normalize.

    Returns:
        str: Normalized key string (header + code, or code alone).
    """
    text = snippet.strip()
    if not text:
        return ""

    header = None
    m = re.match(r'(file:\s*[^\n]+)\n(.*)', text, re.DOTALL)
    if m:
        # Collapse runs of whitespace in the header so cosmetic differences
        # do not defeat deduplication.
        header = re.sub(r'\s+', ' ', m.group(1).strip())
        raw_lines = m.group(2).split('\n')
    else:
        raw_lines = text.split('\n')

    # Drop "NNN:" prefixes and surrounding whitespace; discard blank lines.
    stripped = (LINE_NUMBER_PATTERN.sub('', ln).strip() for ln in raw_lines)
    body = '\n'.join(ln for ln in stripped if ln)

    return body if header is None else f"{header}\n{body}"


def collect_all_code_snippets(issue: Issue) -> Tuple[str, List[str]]:
    """
    Collect all unique code snippets from final_data, deduplicated and in order.

    Args:
        issue (Issue): Issue object containing final_data.

    Returns:
        Tuple[str, List[str]]: (initial_code, additional_code_list) where
            initial_code is the first snippet ("" when none exist) and
            additional_code_list holds the remaining unique snippets.
    """
    found = extract_code_from_messages(issue.final_data)

    # Keep the first occurrence of each normalized block, preserving order.
    seen_keys = set()
    deduped = []
    for code in found:
        fingerprint = normalize_code_snippet(code)
        if fingerprint and fingerprint not in seen_keys:
            seen_keys.add(fingerprint)
            deduped.append(code)

    if not deduped:
        return ("", [])
    return (deduped[0], deduped[1:])
def extract_last_message(final_data: Optional[List]) -> Optional[str]:
    """
    Extract the last non-empty message content from final_data.

    Args:
        final_data (Optional[List]): LLM conversation message dictionaries.

    Returns:
        Optional[str]: Stripped content of the most recent non-empty
            message, or None when no such message exists.
    """
    # Walk newest-to-oldest; the first usable content wins.
    for entry in reversed(final_data or []):
        if not isinstance(entry, dict):
            continue
        text = entry.get("content", "")
        if isinstance(text, str) and text.strip():
            return text.strip()
    return None
def reset_logging() -> None:
    """
    Reset logging state.

    Removes every handler from the root logger and clears the module's
    initialization flag so that setup_logging() will run again.
    """
    global _logging_initialized
    _logging_initialized = False
    logging.getLogger().handlers.clear()
def suppress_third_party_loggers() -> None:
    """
    Suppress verbose logging from third-party libraries.

    Sets a common level (THIRD_PARTY_LOG_LEVEL env var, defaulting to ERROR)
    on the loggers of libraries known to be noisy: LiteLLM, urllib3, and
    requests.
    """
    # Unrecognized level names silently fall back to ERROR.
    wanted = os.getenv("THIRD_PARTY_LOG_LEVEL", "ERROR").upper()
    level = getattr(logging, wanted, logging.ERROR)

    for noisy in ("LiteLLM", "urllib3", "urllib3.connectionpool", "requests"):
        logging.getLogger(noisy).setLevel(level)
def setup_logging(
    log_level: Optional[str] = None,
    log_file: Optional[str] = None,
    log_format: Optional[str] = None,
    json_format: bool = False,
    simple_format: bool = False
) -> None:
    """
    Configure logging for the application.

    Args:
        log_level: Logging level (DEBUG, INFO, WARNING, ERROR).
            Defaults to environment variable LOG_LEVEL or INFO.
        log_file: Optional path to log file. If None, reads from LOG_FILE env var.
        log_format: Custom log format string. If None, uses default or JSON.
        json_format: If True, use JSON structured logging format.
        simple_format: If True, use simpler format without timestamps for console.
    """
    global _logging_initialized

    # Prevent duplicate initialization (many modules call this on import).
    if _logging_initialized:
        return

    # Get configuration from environment or parameters
    level_str = log_level or os.getenv("LOG_LEVEL", DEFAULT_LOG_LEVEL).upper()
    log_file_path = log_file or os.getenv("LOG_FILE")
    log_format_str = log_format or os.getenv("LOG_FORMAT", "default")
    # Console format control:
    # - Default: INFO messages are minimal (message only), WARNING/ERROR/CRITICAL use simple format (LEVEL - message)
    # - If LOG_VERBOSE_CONSOLE=true: WARNING/ERROR/CRITICAL use full format (timestamp - logger - level - message)
    # - INFO always remains minimal regardless of verbose mode
    use_verbose_console = os.getenv("LOG_VERBOSE_CONSOLE", "false").lower() == "true"
    # Legacy support: LOG_SIMPLE_FORMAT still works but is deprecated in favor of LOG_VERBOSE_CONSOLE
    # NOTE(review): this value is computed but never applied below — confirm
    # whether simple_format/LOG_SIMPLE_FORMAT should still influence output.
    use_simple_format = simple_format or os.getenv("LOG_SIMPLE_FORMAT", "false").lower() == "true"

    # Convert string level to logging constant (unknown names fall back to INFO).
    numeric_level = getattr(logging, level_str, logging.INFO)

    # Configure root logger
    root_logger = logging.getLogger()
    root_logger.setLevel(numeric_level)

    # Remove existing handlers to avoid duplicates
    root_logger.handlers.clear()

    # Console handler (always present)
    console_handler = logging.StreamHandler(sys.stdout)
    console_handler.setLevel(numeric_level)

    if json_format or log_format_str.lower() == "json":
        # JSON structured logging. json/datetime are always available in the
        # standard library, so no ImportError fallback is needed.
        import json
        from datetime import datetime, timezone

        class JSONFormatter(logging.Formatter):
            """Formatter that renders log records as JSON strings.

            Converts the LogRecord into a JSON object with a timestamp,
            logger name, level, and message, and can optionally include
            extra fields such as progress.
            """

            def format(self, record: logging.LogRecord) -> str:
                """Format a LogRecord as a JSON string.

                Args:
                    record: The log record to format.

                Returns:
                    str: A JSON representation of the log record.
                """
                log_entry = {
                    # Timezone-aware UTC timestamp; datetime.utcnow() is
                    # deprecated since Python 3.12 and returns a naive value.
                    "timestamp": datetime.now(timezone.utc).isoformat(),
                    "level": record.levelname,
                    "logger": record.name,
                    "message": record.getMessage(),
                }
                # Add extra fields if present
                if hasattr(record, "progress"):
                    log_entry["progress"] = record.progress
                return json.dumps(log_entry)

        console_handler.setFormatter(JSONFormatter())
    else:
        # Standard formatted logging with level-based formatting
        # Default behavior:
        #   - INFO: minimal format (message only)
        #   - WARNING/ERROR/CRITICAL: simple format (LEVEL - message)
        # If LOG_VERBOSE_CONSOLE=true:
        #   - INFO: still minimal (message only)
        #   - WARNING/ERROR/CRITICAL: full format (timestamp - logger - level - message)
        class LevelBasedFormatter(logging.Formatter):
            """Formatter that uses different formats depending on log level.

            INFO messages are rendered in a minimal format (message only),
            while WARNING/ERROR/CRITICAL messages use either a simple format
            (LEVEL - message) or a full format with timestamp and logger name.
            """
            def __init__(self, full_format: str, simple_format: str, datefmt: Optional[str] = None, verbose: bool = False) -> None:
                # Initialize with simple format as base (for WARNING/ERROR default behavior)
                super().__init__(simple_format, datefmt)
                self.full_format = full_format
                self.simple_format = simple_format
                self.verbose = verbose
                # The full formatter is only built when verbose mode needs it.
                self._full_formatter = logging.Formatter(full_format, datefmt) if verbose else None

            def format(self, record: logging.LogRecord) -> str:
                """Format a LogRecord using level-based formatting.

                Args:
                    record: The log record to format.

                Returns:
                    str: The formatted log message.
                """
                # For INFO level, always use minimal format (just the message)
                if record.levelno == logging.INFO:
                    return record.getMessage()
                # For WARNING, ERROR, CRITICAL
                else:
                    if self.verbose and self._full_formatter:
                        # Use full format with timestamp when verbose mode is enabled
                        return self._full_formatter.format(record)
                    else:
                        # Use simple format (LEVEL - message) by default
                        return super().format(record)

        formatter = LevelBasedFormatter(
            DEFAULT_LOG_FORMAT,
            DEFAULT_LOG_FORMAT_SIMPLE,
            datefmt=DEFAULT_DATE_FORMAT,
            verbose=use_verbose_console
        )
        console_handler.setFormatter(formatter)

    root_logger.addHandler(console_handler)

    # Suppress noisy third-party loggers
    suppress_third_party_loggers()

    # File handler (optional)
    if log_file_path:
        try:
            # Ensure log directory exists
            log_path = Path(log_file_path)
            log_path.parent.mkdir(parents=True, exist_ok=True)

            file_handler = logging.FileHandler(log_file_path, encoding="utf-8")
            file_handler.setLevel(logging.DEBUG)  # File always gets DEBUG level
            file_formatter = logging.Formatter(
                DEFAULT_LOG_FORMAT,
                datefmt=DEFAULT_DATE_FORMAT
            )
            file_handler.setFormatter(file_formatter)
            root_logger.addHandler(file_handler)
        except Exception as e:
            # If file logging fails, log to console and continue
            root_logger.warning("Failed to set up file logging: %s", e)

    _logging_initialized = True


def get_logger(name: str) -> logging.Logger:
    """
    Get a logger instance for a module.

    This is a convenience function that ensures logging is initialized
    and returns a logger with the given name.

    Args:
        name: Logger name (typically __name__ from the calling module)

    Returns:
        Logger instance
    """
    # Ensure logging is set up (idempotent)
    if not _logging_initialized:
        setup_logging()

    return logging.getLogger(name)
For the purposes of this definition, 13 | "control" means (i) the power, direct or indirect, to cause the 14 | direction or management of such entity, whether by contract or 15 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 16 | outstanding shares, or (iii) beneficial ownership of such entity. 17 | "You" (or "Your") shall mean an individual or Legal Entity 18 | exercising permissions granted by this License. 19 | "Source" form shall mean the preferred form for making modifications, 20 | including but not limited to software source code, documentation 21 | source, and configuration files. 22 | "Object" form shall mean any form resulting from mechanical 23 | transformation or translation of a Source form, including but 24 | not limited to compiled object code, generated documentation, 25 | and conversions to other media types. 26 | "Work" shall mean the work of authorship, whether in Source or 27 | Object form, made available under the License, as indicated by a 28 | copyright notice that is included in or attached to the work 29 | (an example is provided in the Appendix below). 30 | "Derivative Works" shall mean any work, whether in Source or Object 31 | form, that is based on (or derived from) the Work and for which the 32 | editorial revisions, annotations, elaborations, or other modifications 33 | represent, as a whole, an original work of authorship. For the purposes 34 | of this License, Derivative Works shall not include works that remain 35 | separable from, or merely link (or bind by name) to the interfaces of, 36 | the Work and Derivative Works thereof. 37 | "Contribution" shall mean any work of authorship, including 38 | the original version of the Work and any modifications or additions 39 | to that Work or Derivative Works thereof, that is intentionally 40 | submitted to Licensor for inclusion in the Work by the copyright owner 41 | or by an individual or Legal Entity authorized to submit on behalf of 42 | the copyright owner. 
For the purposes of this definition, "submitted" 43 | means any form of electronic, verbal, or written communication sent 44 | to the Licensor or its representatives, including but not limited to 45 | communication on electronic mailing lists, source code control systems, 46 | and issue tracking systems that are managed by, or on behalf of, the 47 | Licensor for the purpose of discussing and improving the Work, but 48 | excluding communication that is conspicuously marked or otherwise 49 | designated in writing by the copyright owner as "Not a Contribution." 50 | "Contributor" shall mean Licensor and any individual or Legal Entity 51 | on behalf of whom a Contribution has been received by Licensor and 52 | subsequently incorporated within the Work. 53 | 2. Grant of Copyright License. Subject to the terms and conditions of 54 | this License, each Contributor hereby grants to You a perpetual, 55 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 56 | copyright license to reproduce, prepare Derivative Works of, 57 | publicly display, publicly perform, sublicense, and distribute the 58 | Work and such Derivative Works in Source or Object form. 59 | 3. Grant of Patent License. Subject to the terms and conditions of 60 | this License, each Contributor hereby grants to You a perpetual, 61 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 62 | (except as stated in this section) patent license to make, have made, 63 | use, offer to sell, sell, import, and otherwise transfer the Work, 64 | where such license applies only to those patent claims licensable 65 | by such Contributor that are necessarily infringed by their 66 | Contribution(s) alone or by combination of their Contribution(s) 67 | with the Work to which such Contribution(s) was submitted. 
If You 68 | institute patent litigation against any entity (including a 69 | cross-claim or counterclaim in a lawsuit) alleging that the Work 70 | or a Contribution incorporated within the Work constitutes direct 71 | or contributory patent infringement, then any patent licenses 72 | granted to You under this License for that Work shall terminate 73 | as of the date such litigation is filed. 74 | 4. Redistribution. You may reproduce and distribute copies of the 75 | Work or Derivative Works thereof in any medium, with or without 76 | modifications, and in Source or Object form, provided that You 77 | meet the following conditions: 78 | (a) You must give any other recipients of the Work or 79 | Derivative Works a copy of this License; and 80 | (b) You must cause any modified files to carry prominent notices 81 | stating that You changed the files; and 82 | (c) You must retain, in the Source form of any Derivative Works 83 | that You distribute, all copyright, patent, trademark, and 84 | attribution notices from the Source form of the Work, 85 | excluding those notices that do not pertain to any part of 86 | the Derivative Works; and 87 | (d) If the Work includes a "NOTICE" text file as part of its 88 | distribution, then any Derivative Works that You distribute must 89 | include a readable copy of the attribution notices contained 90 | within such NOTICE file, excluding those notices that do not 91 | pertain to any part of the Derivative Works, in at least one 92 | of the following places: within a NOTICE text file distributed 93 | as part of the Derivative Works; within the Source form or 94 | documentation, if provided along with the Derivative Works; or, 95 | within a display generated by the Derivative Works, if and 96 | wherever such third-party notices normally appear. The contents 97 | of the NOTICE file are for informational purposes only and 98 | do not modify the License. 
You may add Your own attribution 99 | notices within Derivative Works that You distribute, alongside 100 | or as an addendum to the NOTICE text from the Work, provided 101 | that such additional attribution notices cannot be construed 102 | as modifying the License. 103 | You may add Your own copyright statement to Your modifications and 104 | may provide additional or different license terms and conditions 105 | for use, reproduction, or distribution of Your modifications, or 106 | for any such Derivative Works as a whole, provided Your use, 107 | reproduction, and distribution of the Work otherwise complies with 108 | the conditions stated in this License. 109 | 5. Submission of Contributions. Unless You explicitly state otherwise, 110 | any Contribution intentionally submitted for inclusion in the Work 111 | by You to the Licensor shall be under the terms and conditions of 112 | this License, without any additional terms or conditions. 113 | Notwithstanding the above, nothing herein shall supersede or modify 114 | the terms of any separate license agreement you may have executed 115 | with Licensor regarding such Contributions. 116 | 6. Trademarks. This License does not grant permission to use the trade 117 | names, trademarks, service marks, or product names of the Licensor, 118 | except as required for reasonable and customary use in describing the 119 | origin of the Work and reproducing the content of the NOTICE file. 120 | 7. Disclaimer of Warranty. Unless required by applicable law or 121 | agreed to in writing, Licensor provides the Work (and each 122 | Contributor provides its Contributions) on an "AS IS" BASIS, 123 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 124 | implied, including, without limitation, any warranties or conditions 125 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 126 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 127 | appropriateness of using or redistributing the Work and assume any 128 | risks associated with Your exercise of permissions under this License. 129 | 8. Limitation of Liability. In no event and under no legal theory, 130 | whether in tort (including negligence), contract, or otherwise, 131 | unless required by applicable law (such as deliberate and grossly 132 | negligent acts) or agreed to in writing, shall any Contributor be 133 | liable to You for damages, including any direct, indirect, special, 134 | incidental, or consequential damages of any character arising as a 135 | result of this License or out of the use or inability to use the 136 | Work (including but not limited to damages for loss of goodwill, 137 | work stoppage, computer failure or malfunction, or any and all 138 | other commercial damages or losses), even if such Contributor 139 | has been advised of the possibility of such damages. 140 | 9. Accepting Warranty or Additional Liability. While redistributing 141 | the Work or Derivative Works thereof, You may choose to offer, 142 | and charge a fee for, acceptance of support, warranty, indemnity, 143 | or other liability obligations and/or rights consistent with this 144 | License. However, in accepting such obligations, You may act only 145 | on Your own behalf and on Your sole responsibility, not on behalf 146 | of any other Contributor, and only if You agree to indemnify, 147 | defend, and hold each Contributor harmless for any liability 148 | incurred by, or claims asserted against, such Contributor by reason 149 | of your accepting any such warranty or additional liability. 150 | END OF TERMS AND CONDITIONS 151 | APPENDIX: How to apply the Apache License to your work. 152 | To apply the Apache License to your work, attach the following 153 | boilerplate notice, with the fields enclosed by brackets "[]" 154 | replaced with your own identifying information. (Don't include 155 | the brackets!) 
The text should be enclosed in the appropriate 156 | comment syntax for the file format. We also recommend that a 157 | file or class name and description of purpose be included on the 158 | same "printed page" as the copyright notice for easier 159 | identification within third-party archives. 160 | 161 | Copyright (c) 2025 CyberArk Software Ltd. All rights reserved. 162 | Licensed under the Apache License, Version 2.0 (the "License"); 163 | you may not use this file except in compliance with the License. 164 | You may obtain a copy of the License at 165 | http://www.apache.org/licenses/LICENSE-2.0 166 | Unless required by applicable law or agreed to in writing, software 167 | distributed under the License is distributed on an "AS IS" BASIS, 168 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 169 | See the License for the specific language governing permissions and 170 | limitations under the License. -------------------------------------------------------------------------------- /src/codeql/run_codeql_queries.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Compile and run CodeQL queries on CodeQL databases for a specific language. 4 | 5 | Requires that CodeQL is installed or available under the CODEQL path. 6 | By default, it compiles all .ql files under 'data/queries//tools' and 7 | 'data/queries//issues', then runs them on each CodeQL database located 8 | in 'output/databases/'. 
9 | 10 | Example: 11 | python src/codeql/run_codeql_queries.py 12 | """ 13 | 14 | import subprocess 15 | import os 16 | 17 | # Make sure your common_functions module is in your PYTHONPATH or same folder 18 | from src.utils.common_functions import get_all_dbs 19 | from src.utils.config import get_codeql_path 20 | from src.utils.logger import get_logger 21 | from src.utils.exceptions import CodeQLError, CodeQLConfigError, CodeQLExecutionError 22 | 23 | logger = get_logger(__name__) 24 | 25 | 26 | # Default locations/values 27 | DEFAULT_CODEQL = get_codeql_path() 28 | DEFAULT_LANG = "c" # Mapped to data/queries/cpp for some tasks 29 | 30 | 31 | def pre_compile_ql(file_name: str, threads: int, codeql_bin: str) -> None: 32 | """ 33 | Pre-compile a single .ql file using CodeQL. 34 | 35 | Args: 36 | file_name (str): The path to the .ql query file. 37 | threads (int): Number of threads to use during compilation. 38 | codeql_bin (str): Full path to the 'codeql' executable. 39 | 40 | Raises: 41 | CodeQLConfigError: If CodeQL executable not found. 42 | CodeQLExecutionError: If query compilation fails. 43 | """ 44 | if not os.path.exists(file_name + "x"): 45 | try: 46 | subprocess.run( 47 | [ 48 | codeql_bin, 49 | "query", 50 | "compile", 51 | file_name, 52 | f'--threads={threads}', 53 | "--precompile" 54 | ], 55 | check=True, 56 | text=True, 57 | stdout=subprocess.DEVNULL, 58 | stderr=subprocess.DEVNULL 59 | ) 60 | except FileNotFoundError as e: 61 | raise CodeQLConfigError( 62 | f"CodeQL executable not found: {codeql_bin}. " 63 | "Please check your CODEQL_PATH configuration." 64 | ) from e 65 | except subprocess.CalledProcessError as e: 66 | raise CodeQLExecutionError( 67 | f"Failed to compile query {file_name}: CodeQL returned exit code {e.returncode}" 68 | ) from e 69 | 70 | 71 | def compile_all_queries(queries_folder: str, threads: int, codeql_bin: str) -> None: 72 | """ 73 | Recursively pre-compile all .ql files in a folder. 
74 | 75 | Args: 76 | queries_folder (str): Directory containing .ql files (and possibly subdirectories). 77 | threads (int): Number of threads to use during compilation. 78 | codeql_bin (str): Full path to the 'codeql' executable. 79 | 80 | Raises: 81 | CodeQLConfigError: If CodeQL executable not found. 82 | CodeQLExecutionError: If query compilation fails. 83 | """ 84 | for subdir, dirs, files in os.walk(queries_folder): 85 | for file in files: 86 | if os.path.splitext(file)[1].lower() == ".ql": 87 | file_path = os.path.join(subdir, file) 88 | pre_compile_ql(file_path, threads, codeql_bin) 89 | 90 | 91 | def run_one_query( 92 | query_file: str, 93 | curr_db: str, 94 | output_bqrs: str, 95 | output_csv: str, 96 | threads: int, 97 | codeql_bin: str 98 | ) -> None: 99 | """ 100 | Execute a single CodeQL query on a specific database and export the results. 101 | 102 | Args: 103 | query_file (str): The path to the .ql file to run. 104 | curr_db (str): The path to the CodeQL database on which to run queries. 105 | output_bqrs (str): Where to write the intermediate BQRS output. 106 | output_csv (str): Where to write the CSV representation of the results. 107 | threads (int): Number of threads to use during query execution. 108 | codeql_bin (str): Full path to the 'codeql' executable. 109 | 110 | Raises: 111 | CodeQLConfigError: If CodeQL executable not found. 112 | CodeQLExecutionError: If query execution or BQRS decoding fails. 113 | """ 114 | # Run the query 115 | try: 116 | subprocess.run( 117 | [ 118 | codeql_bin, "query", "run", query_file, 119 | f'--database={curr_db}', 120 | f'--output={output_bqrs}', 121 | f'--threads={threads}' 122 | ], 123 | check=True, 124 | text=True, 125 | stdout=subprocess.DEVNULL, 126 | stderr=subprocess.DEVNULL 127 | ) 128 | except FileNotFoundError as e: 129 | raise CodeQLConfigError( 130 | f"CodeQL executable not found: {codeql_bin}. " 131 | "Please check your CODEQL_PATH configuration." 
132 | ) from e 133 | except subprocess.CalledProcessError as e: 134 | raise CodeQLExecutionError( 135 | f"Failed to run query {query_file} on database {curr_db}: " 136 | f"CodeQL returned exit code {e.returncode}" 137 | ) from e 138 | 139 | # Decode BQRS to CSV 140 | try: 141 | subprocess.run( 142 | [ 143 | codeql_bin, "bqrs", "decode", output_bqrs, 144 | '--format=csv', f'--output={output_csv}' 145 | ], 146 | check=True, 147 | text=True, 148 | stdout=subprocess.DEVNULL, 149 | stderr=subprocess.DEVNULL 150 | ) 151 | except subprocess.CalledProcessError as e: 152 | raise CodeQLExecutionError( 153 | f"Failed to decode BQRS file {output_bqrs} to CSV: " 154 | f"CodeQL returned exit code {e.returncode}" 155 | ) from e 156 | 157 | 158 | def run_queries_on_db( 159 | curr_db: str, 160 | tools_folder: str, 161 | queries_folder: str, 162 | threads: int, 163 | codeql_bin: str, 164 | timeout: int = 300 165 | ) -> None: 166 | """ 167 | Execute all tool queries in 'tools_folder' individually on a given database, 168 | then run a bulk 'database analyze' with all queries in 'queries_folder'. 169 | 170 | Args: 171 | curr_db (str): The path to the CodeQL database. 172 | tools_folder (str): Folder containing individual .ql files to run. 173 | queries_folder (str): Folder containing .ql queries for bulk analysis. 174 | threads (int): Number of threads to use during query execution. 175 | codeql_bin (str): Full path to the 'codeql' executable. 176 | timeout (int, optional): Timeout in seconds for the bulk 'database analyze'. 177 | Defaults to 300. 178 | 179 | Raises: 180 | CodeQLConfigError: If CodeQL executable not found. 181 | CodeQLExecutionError: If query execution or database analysis fails. 
182 | """ 183 | # 1) Run each .ql in tools_folder individually 184 | if os.path.isdir(tools_folder): 185 | for file in os.listdir(tools_folder): 186 | if os.path.splitext(file)[1].lower() == ".ql": 187 | run_one_query( 188 | os.path.join(tools_folder, file), 189 | curr_db, 190 | os.path.join(curr_db, os.path.splitext(file)[0] + ".bqrs"), 191 | os.path.join(curr_db, os.path.splitext(file)[0] + ".csv"), 192 | threads, 193 | codeql_bin 194 | ) 195 | else: 196 | logger.warning("Tools folder '%s' not found. Skipping individual queries.", tools_folder) 197 | 198 | # 2) Run the entire queries folder in one go (bulk analysis) 199 | if os.path.isdir(queries_folder): 200 | try: 201 | subprocess.run( 202 | [ 203 | codeql_bin, 204 | "database", 205 | "analyze", 206 | curr_db, 207 | queries_folder, 208 | f'--timeout={timeout}', 209 | '--format=csv', 210 | f'--output={os.path.join(curr_db, "issues.csv")}', 211 | f'--threads={threads}' 212 | ], 213 | check=True, 214 | text=True, 215 | stdout=subprocess.DEVNULL, 216 | stderr=subprocess.DEVNULL 217 | ) 218 | except FileNotFoundError as e: 219 | raise CodeQLConfigError( 220 | f"CodeQL executable not found: {codeql_bin}. " 221 | "Please check your CODEQL_PATH configuration." 222 | ) from e 223 | except subprocess.CalledProcessError as e: 224 | raise CodeQLExecutionError( 225 | f"Failed to analyze database {curr_db} with queries from {queries_folder}: " 226 | f"CodeQL returned exit code {e.returncode}" 227 | ) from e 228 | else: 229 | logger.warning("Queries folder '%s' not found. Skipping bulk analysis.", queries_folder) 230 | 231 | 232 | def compile_and_run_codeql_queries( 233 | codeql_bin: str = DEFAULT_CODEQL, 234 | lang: str = DEFAULT_LANG, 235 | threads: int = 16, 236 | timeout: int = 300 237 | ) -> None: 238 | """ 239 | Compile and run CodeQL queries on CodeQL databases for a specific language. 240 | 241 | 1. Pre-compile all .ql files in the tools and queries folders. 242 | 2. Enumerate all CodeQL DBs for the given language. 
243 | 3. Run each DB against both the 'tools' and 'issues' queries folders. 244 | 245 | Args: 246 | codeql_bin (str, optional): Full path to the 'codeql' executable. Defaults to DEFAULT_CODEQL. 247 | lang (str, optional): Language code. Defaults to 'c' (which maps to data/queries/cpp). 248 | threads (int, optional): Number of threads for compilation/execution. Defaults to 16. 249 | timeout (int, optional): Timeout in seconds for bulk analysis. Defaults to 300. 250 | 251 | Raises: 252 | CodeQLConfigError: If CodeQL executable not found (from compilation or query execution). 253 | CodeQLExecutionError: If query compilation or execution fails. 254 | """ 255 | # Setup paths 256 | queries_subfolder = "cpp" if lang == "c" else lang 257 | queries_folder = os.path.join("data/queries", queries_subfolder, "issues") 258 | tools_folder = os.path.join("data/queries", queries_subfolder, "tools") 259 | dbs_folder = os.path.join("output/databases", lang) 260 | 261 | # Step 1: Pre-compile all queries 262 | compile_all_queries(tools_folder, threads, codeql_bin) 263 | compile_all_queries(queries_folder, threads, codeql_bin) 264 | 265 | # Step 2: List databases and run queries 266 | logger.info("Running queries on each DB in %s", dbs_folder) 267 | 268 | # List what's in the folder for debugging 269 | try: 270 | contents = os.listdir(dbs_folder) 271 | if len(contents) == 0: 272 | logger.warning("Database folder '%s' is empty. No databases to process.", dbs_folder) 273 | return 274 | logger.debug("Found %d item(s) in database folder: %s", len(contents), contents) 275 | except OSError as e: 276 | logger.warning("Cannot access database folder '%s': %s. No databases to process.", dbs_folder, e) 277 | return 278 | 279 | dbs_path = get_all_dbs(dbs_folder) 280 | 281 | if len(dbs_path) == 0: 282 | logger.warning("No valid databases found in '%s'. 
Expected structure: ///codeql-database.yml", dbs_folder) 283 | logger.warning("Make sure databases were downloaded and extracted successfully.") 284 | return 285 | 286 | for curr_db in dbs_path: 287 | logger.info("Processing DB: %s", curr_db) 288 | 289 | # Check if database folder is empty 290 | if os.path.isdir(curr_db): 291 | try: 292 | if len(os.listdir(curr_db)) == 0: 293 | logger.warning("Database folder '%s' is empty. Skipping queries.", curr_db) 294 | continue 295 | except OSError: 296 | logger.warning("Cannot access database folder '%s'. Skipping.", curr_db) 297 | continue 298 | 299 | # If issues.csv was not generated yet, or FunctionTree.csv missing, run 300 | if (not os.path.exists(os.path.join(curr_db, "FunctionTree.csv")) or 301 | not os.path.exists(os.path.join(curr_db, "issues.csv"))): 302 | run_queries_on_db( 303 | curr_db, 304 | tools_folder, 305 | queries_folder, 306 | threads, 307 | codeql_bin, 308 | timeout 309 | ) 310 | else: 311 | logger.info("Output files already exist for this DB, skipping...") 312 | 313 | logger.info("All databases processed.") 314 | 315 | 316 | def main_cli() -> None: 317 | """ 318 | CLI entry point for running codeql queries with defaults. 319 | """ 320 | compile_and_run_codeql_queries( 321 | codeql_bin=DEFAULT_CODEQL, 322 | lang=DEFAULT_LANG, 323 | threads=16, 324 | timeout=300 325 | ) 326 | 327 | 328 | if __name__ == '__main__': 329 | # Initialize logging 330 | from src.utils.logger import setup_logging 331 | setup_logging() 332 | 333 | main_cli() 334 | -------------------------------------------------------------------------------- /src/utils/config_validator.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Configuration Validator Module 4 | 5 | Validates configuration at startup to catch errors early with clear messages. 
6 | """ 7 | 8 | import os 9 | from shlex import join 10 | import shutil 11 | from typing import Any, Dict, List, Optional, Tuple 12 | from src.utils.config import get_codeql_path 13 | from src.utils.llm_config import load_llm_config, ALLOWED_LLM_PROVIDERS 14 | from src.utils.logger import get_logger 15 | 16 | logger = get_logger(__name__) 17 | 18 | 19 | def is_placeholder_api_key(api_key: Optional[str]) -> bool: 20 | """ 21 | Check if an API key is a placeholder value. 22 | 23 | Checks for common placeholders: "your_api_key" (from .env.example) and "sk-..." 24 | 25 | Args: 26 | api_key: API key to check 27 | 28 | Returns: 29 | True if the API key appears to be a placeholder, False otherwise 30 | """ 31 | if not api_key: 32 | return True 33 | 34 | api_key_str = str(api_key).strip() 35 | # Strip quotes if present (from .env file) 36 | api_key_str = api_key_str.strip('"').strip("'") 37 | api_key_lower = api_key_str.lower() 38 | 39 | # Check for the placeholder used in .env.example 40 | if "your_api_key" in api_key_lower or api_key_lower == "your-api-key": 41 | return True 42 | 43 | # Check for "sk-..." placeholder pattern 44 | if api_key_str == "sk-...": 45 | return True 46 | 47 | return False 48 | 49 | 50 | def find_codeql_executable() -> Optional[str]: 51 | """ 52 | Find the actual CodeQL executable path to use. 53 | 54 | Returns: 55 | Path to CodeQL executable if found, None otherwise. 56 | On Windows, returns path with .cmd extension if needed. 
57 | """ 58 | try: 59 | codeql_path = get_codeql_path() 60 | 61 | # Strip quotes if present 62 | if codeql_path: 63 | codeql_path = codeql_path.strip('"').strip("'") 64 | 65 | # If default "codeql", check if it's in PATH 66 | if codeql_path == "codeql": 67 | return shutil.which("codeql") 68 | 69 | # Custom path provided - check if file exists 70 | if os.path.exists(codeql_path): 71 | return codeql_path 72 | 73 | # Check with extensions (Windows) 74 | if os.name == 'nt': 75 | # Check .cmd extension (CodeQL uses .cmd on Windows) 76 | if os.path.exists(codeql_path + ".cmd"): 77 | return codeql_path + ".cmd" 78 | # Also check .exe for compatibility 79 | if os.path.exists(codeql_path + ".exe"): 80 | return codeql_path + ".exe" 81 | 82 | return None 83 | except Exception: 84 | # Fallback to checking PATH 85 | return shutil.which("codeql") 86 | 87 | 88 | def validate_codeql_path() -> Tuple[bool, Optional[str]]: 89 | """ 90 | Validate that CodeQL executable exists. 91 | 92 | Returns: 93 | Tuple of (is_valid, error_message) 94 | - is_valid: True if CodeQL path is valid 95 | - error_message: Error message if invalid, None if valid 96 | """ 97 | codeql_path = get_codeql_path() 98 | 99 | # Check for placeholder value 100 | codeql_path_str = str(codeql_path).strip().lower() 101 | if "your_codeql_path" in codeql_path_str or codeql_path_str == "your-codeql-path": 102 | return False, ( 103 | "CODEQL_PATH appears to be a placeholder value.\n" 104 | "Please set CODEQL_PATH in your .env file to the actual path of the CodeQL executable.\n" 105 | "On Windows: C:\\path\\to\\codeql\\codeql.cmd\n" 106 | "On Linux/macOS: /path/to/codeql/codeql or add 'codeql' to your PATH" 107 | ) 108 | 109 | # If default "codeql", check if it's in PATH 110 | if codeql_path == "codeql": 111 | codeql_cmd = shutil.which("codeql") 112 | if not codeql_cmd: 113 | return False, ( 114 | "CodeQL not found in PATH. Please either:\n" 115 | " 1. Install CodeQL and add it to your PATH, or\n" 116 | " 2. 
Set CODEQL_PATH in your .env file to the full path of the CodeQL executable.\n" 117 | " On Windows: C:\\path\\to\\codeql\\codeql.cmd" 118 | ) 119 | return True, None 120 | 121 | # Custom path provided - check if file exists 122 | if not os.path.exists(codeql_path): 123 | # Check with .cmd extension (CodeQL uses .cmd on Windows) 124 | if os.name == 'nt': 125 | if os.path.exists(codeql_path + ".cmd"): 126 | return True, None 127 | 128 | return False, ( 129 | f"CodeQL executable not found at: {codeql_path}\n" 130 | "Please check that CODEQL_PATH in your .env file is correct.\n" 131 | "On Windows, the path must end with .cmd (e.g., C:\\path\\to\\codeql\\codeql.cmd)" 132 | ) 133 | 134 | return True, None 135 | 136 | 137 | def validate_llm_config_dict(config: Dict[str, Any]) -> bool: 138 | """ 139 | Validate LLM configuration dictionary. 140 | 141 | Args: 142 | config: Configuration dictionary 143 | 144 | Returns: 145 | True if valid, raises ValueError if invalid 146 | """ 147 | # Check required fields 148 | required_fields = ["provider", "model"] 149 | 150 | for field in required_fields: 151 | if field not in config: 152 | raise ValueError(f"Missing required configuration field: {field}") 153 | 154 | # Normalize aliases to canonical provider name 155 | provider = config["provider"] 156 | if provider == "google": 157 | provider = "gemini" 158 | config["provider"] = provider # Update config with normalized value 159 | 160 | # Validate provider is in allowed list 161 | if provider not in ALLOWED_LLM_PROVIDERS: 162 | raise ValueError( 163 | f"Provider '{provider}' is not supported. 
" 164 | f"Allowed providers: {', '.join(sorted(ALLOWED_LLM_PROVIDERS))}" 165 | ) 166 | 167 | # Validate provider specific requirements 168 | if provider == "azure": 169 | if "endpoint" not in config: 170 | raise ValueError("Azure provider requires 'endpoint' in configuration") 171 | if "api_key" not in config or not config["api_key"]: 172 | raise ValueError("Azure provider requires 'api_key' in configuration") 173 | if is_placeholder_api_key(config["api_key"]): 174 | raise ValueError("Azure provider requires a valid 'api_key'. Please set AZURE_OPENAI_API_KEY in your .env file with your actual API key.") 175 | 176 | elif provider == "bedrock": 177 | if "api_key" not in config or not config["api_key"]: 178 | raise ValueError("Bedrock provider requires 'api_key' (AWS_ACCESS_KEY_ID) in configuration") 179 | if is_placeholder_api_key(config["api_key"]): 180 | raise ValueError("Bedrock provider requires a valid 'api_key' (AWS_ACCESS_KEY_ID). Please set AWS_ACCESS_KEY_ID in your .env file with your actual AWS access key.") 181 | if "aws_secret_access_key" not in config or not config.get("aws_secret_access_key"): 182 | raise ValueError("Bedrock provider requires 'aws_secret_access_key' (AWS_SECRET_ACCESS_KEY) in configuration") 183 | if is_placeholder_api_key(config.get("aws_secret_access_key")): 184 | raise ValueError("Bedrock provider requires a valid 'aws_secret_access_key' (AWS_SECRET_ACCESS_KEY). 
Please set AWS_SECRET_ACCESS_KEY in your .env file with your actual AWS secret key.") 185 | if "endpoint" not in config or not config["endpoint"]: 186 | raise ValueError("Bedrock provider requires 'endpoint' (AWS_REGION_NAME) in configuration") 187 | 188 | elif provider == "ollama": 189 | # Ollama uses placeholder api_key 190 | if "endpoint" not in config: 191 | raise ValueError("Ollama provider requires 'endpoint' (OLLAMA_BASE_URL) in configuration") 192 | 193 | else: 194 | # All other providers require api_key 195 | if "api_key" not in config or not config["api_key"]: 196 | raise ValueError(f"{provider} provider requires 'api_key' in configuration") 197 | if is_placeholder_api_key(config["api_key"]): 198 | # Get the environment variable name for this provider 199 | env_var_map = { 200 | "openai": "OPENAI_API_KEY", 201 | "anthropic": "ANTHROPIC_API_KEY", 202 | "gemini": "GOOGLE_API_KEY", 203 | "mistral": "MISTRAL_API_KEY", 204 | "codestral": "MISTRAL_API_KEY", 205 | "groq": "GROQ_API_KEY", 206 | "openrouter": "OPENROUTER_API_KEY", 207 | "huggingface": "HUGGINGFACE_API_KEY", 208 | "cohere": "COHERE_API_KEY", 209 | } 210 | env_var = env_var_map.get(provider, "API_KEY") 211 | raise ValueError( 212 | f"{provider} provider requires a valid 'api_key'. " 213 | f"Please set {env_var} in your .env file with your actual API key. " 214 | f"Current value appears to be a placeholder." 215 | ) 216 | 217 | return True 218 | 219 | 220 | def validate_llm_config() -> Tuple[bool, Optional[str]]: 221 | """ 222 | Validate LLM configuration. 
223 | 224 | Returns: 225 | Tuple of (is_valid, error_message) 226 | - is_valid: True if LLM config is valid 227 | - error_message: Error message if invalid, None if valid 228 | """ 229 | try: 230 | config = load_llm_config() 231 | validate_llm_config_dict(config) 232 | 233 | return True, None 234 | except ValueError as e: 235 | return False, str(e) 236 | except Exception as e: 237 | return False, f"Error loading LLM configuration: {str(e)}" 238 | 239 | 240 | def validate_logging_config() -> Tuple[bool, Optional[str]]: 241 | """ 242 | Validate logging configuration from environment variables. 243 | 244 | Returns: 245 | Tuple of (is_valid, error_message) 246 | - is_valid: True if logging config is valid 247 | - error_message: Error message if invalid, None if valid 248 | """ 249 | import logging 250 | 251 | # Validate LOG_LEVEL 252 | log_level = os.getenv("LOG_LEVEL", "INFO").upper() 253 | valid_levels = {"DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"} 254 | if log_level not in valid_levels: 255 | return False, ( 256 | f"Invalid LOG_LEVEL: '{log_level}'. " 257 | f"Must be one of: {', '.join(sorted(valid_levels))}" 258 | ) 259 | 260 | # Validate LOG_FORMAT 261 | log_format = os.getenv("LOG_FORMAT", "default").lower() 262 | valid_formats = {"default", "json"} 263 | if log_format not in valid_formats: 264 | return False, ( 265 | f"Invalid LOG_FORMAT: '{log_format}'. " 266 | f"Must be one of: {', '.join(sorted(valid_formats))}" 267 | ) 268 | 269 | # Validate LOG_VERBOSE_CONSOLE 270 | verbose_console = os.getenv("LOG_VERBOSE_CONSOLE", "false").lower() 271 | if verbose_console not in {"true", "false"}: 272 | return False, ( 273 | f"Invalid LOG_VERBOSE_CONSOLE: '{verbose_console}'. 
" 274 | f"Must be 'true' or 'false'" 275 | ) 276 | 277 | # Validate THIRD_PARTY_LOG_LEVEL 278 | third_party_level = os.getenv("THIRD_PARTY_LOG_LEVEL", "ERROR").upper() 279 | if third_party_level not in valid_levels: 280 | return False, ( 281 | f"Invalid THIRD_PARTY_LOG_LEVEL: '{third_party_level}'. " 282 | f"Must be one of: {', '.join(sorted(valid_levels))}" 283 | ) 284 | 285 | # Validate LOG_FILE (if set, check if path is valid format) 286 | log_file = os.getenv("LOG_FILE") 287 | if log_file: 288 | # Check if path contains invalid characters (basic validation) 289 | from pathlib import Path 290 | try: 291 | # Try to create a Path object to validate format 292 | log_path = Path(log_file) 293 | # Check if parent directory can be determined (basic path validation) 294 | if log_path.is_absolute() or log_path.parent != Path("."): 295 | # Path seems valid, but we don't check if directory exists (it will be created) 296 | pass 297 | except (ValueError, OSError) as e: 298 | return False, ( 299 | f"Invalid LOG_FILE path: '{log_file}'. " 300 | f"Error: {str(e)}" 301 | ) 302 | 303 | return True, None 304 | 305 | 306 | def validate_all_config() -> Tuple[bool, List[str]]: 307 | """ 308 | Validate all configuration (CodeQL, LLM, and Logging). 
309 | 310 | Returns: 311 | Tuple of (is_valid, error_messages) 312 | - is_valid: True if all config is valid 313 | - error_messages: List of error messages (empty if valid) 314 | """ 315 | errors: List[str] = [] 316 | 317 | # Validate CodeQL path 318 | codeql_valid, codeql_error = validate_codeql_path() 319 | if not codeql_valid: 320 | errors.append(f"❌ CodeQL Configuration Error:\n{codeql_error}") 321 | 322 | # Validate LLM config 323 | llm_valid, llm_error = validate_llm_config() 324 | if not llm_valid: 325 | errors.append(f"❌ LLM Configuration Error:\n{llm_error}") 326 | 327 | # Validate Logging config 328 | logging_valid, logging_error = validate_logging_config() 329 | if not logging_valid: 330 | errors.append(f"❌ Logging Configuration Error:\n{logging_error}") 331 | 332 | is_valid = len(errors) == 0 333 | return is_valid, errors 334 | 335 | 336 | def validate_and_exit_on_error() -> None: 337 | """ 338 | Validate all configuration and exit with error message if invalid. 339 | 340 | This is the main function to call at startup. 

    Raises:
        LLMConfigError: If LLM configuration is invalid
        CodeQLConfigError: If CodeQL configuration is invalid
        VulnhallaError: If Logging configuration is invalid
    """
    # Imported locally — presumably to avoid a circular import at module
    # load time; TODO confirm against src.utils.exceptions.
    from src.utils.exceptions import LLMConfigError, CodeQLConfigError, VulnhallaError

    is_valid, errors = validate_all_config()

    if not is_valid:
        # All collected errors are reported together as one message block.
        errors_block = "\n\n".join(errors)

        # Classify errors by the marker text embedded in each message.
        has_llm_error = any("LLM" in error for error in errors)
        has_codeql_error = any("CodeQL" in error for error in errors)
        has_logging_error = any("Logging" in error for error in errors)

        # Priority: LLM > CodeQL > Logging
        # NOTE(review): the last two branches both raise VulnhallaError, so the
        # `elif has_logging_error` test is redundant but harmless.
        if has_llm_error:
            raise LLMConfigError(errors_block)
        elif has_codeql_error:
            raise CodeQLConfigError(errors_block)
        elif has_logging_error:
            raise VulnhallaError(errors_block)
        else:
            raise VulnhallaError(errors_block)
--------------------------------------------------------------------------------
/src/ui/results_loader.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
"""
Results loader for parsing issue results from output/results/ directory.
"""

import os
import json
import re
import sys
from pathlib import Path
from typing import Dict, List, Optional, Tuple

# Add project root to path for imports
PROJECT_ROOT = Path(__file__).parent.parent.parent
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

from src.ui.models import Issue
from src.utils.logger import get_logger
from src.utils.exceptions import VulnhallaError

# Module-level logger, configured by src.utils.logger.
logger = get_logger(__name__)


class ResultsLoader:
    """
    Loads and parses issue results from output/results/ directory.
    """

    def __init__(self, results_root: str = "output/results"):
        """
        Initialize the ResultsLoader.
33 | 34 | Args: 35 | results_root (str): Root directory containing analysis results. 36 | Defaults to "output/results". 37 | """ 38 | self.results_root = Path(results_root) 39 | 40 | def extract_status(self, content: str) -> str: 41 | """ 42 | Extract status code from LLM content. 43 | 44 | Args: 45 | content (str): The LLM message content to analyze. 46 | 47 | Returns: 48 | str: Status code - "true" (if 1337 found), "false" (if 1007 found), 49 | or "more" (otherwise). 50 | """ 51 | if not content: 52 | return "more" 53 | content_lower = content.lower() 54 | if "1337" in content_lower: 55 | return "true" 56 | elif "1007" in content_lower: 57 | return "false" 58 | return "more" 59 | 60 | def parse_final_json(self, path: Path) -> Optional[List[Dict]]: 61 | """ 62 | Parse _final.json file containing LLM messages. 63 | 64 | Handles both valid JSON and malformed Python list representations. 65 | 66 | Args: 67 | path (Path): Path to the _final.json file. 68 | 69 | Returns: 70 | Optional[List[Dict]]: List of message dictionaries, or None if parsing fails. 
        """
        try:
            with open(path, 'r', encoding='utf-8') as f:
                content = f.read()
        except FileNotFoundError as e:
            logger.error("File not found: %s", path)
            return None
        except PermissionError as e:
            logger.error("Permission denied reading file: %s", path)
            return None
        except OSError as e:
            logger.error("OS error reading file: %s", path)
            return None

        try:
            return json.loads(content)
        except json.JSONDecodeError:
            # Parse manually.
            # The file presumably holds Python's str() of a list of dicts
            # (single-quoted keys) rather than json.dumps output — TODO confirm
            # against the writer in the pipeline.
            messages = []
            for match in re.finditer(r"\{'role':", content):
                start = match.start()
                # Find the matching closing brace, tracking quote state so
                # braces inside string values are ignored.
                brace_count = 0
                end = start
                in_single_quote = False
                in_double_quote = False
                escape_next = False
                for i in range(start, len(content)):
                    char = content[i]
                    if escape_next:
                        escape_next = False
                        continue
                    if char == '\\':
                        escape_next = True
                        continue
                    # NOTE(review): `not escape_next` below is always true here,
                    # since escaped characters were consumed above.
                    if char == "'" and not escape_next and not in_double_quote:
                        in_single_quote = not in_single_quote
                        continue
                    if char == '"' and not escape_next and not in_single_quote:
                        in_double_quote = not in_double_quote
                        continue
                    if not in_single_quote and not in_double_quote:
                        if char == '{':
                            brace_count += 1
                        elif char == '}':
                            brace_count -= 1
                            if brace_count == 0:
                                end = i + 1
                                break

                dict_str = content[start:end]

                role_match = re.search(r"'role':\s*['\"]([^'\"]+)['\"]", dict_str)
                # Extract content field
                content_match = None

                # Determine which quote type used for content
                content_key_pos = dict_str.find("'content':")
                if content_key_pos >= 0:
                    # Find the quote character after 'content':
                    quote_start = content_key_pos + len("'content':")
                    # Skip whitespace
                    while quote_start < len(dict_str) and dict_str[quote_start] in ' \t\n':
                        quote_start += 1
                    if quote_start < len(dict_str):
                        quote_char = dict_str[quote_start]
                        if quote_char == '"':
                            content_pattern = r"'content':\s*\"((?:[^\"\\]|\\.)*)\""
                            content_match = re.search(content_pattern, dict_str, re.DOTALL)
                        elif quote_char == "'":
                            content_pattern = r"'content':\s*'((?:[^'\\]|\\.|'')*)'"
                            content_match = re.search(content_pattern, dict_str, re.DOTALL)

                # Fallback: try both quote styles if the targeted match failed.
                if not content_match:
                    content_pattern = r"'content':\s*'((?:[^'\\]|\\.|'')*)'"
                    content_match = re.search(content_pattern, dict_str, re.DOTALL)
                    if not content_match:
                        content_pattern = r"'content':\s*\"((?:[^\"\\]|\\.)*)\""
                        content_match = re.search(content_pattern, dict_str, re.DOTALL)

                if role_match and content_match:
                    content_str = content_match.group(1)
                    # Undo the repr-style escaping of the captured value.
                    content_str = content_str.replace('\\n', '\n').replace("\\'", "'").replace('\\"', '"').replace('\\\\', '\\')
                    messages.append({
                        'role': role_match.group(1),
                        'content': content_str
                    })
            return messages if messages else None
        except Exception:
            # NOTE(review): this handler belongs to the outer try, so it only
            # covers non-JSONDecodeError failures of json.loads — exceptions
            # raised inside the manual fallback above would propagate.
            return None

    def parse_raw_json(self, path: Path) -> Optional[Dict]:
        """
        Parse _raw.json file containing original CodeQL issue data.

        Args:
            path (Path): Path to the _raw.json file.

        Returns:
            Optional[Dict]: Parsed JSON data as a dictionary, or None if parsing fails.
        """
        try:
            with open(path, "r", encoding="utf-8") as f:
                # Raw files may contain literal newlines inside JSON string
                # values; escaping every newline repairs that (assumes the
                # JSON is otherwise written on a single line — TODO confirm
                # against the writer in the pipeline).
                return json.loads(f.read().replace("\n", "\\n"))
        except FileNotFoundError as e:
            logger.error("File not found: %s", path)
            return None
        except PermissionError as e:
            logger.error("Permission denied reading file: %s", path)
            return None
        except json.JSONDecodeError as e:
            logger.error("JSON error parsing %s: %s", path, e)
            return None
        except OSError as e:
            logger.error("OS error reading file: %s", path)
            return None

    @staticmethod
    def _extract_issue_name(raw_data: Dict, issue_type: str) -> str:
        """
        Extract issue name from raw_data.

        Args:
            raw_data (Dict): Raw JSON data containing issue information.
            issue_type (str): Fallback issue type if name cannot be extracted.

        Returns:
            str: Issue name extracted from prompt or function name, or issue_type as fallback.
        """
        # Default: the analyzed function's name, then the issue type.
        issue_name = raw_data.get("current_function", {}).get("function_name", issue_type)
        # A "Name: ..." line embedded in the prompt, when present, wins.
        if "prompt" in raw_data:
            name_match = re.search(r'Name:\s*([^\n]+)', raw_data["prompt"])
            if name_match:
                return name_match.group(1).strip()
        return issue_name

    @staticmethod
    def _extract_file_info(raw_data: Dict) -> tuple[str, int]:
        """
        Extract file basename and line number from raw_data.

        Args:
            raw_data (Dict): Raw JSON data containing function information.

        Returns:
            tuple[str, int]: Tuple of (file_basename, line_number).
                Falls back to ("unknown", 0) when fields are absent.
        """
        func = raw_data.get("current_function", {})
        file_path = func.get("file", "")
        return (os.path.basename(file_path) if file_path else "unknown", int(func.get("start_line", 0)))

    @staticmethod
    def _extract_repo_from_db_path(db_path: str) -> str:
        """
        Extract repository name (org/repo) from database path.

        Database path structure: output/databases/<lang>/<org>/<repo>
        We extract the repo name from the basename of db_path, and the org name from
        the parent directory.

        Args:
            db_path (str): The database path from raw_data (e.g., "output/databases/c/redis/cpp")

        Returns:
            str: Repository name in format "org/repo" (e.g., "redis/cpp")
        """
        if not db_path:
            return "unknown/unknown"

        try:
            # DB path structure: output/databases/<lang>/<org>/<repo>
            # Example: output/databases/c/redis/cpp
            repo_name = os.path.basename(db_path)
            parent_dir = os.path.dirname(db_path)
            org_name = os.path.basename(parent_dir)

            if org_name and repo_name:
                return f"{org_name}/{repo_name}"
            else:
                return "unknown/unknown"
        except Exception:
            # Malformed paths degrade to the sentinel rather than failing the load.
            return "unknown/unknown"

    def load_all_issues(self, lang: str) -> Tuple[List[Issue], List[str]]:
        """
        Scan output/results/<lang>/<issue_type>/ and load all issues.

        Args:
            lang (str): Language code to scan (e.g., "c").

        Returns:
            Tuple[List[Issue], List[str]]:
                - List of Issue objects loaded from all issue type directories.
                - List of error messages for files that failed to load.
        """
        issues = []
        errors = []
        lang_dir = self.results_root / lang

        # No results for this language yet — return empty, not an error.
        if not lang_dir.exists():
            return issues, errors

        # Scan each issue_type directory
        for issue_type_dir in lang_dir.iterdir():
            if not issue_type_dir.is_dir():
                continue

            issue_type = issue_type_dir.name

            # Find all _final.json files
            for final_file in issue_type_dir.glob("*_final.json"):
                # Extract issue ID from filename
                issue_id = final_file.stem.replace("_final", "")

                # Find corresponding _raw.json
                raw_file = final_file.parent / f"{issue_id}_raw.json"

                if not raw_file.exists():
                    errors.append(f"Missing raw file for issue {issue_id}: {raw_file}")
                    continue

                # Parse JSON files
                final_data = self.parse_final_json(final_file)
                raw_data = self.parse_raw_json(raw_file)

                if not final_data:
                    errors.append(f"Failed to parse final JSON: {final_file}")
                    continue

                if not raw_data:
                    errors.append(f"Failed to parse raw JSON: {raw_file}")
                    continue

                file_basename, start_line = self._extract_file_info(raw_data)
                issue_name = self._extract_issue_name(raw_data, issue_type)

                # Extract repo from db_path in raw_data
                db_path = raw_data.get("db_path", "")
                repo = self._extract_repo_from_db_path(db_path) if db_path else "unknown/unknown"

                # Extract status from final_data
                status = "more"
                # Try to find status in assistant messages, newest first.
                for msg in reversed(final_data):
                    if isinstance(msg, dict) and msg.get("role", "").lower() == "assistant":
                        content = msg.get("content", "")
                        if content:
                            status = self.extract_status(content)
                            if status != "more":
                                break
                # No status found in assistant messages, check all messages
                if status == "more":
                    for msg in reversed(final_data):
                        if isinstance(msg, dict) and "content" in
msg: 325 | status = self.extract_status(msg.get("content", "")) 326 | if status != "more": 327 | break 328 | 329 | issue = Issue( 330 | id=issue_id, 331 | name=issue_name, 332 | file=file_basename, 333 | line=start_line, 334 | status=status, 335 | issue_type=issue_type, 336 | lang=lang, 337 | repo=repo, 338 | raw_path=str(raw_file), 339 | final_path=str(final_file), 340 | raw_data=raw_data, 341 | final_data=final_data 342 | ) 343 | issues.append(issue) 344 | 345 | return issues, errors 346 | 347 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Vulnhalla 2 | # Automated CodeQL Analysis with LLM Classification 3 | 4 |
5 | Vulnhalla 6 |
7 | 8 | For a detailed overview of the research and motivation behind Vulnhalla, see the official CyberArk Threat Research blog post: 9 | 10 | **["Vulnhalla: Picking the True Vulnerabilities from the CodeQL Haystack"](https://www.cyberark.com/resources/threat-research-blog/vulnhalla-picking-the-true-vulnerabilities-from-the-codeql-haystack)** 11 | 12 | ### Vulnhalla automates the complete security analysis pipeline: 13 | 14 | 1. **Fetching repositories** of a given programming language from GitHub 15 | 2. **Downloading** their corresponding [CodeQL](https://github.com/github/codeql) databases (if available) 16 | 3. **Running CodeQL queries** on those databases to detect security or code-quality issues 17 | 4. **Post-processing** the results with an LLM (ChatGPT, Gemini, etc.) to classify and filter issues 18 | 19 | --- 20 | 21 | ## 🚀 Quick Start 22 | 23 | ### Step 1: Prerequisites 24 | 25 | Before starting, ensure you have: 26 | 27 | - **Python 3.10 – 3.13** (Python 3.11 or 3.12 recommended) 28 | - Python 3.14+ is not supported (this tool uses grpcio which is not supported by Python 3.14+) 29 | - Download from [python.org](https://www.python.org/downloads/) 30 | 31 | - **CodeQL CLI** 32 | - Download from [CodeQL CLI releases](https://github.com/github/codeql-cli-binaries/releases) 33 | - Make sure `codeql` is in your PATH, or you'll set the path in `.env` (see Step 2) 34 | 35 | - **(Optional) GitHub API token** 36 | - For higher rate limits when downloading databases 37 | - Get from [GitHub Settings > Tokens](https://github.com/settings/tokens) 38 | 39 | - **LLM API key** 40 | - OpenAI, Azure, or Gemini API key (depending on your provider) 41 | 42 | ### Step 2: Configure Environment 43 | 44 | All configuration is in a single file: `.env` 45 | 46 | 1. **Clone the repository:** 47 | ```bash 48 | git clone https://github.com/cyberark/Vulnhalla 49 | cd Vulnhalla 50 | ``` 51 | 52 | 2. 
**Copy `.env.example` to `.env`:** 53 | ```bash 54 | cp .env.example .env 55 | ``` 56 | 57 | 3. **Edit `.env` and fill in your values:** 58 | 59 | **Example for OpenAI:** 60 | ```env 61 | CODEQL_PATH=codeql 62 | GITHUB_TOKEN=ghp_your_token_here 63 | PROVIDER=openai 64 | MODEL=gpt-4o 65 | OPENAI_API_KEY=your-api-key-here 66 | LLM_TEMPERATURE=0.2 67 | LLM_TOP_P=0.2 68 | 69 | # Optional: Logging Configuration 70 | LOG_LEVEL=INFO # DEBUG, INFO, WARNING, ERROR 71 | LOG_FILE= # Optional: path to log file (e.g., logs/vulnhalla.log) 72 | LOG_FORMAT=default # default or json 73 | # LOG_VERBOSE_CONSOLE=false # If true, WARNING/ERROR use full format (timestamp - logger - level - message) 74 | ``` 75 | 76 | > **📖 For complete configuration reference:** See [Configuration Reference](#-configuration-reference) below for all supported providers (OpenAI, Azure, Gemini), required/optional variables, and detailed examples. 77 | 78 | **Optional:** Create a virtual environment: 79 | 80 | ```bash 81 | # (Optional) Create virtual environment 82 | python3 -m venv venv 83 | venv\Scripts\activate # On Windows 84 | # On MacOS/Linux: source venv/bin/activate 85 | ``` 86 | 87 | ### Step 3: setup 88 | 89 | **Option 1: Automated Setup (Recommended)** 90 | 91 | ```bash 92 | python setup.py 93 | ``` 94 | 95 | **Note:** Virtual environment is optional. If `venv/` exists, setup will use it. Otherwise, it installs to your current Python environment. 96 | 97 | The setup script will: 98 | - Install Python dependencies from `requirements.txt` 99 | - Initialize CodeQL packs 100 | 101 | **Option 2: Manual Setup** 102 | 103 | If you prefer to install manually: 104 | 105 | ### Install dependencies 106 | ```bash 107 | pip install -r requirements.txt 108 | ``` 109 | 110 | ### Initialize CodeQL packs 111 | ```bash 112 | cd data/queries/cpp/tools 113 | codeql pack install 114 | cd ../issues 115 | codeql pack install 116 | cd ../../../.. 
117 | ``` 118 | 119 | ### Step 4: Run the Pipeline 120 | 121 | **Option 1: Using the Unified Pipeline** 122 | 123 | Run the complete pipeline with a single command: 124 | 125 | ```bash 126 | # Analyze a specific repository 127 | python src/pipeline.py redis/redis 128 | 129 | # Analyze top 100 repositories 130 | python src/pipeline.py 131 | ``` 132 | 133 | This will automatically: 134 | 1. Fetch CodeQL databases 135 | 2. Run CodeQL queries on all downloaded databases 136 | 3. Analyze results with LLM and save to `output/results/` 137 | 4. Open the UI to browse results 138 | 139 | **Option 2: Using the Example Script** 140 | 141 | Run the end-to-end example: 142 | 143 | ```bash 144 | python examples/example.py 145 | ``` 146 | 147 | This will: 148 | 1. Fetch CodeQL databases for `videolan/vlc` and `redis/redis` 149 | 2. Run CodeQL queries on all downloaded databases 150 | 3. Analyze results with LLM and save to `output/results/` 151 | 152 | --- 153 | 154 | ## 🖥️ User Interface (UI) 155 | 156 | Vulnhalla includes a full-featured User Interface for browsing and exploring analysis results. 157 | 158 | ### Running the UI 159 | 160 | ```bash 161 | python src/ui/ui_app.py 162 | # or 163 | python examples/ui_example.py 164 | ``` 165 | 166 | ### UI Layout 167 | 168 | The UI displays a two-panel top area with a controls bar at the bottom: 169 | 170 | **Top Area (side-by-side, resizable):** 171 | 172 | - **Left Panel (Issues List):** 173 | - DataTable showing: **ID**, **Repo**, **Issue Name**, **File**, **LLM decision**, **Manual decision** 174 | - Issues count and sort indicator 175 | - Search input box at the bottom, updates as you type (case-insensitive). 
176 | 177 | - **Right Panel (Details):** 178 | - **LLM decision Section**: Shows the LLM's classification (True Positive, False Positive, or Needs More Data) 179 | - **Metadata Section**: Issue name, Repo, File, Line, Type, Function name 180 | - **Code Section**: 181 | - 📌 Initial Code Context (first code snippet the LLM saw) 182 | - 📥 Additional Code (code that the LLM requested during the conversation) - only shown if additional code exists 183 | - Vulnerable line highlighted in red 184 | - **Summary Section**: LLM final answer/decision 185 | - **Manual Decision Select**: Dropdown at the bottom to set manual verdict (True Positive, False Positive, Uncertain, or Not Set) 186 | 187 | **Bottom Controls Bar:** 188 | 189 | - Language: C (only language currently supported) 190 | - Filter by LLM decision dropdown: All, True Positive, False Positive, Needs more Info to decide 191 | - Action buttons: Refresh, Run Analysis 192 | - Key bindings help text 193 | 194 | ### Key Bindings 195 | 196 | - `↑`/`↓` - Navigate issue list (row-by-row) 197 | - `Tab` / `Shift+Tab` - Switch focus between panels 198 | - `Enter` - Show details for selected issue 199 | - `/` - Focus search input box (in left panel) 200 | - `Esc` - Clear search and return focus to issues table 201 | - `r` - Reload results from disk 202 | - `[` / `]` - Resize left/right panels (adjust split position) 203 | - `q` - Quit application 204 | 205 | ### Interactive Features 206 | 207 | #### Column Sorting 208 | 209 | - **Click any column header** to sort by that column 210 | - Default sorting: by Repo (ascending), then by ID (ascending) 211 | 212 | #### Resizable Panels 213 | 214 | - **Draggable divider** between Issues List and Details panels 215 | - **Mouse**: Click and drag the divider to resize 216 | - **Keyboard**: Use `[` to move divider left, `]` to move divider right 217 | - Split position is remembered during the session 218 | 219 | --- 220 | 221 | ## 📊 Output Structure 222 | 223 | After running the pipeline, 
results are organized in `output/results///`: 224 | 225 | ``` 226 | output/results/c/Copy_function_using_source_size/ 227 | ├── 1_raw.json # Original CodeQL issue data 228 | ├── 1_final.json # LLM conversation and classification 229 | ├── 2_raw.json 230 | ├── 2_final.json 231 | └── ... 232 | ``` 233 | 234 | Each `*_final.json` contains: 235 | - Full LLM conversation (system prompts, user messages, assistant responses, tool calls) 236 | - Final status code (1337 = vulnerable, 1007 = secure, 7331/3713 = needs more info) 237 | 238 | Each `*_raw.json` contains: 239 | - Original CodeQL issue data 240 | - Function context 241 | - Database path (includes org/repo information: `output/databases///`) 242 | - Issue location 243 | 244 | --- 245 | 246 | ## 🛠 Troubleshooting 247 | 248 | - **CodeQL CLI not found**: 249 | Set `CODEQL_PATH` in your `.env` file to the full path of your CodeQL executable. 250 | **On Windows**: The path must end with `.cmd` (e.g., `C:\path\to\codeql\codeql.cmd`). 251 | 252 | - **GitHub rate limits**: 253 | Set `GITHUB_TOKEN` in your `.env` file (get token from https://github.com/settings/tokens). 254 | 255 | - **LLM issues**: 256 | Check your API keys in `.env` file match your selected provider. 257 | 258 | - **Import errors in UI**: 259 | Make sure you're running from the project root directory, or use `python examples/ui_example.py` which handles path setup. 260 | 261 | --- 262 | 263 | ## ⚙️ Configuration Reference 264 | 265 | ### Environment Variables 266 | 267 | All configuration is managed through environment variables in your `.env` file. Here's a complete reference: 268 | 269 | #### Required Variables 270 | 271 | | Variable | Required For | Description | 272 | |----------|--------------|-------------| 273 | | `CODEQL_PATH` | All | Path to CodeQL executable. Defaults to `codeql` if CodeQL is in PATH. 
Use full path if not in PATH (e.g., `C:\path\to\codeql\codeql.cmd` on Windows) | 274 | | `PROVIDER` | All | LLM provider: `openai`, `azure`, or `gemini` | 275 | | `MODEL` | All | Model name (e.g., `gpt-4o`, `gpt-4-turbo`, `gemini-2.5-flash`) | 276 | 277 | #### Provider-Specific Required Variables 278 | 279 | **OpenAI:** 280 | | Variable | Description | 281 | |----------|-------------| 282 | | `OPENAI_API_KEY` | Your OpenAI API key from [platform.openai.com](https://platform.openai.com/api-keys) | 283 | 284 | **Azure OpenAI:** 285 | | Variable | Description | 286 | |----------|-------------| 287 | | `AZURE_OPENAI_API_KEY` or `AZURE_API_KEY` | Your Azure OpenAI API key | 288 | | `AZURE_OPENAI_ENDPOINT` or `AZURE_API_BASE` | Your Azure OpenAI endpoint URL (e.g., `https://your-resource.openai.azure.com`) | 289 | | `AZURE_OPENAI_API_VERSION` or `AZURE_API_VERSION` | API version (default: `2024-08-01-preview`) | 290 | 291 | **Gemini (Google):** 292 | | Variable | Description | 293 | |----------|-------------| 294 | | `GOOGLE_API_KEY` | Your Google API key from [Google AI Studio](https://makersuite.google.com/app/apikey) | 295 | 296 | #### Optional Variables 297 | 298 | | Variable | Default | Description | 299 | |----------|---------|-------------| 300 | | `GITHUB_TOKEN` | - | GitHub API token for higher rate limits. Get from [GitHub Settings > Tokens](https://github.com/settings/tokens) | 301 | | `LLM_TEMPERATURE` | `0.2` | LLM temperature (0.0-2.0). Lower = more deterministic. **Recommended: keep at 0.2** | 302 | | `LLM_TOP_P` | `0.2` | LLM top-p sampling (0.0-1.0). Lower = more focused. **Recommended: keep at 0.2** | 303 | | `LOG_LEVEL` | `INFO` | Logging level: `DEBUG`, `INFO`, `WARNING`, or `ERROR`. Controls verbosity of console output | 304 | | `LOG_FILE` | - | Optional path to log file (e.g., `logs/vulnhalla.log`). If set, logs are written to both console and file. 
File logging uses DEBUG level for detailed output | 305 | | `LOG_FORMAT` | `default` | Log format style: `default` (human-readable), or `json` (structured JSON format) | 306 | | `LOG_VERBOSE_CONSOLE` | `false` | If `true`, WARNING/ERROR/CRITICAL use full format (timestamp - logger - level - message). Default: WARNING/ERROR use simple format (LEVEL - message), INFO always minimal (message only) | 307 | | `THIRD_PARTY_LOG_LEVEL` | `ERROR` | Log level for third-party libraries (LiteLLM, urllib3, requests). Options: `DEBUG`, `INFO`, `WARNING`, `ERROR`. Default suppresses most third-party noise | 308 | 309 | > **⚠️ Important:** Do not increase `LLM_TEMPERATURE` or `LLM_TOP_P` unless you fully understand the impact. Lower values keep the model stable and deterministic, which is critical for security analysis. Higher values may cause the model to become inconsistent, creative, or hallucinate results. 310 | 311 | > **📝 Note:** For additional configuration examples, see the `.env.example` file in the project root. 312 | 313 | ### Configuration Validation 314 | 315 | Vulnhalla validates your configuration at startup. If required variables are missing or invalid, you'll see clear error messages indicating what needs to be fixed. 
316 | 317 | **Common validation errors:** 318 | - Missing API key for selected provider 319 | - Invalid provider name (must be `openai`, `azure`, or `gemini`) 320 | - Missing Azure endpoint (required for Azure provider) 321 | - Invalid CodeQL path (if `CODEQL_PATH` is set but file doesn't exist) 322 | 323 | --- 324 | 325 | ## 📝 Status Codes 326 | 327 | The LLM uses the following status codes: 328 | 329 | - **1337**: Security vulnerability found (True Positive) 330 | - **1007**: Code is secure, no vulnerability (False Positive) 331 | - **7331**: More code/information needed to validate security 332 | - **3713**: Likely not a security problem, but more info needed (used with 7331) 333 | 334 | The UI maps these to: 335 | - `1337` → "True Positive" 336 | - `1007` → "False Positive" 337 | - `7331` or `3713` → "Needs More Data" 338 | 339 | --- 340 | 341 | ## 🔧 Development 342 | 343 | ### Running Tests 344 | 345 | The project includes basic test infrastructure using pytest: 346 | 347 | ```bash 348 | # Run all tests 349 | pytest 350 | 351 | # Run with verbose output 352 | pytest -v 353 | ``` 354 | 355 | The test suite includes smoke tests to verify the test infrastructure is set up correctly. 356 | 357 | ### Project Dependencies 358 | 359 | See `requirements.txt` for Python dependencies: 360 | - `requests` - HTTP requests for GitHub API 361 | - `pySmartDL` - Smart download manager for CodeQL databases 362 | - `litellm` - Unified LLM interface supporting multiple providers 363 | - `python-dotenv` - Environment variable management 364 | - `PyYAML` - YAML parsing for CodeQL pack files 365 | - `textual` - Terminal UI framework 366 | - `pytest` - Testing framework 367 | 368 | ### CodeQL Queries 369 | 370 | CodeQL queries are organized in `data/queries//`: 371 | - `issues/` - Security issue detection queries 372 | - `tools/` - Helper queries (function trees, classes, global variables, macros) 373 | 374 | Each directory contains a `qlpack.yml` file defining the CodeQL pack. 
375 | 376 | --- 377 | 378 | ## 📄 License 379 | 380 | Copyright (c) 2025 CyberArk Software Ltd. All rights reserved. 381 | 382 | This repository is licensed under the Apache License, Version 2.0 - see [LICENSE.txt](LICENSE.txt) for more details. 383 | 384 | --- 385 | 386 | ## 🤝 Contributing 387 | 388 | 389 | We welcome contributions of all kinds to this repository. For instructions on how to get started and descriptions of our development workflows, please see our [contributing guide](https://github.com/cyberark/Vulnhalla/blob/main/CONTRIBUTING.md). 390 | 391 | --- 392 | ### Code of Conduct 393 | 394 | Please read and follow our [Code of Conduct](CODE_OF_CONDUCT.md). We are committed to providing a welcoming and inclusive environment for all contributors. 395 | 396 | --- 397 | 398 | ## 📧 Contact 399 | 400 | Feel free to contact us via GitHub issues if you have any feature requests or project issues. 401 | -------------------------------------------------------------------------------- /NOTICES.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2025 CyberArk Software Ltd. 
All rights reserved 2 | 3 | Vulnhalla is using the following open source components: 4 | 5 | 1) Requests (https://github.com/psf/requests/blob/main/LICENSE) : Apache-2.0 License 6 | Copyright 2019 Kenneth Reitz 7 | 8 | 2) python-dotenv (https://github.com/theskumar/python-dotenv/blob/main/LICENSE) : BSD-3-Clause License 9 | Copyright (c) 2014, Saurabh Kumar (python-dotenv), 2013, Ted Tieken (django-dotenv-rw), 2013, Jacob Kaplan-Moss (django-dotenv) 10 | 11 | 3) litellm (https://github.com/BerriAI/litellm/blob/main/LICENSE) : MIT License 12 | Copyright (c) 2023 Berri AI 13 | 14 | 4) pyYaml (https://pyyaml.org/) : MIT License 15 | Copyright (c) 2017-2021 Ingy döt Net 16 | Copyright (c) 2006-2016 Kirill Simonov 17 | 18 | 5) textual (https://github.com/Textualize/textual/blob/main/LICENSE) : MIT License 19 | Copyright (c) 2021 Will McGugan 20 | 21 | 6) pySmartDL (http://pypi.python.org/pypi/pySmartDL/) : Public Domain 22 | 23 | ===================================================================== 24 | 1) 25 | 26 | Apache License 27 | Version 2.0, January 2004 28 | http://www.apache.org/licenses/ 29 | 30 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 31 | 32 | 1. Definitions. 33 | 34 | "License" shall mean the terms and conditions for use, reproduction, 35 | and distribution as defined by Sections 1 through 9 of this document. 36 | 37 | "Licensor" shall mean the copyright owner or entity authorized by 38 | the copyright owner that is granting the License. 39 | 40 | "Legal Entity" shall mean the union of the acting entity and all 41 | other entities that control, are controlled by, or are under common 42 | control with that entity. For the purposes of this definition, 43 | "control" means (i) the power, direct or indirect, to cause the 44 | direction or management of such entity, whether by contract or 45 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 46 | outstanding shares, or (iii) beneficial ownership of such entity. 
47 | 48 | "You" (or "Your") shall mean an individual or Legal Entity 49 | exercising permissions granted by this License. 50 | 51 | "Source" form shall mean the preferred form for making modifications, 52 | including but not limited to software source code, documentation 53 | source, and configuration files. 54 | 55 | "Object" form shall mean any form resulting from mechanical 56 | transformation or translation of a Source form, including but 57 | not limited to compiled object code, generated documentation, 58 | and conversions to other media types. 59 | 60 | "Work" shall mean the work of authorship, whether in Source or 61 | Object form, made available under the License, as indicated by a 62 | copyright notice that is included in or attached to the work 63 | (an example is provided in the Appendix below). 64 | 65 | "Derivative Works" shall mean any work, whether in Source or Object 66 | form, that is based on (or derived from) the Work and for which the 67 | editorial revisions, annotations, elaborations, or other modifications 68 | represent, as a whole, an original work of authorship. For the purposes 69 | of this License, Derivative Works shall not include works that remain 70 | separable from, or merely link (or bind by name) to the interfaces of, 71 | the Work and Derivative Works thereof. 72 | 73 | "Contribution" shall mean any work of authorship, including 74 | the original version of the Work and any modifications or additions 75 | to that Work or Derivative Works thereof, that is intentionally 76 | submitted to Licensor for inclusion in the Work by the copyright owner 77 | or by an individual or Legal Entity authorized to submit on behalf of 78 | the copyright owner. 
For the purposes of this definition, "submitted" 79 | means any form of electronic, verbal, or written communication sent 80 | to the Licensor or its representatives, including but not limited to 81 | communication on electronic mailing lists, source code control systems, 82 | and issue tracking systems that are managed by, or on behalf of, the 83 | Licensor for the purpose of discussing and improving the Work, but 84 | excluding communication that is conspicuously marked or otherwise 85 | designated in writing by the copyright owner as "Not a Contribution." 86 | 87 | "Contributor" shall mean Licensor and any individual or Legal Entity 88 | on behalf of whom a Contribution has been received by Licensor and 89 | subsequently incorporated within the Work. 90 | 91 | 2. Grant of Copyright License. Subject to the terms and conditions of 92 | this License, each Contributor hereby grants to You a perpetual, 93 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 94 | copyright license to reproduce, prepare Derivative Works of, 95 | publicly display, publicly perform, sublicense, and distribute the 96 | Work and such Derivative Works in Source or Object form. 97 | 98 | 3. Grant of Patent License. Subject to the terms and conditions of 99 | this License, each Contributor hereby grants to You a perpetual, 100 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 101 | (except as stated in this section) patent license to make, have made, 102 | use, offer to sell, sell, import, and otherwise transfer the Work, 103 | where such license applies only to those patent claims licensable 104 | by such Contributor that are necessarily infringed by their 105 | Contribution(s) alone or by combination of their Contribution(s) 106 | with the Work to which such Contribution(s) was submitted. 
If You 107 | institute patent litigation against any entity (including a 108 | cross-claim or counterclaim in a lawsuit) alleging that the Work 109 | or a Contribution incorporated within the Work constitutes direct 110 | or contributory patent infringement, then any patent licenses 111 | granted to You under this License for that Work shall terminate 112 | as of the date such litigation is filed. 113 | 114 | 4. Redistribution. You may reproduce and distribute copies of the 115 | Work or Derivative Works thereof in any medium, with or without 116 | modifications, and in Source or Object form, provided that You 117 | meet the following conditions: 118 | 119 | (a) You must give any other recipients of the Work or 120 | Derivative Works a copy of this License; and 121 | 122 | (b) You must cause any modified files to carry prominent notices 123 | stating that You changed the files; and 124 | 125 | (c) You must retain, in the Source form of any Derivative Works 126 | that You distribute, all copyright, patent, trademark, and 127 | attribution notices from the Source form of the Work, 128 | excluding those notices that do not pertain to any part of 129 | the Derivative Works; and 130 | 131 | (d) If the Work includes a "NOTICE" text file as part of its 132 | distribution, then any Derivative Works that You distribute must 133 | include a readable copy of the attribution notices contained 134 | within such NOTICE file, excluding those notices that do not 135 | pertain to any part of the Derivative Works, in at least one 136 | of the following places: within a NOTICE text file distributed 137 | as part of the Derivative Works; within the Source form or 138 | documentation, if provided along with the Derivative Works; or, 139 | within a display generated by the Derivative Works, if and 140 | wherever such third-party notices normally appear. The contents 141 | of the NOTICE file are for informational purposes only and 142 | do not modify the License. 
You may add Your own attribution 143 | notices within Derivative Works that You distribute, alongside 144 | or as an addendum to the NOTICE text from the Work, provided 145 | that such additional attribution notices cannot be construed 146 | as modifying the License. 147 | 148 | You may add Your own copyright statement to Your modifications and 149 | may provide additional or different license terms and conditions 150 | for use, reproduction, or distribution of Your modifications, or 151 | for any such Derivative Works as a whole, provided Your use, 152 | reproduction, and distribution of the Work otherwise complies with 153 | the conditions stated in this License. 154 | 155 | 5. Submission of Contributions. Unless You explicitly state otherwise, 156 | any Contribution intentionally submitted for inclusion in the Work 157 | by You to the Licensor shall be under the terms and conditions of 158 | this License, without any additional terms or conditions. 159 | Notwithstanding the above, nothing herein shall supersede or modify 160 | the terms of any separate license agreement you may have executed 161 | with Licensor regarding such Contributions. 162 | 163 | 6. Trademarks. This License does not grant permission to use the trade 164 | names, trademarks, service marks, or product names of the Licensor, 165 | except as required for reasonable and customary use in describing the 166 | origin of the Work and reproducing the content of the NOTICE file. 167 | 168 | 7. Disclaimer of Warranty. Unless required by applicable law or 169 | agreed to in writing, Licensor provides the Work (and each 170 | Contributor provides its Contributions) on an "AS IS" BASIS, 171 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 172 | implied, including, without limitation, any warranties or conditions 173 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 174 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 175 | appropriateness of using or redistributing the Work and assume any 176 | risks associated with Your exercise of permissions under this License. 177 | 178 | 8. Limitation of Liability. In no event and under no legal theory, 179 | whether in tort (including negligence), contract, or otherwise, 180 | unless required by applicable law (such as deliberate and grossly 181 | negligent acts) or agreed to in writing, shall any Contributor be 182 | liable to You for damages, including any direct, indirect, special, 183 | incidental, or consequential damages of any character arising as a 184 | result of this License or out of the use or inability to use the 185 | Work (including but not limited to damages for loss of goodwill, 186 | work stoppage, computer failure or malfunction, or any and all 187 | other commercial damages or losses), even if such Contributor 188 | has been advised of the possibility of such damages. 189 | 190 | 9. Accepting Warranty or Additional Liability. While redistributing 191 | the Work or Derivative Works thereof, You may choose to offer, 192 | and charge a fee for, acceptance of support, warranty, indemnity, 193 | or other liability obligations and/or rights consistent with this 194 | License. However, in accepting such obligations, You may act only 195 | on Your own behalf and on Your sole responsibility, not on behalf 196 | of any other Contributor, and only if You agree to indemnify, 197 | defend, and hold each Contributor harmless for any liability 198 | incurred by, or claims asserted against, such Contributor by reason 199 | of your accepting any such warranty or additional liability. 
200 | 201 | ===================================================================== 202 | 2) BSD-3-Clause License 203 | 204 | Copyright (c) 2014, Saurabh Kumar (python-dotenv), 2013, Ted Tieken (django-dotenv-rw), 2013, Jacob Kaplan-Moss (django-dotenv) 205 | Redistribution and use in source and binary forms, with or without modification, 206 | are permitted provided that the following conditions are met: 207 | - Redistributions of source code must retain the above copyright notice, 208 | this list of conditions and the following disclaimer. 209 | - Redistributions in binary form must reproduce the above copyright notice, 210 | this list of conditions and the following disclaimer in the documentation 211 | and/or other materials provided with the distribution. 212 | - Neither the name of django-dotenv nor the names of its contributors 213 | may be used to endorse or promote products derived from this software 214 | without specific prior written permission. 215 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 216 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 217 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 218 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 219 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 220 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 221 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 222 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 223 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 224 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 225 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
226 | 227 | ===================================================================== 228 | 3) MIT License 229 | 230 | Copyright (c) 2023 Berri AI 231 | 232 | Permission is hereby granted, free of charge, to any person obtaining a copy 233 | of this software and associated documentation files (the "Software"), to deal 234 | in the Software without restriction, including without limitation the rights 235 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 236 | copies of the Software, and to permit persons to whom the Software is 237 | furnished to do so, subject to the following conditions: 238 | 239 | The above copyright notice and this permission notice shall be included in all 240 | copies or substantial portions of the Software. 241 | 242 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 243 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 244 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 245 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 246 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 247 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 248 | SOFTWARE. 
249 | 250 | ===================================================================== 251 | 4) MIT License 252 | 253 | Copyright (c) 2017-2021 Ingy döt Net 254 | Copyright (c) 2006-2016 Kirill Simonov 255 | 256 | Permission is hereby granted, free of charge, to any person obtaining a copy of 257 | this software and associated documentation files (the "Software"), to deal in 258 | the Software without restriction, including without limitation the rights to 259 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 260 | of the Software, and to permit persons to whom the Software is furnished to do 261 | so, subject to the following conditions: 262 | 263 | The above copyright notice and this permission notice shall be included in all 264 | copies or substantial portions of the Software. 265 | 266 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 267 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 268 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 269 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 270 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 271 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 272 | SOFTWARE. 
273 | 274 | ===================================================================== 275 | 5) MIT License 276 | 277 | Copyright (c) 2021 Will McGugan 278 | 279 | Permission is hereby granted, free of charge, to any person obtaining a copy 280 | of this software and associated documentation files (the "Software"), to deal 281 | in the Software without restriction, including without limitation the rights 282 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 283 | copies of the Software, and to permit persons to whom the Software is 284 | furnished to do so, subject to the following conditions: 285 | 286 | The above copyright notice and this permission notice shall be included in all 287 | copies or substantial portions of the Software. 288 | 289 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 290 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 291 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 292 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 293 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 294 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 295 | SOFTWARE. -------------------------------------------------------------------------------- /src/vulnhalla.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Core analysis engine for Vulnhalla. 4 | 5 | This module coordinates the aggregation of raw CodeQL findings and their 6 | classification by an LLM. It loads issues from CodeQL result files, 7 | groups them by issue type, runs LLM-based analysis to decide whether 8 | each finding is a true positive, false positive, or needs more data, 9 | and writes structured result files for further inspection (e.g. in the UI). 
"""

import os
import csv
import re
import json
from typing import Any, Callable, Dict, List, Optional, Tuple

# Import from common
from src.utils.common_functions import (
    get_all_dbs,
    read_file_lines_from_zip,
    read_file as read_file_utf8,
    write_file_ascii,
    read_yml
)

# Script that holds your GPT logic
from src.llm.llm_analyzer import LLMAnalyzer
from src.utils.config_validator import validate_and_exit_on_error
from src.utils.logger import get_logger
from src.utils.exceptions import VulnhallaError, CodeQLError

logger = get_logger(__name__)


class IssueAnalyzer:
    """
    Analyzes all issues in CodeQL databases, fetches relevant code snippets,
    and forwards them to an LLM (via llm_analyzer) for triage.
    """

    def __init__(self, lang: str = "c", config: Optional[Dict[str, Any]] = None) -> None:
        """
        Initialize the IssueAnalyzer with default parameters.

        Args:
            lang (str, optional): The language code. Defaults to 'c'.
            config (Dict, optional): Full LLM configuration dictionary. If not
                provided, configuration is validated/loaded from the .env file
                when run() is invoked.
        """
        self.lang = lang
        # db_path / code_path describe the database currently being analyzed;
        # they are (re)assigned per issue inside process_issue_type().
        self.db_path: Optional[str] = None
        self.code_path: Optional[str] = None
        self.config = config

    # ----------------------------------------------------------------------
    # 1. CSV Parsing and Data Gathering
    # ----------------------------------------------------------------------

    def parse_issues_csv(self, file_name: str) -> List[Dict[str, str]]:
        """
        Reads the issues.csv file produced by CodeQL (with a custom or default
        set of columns) and returns a list of dicts.

        Args:
            file_name (str): The path to 'issues.csv'.

        Returns:
            List[Dict[str, str]]: A list of issue objects parsed from CSV rows.

        Raises:
            CodeQLError: If file cannot be read (not found, permission denied, etc.).
        """
        # Column names are supplied explicitly, so every row in the file —
        # including the first — is consumed as data (the CSV has no header row).
        field_names = [
            "name", "help", "type", "message",
            "file", "start_line", "start_offset",
            "end_line", "end_offset"
        ]
        issues = []
        try:
            with open(file_name, "r", encoding="utf-8") as f:
                csv_reader = csv.DictReader(f, fieldnames=field_names)
                for row in csv_reader:
                    issues.append(row)
        except FileNotFoundError as e:
            raise CodeQLError(f"Issues CSV file not found: {file_name}") from e
        except PermissionError as e:
            raise CodeQLError(f"Permission denied reading issues CSV: {file_name}") from e
        except OSError as e:
            raise CodeQLError(f"OS error while reading issues CSV: {file_name}") from e
        return issues

    def collect_issues_from_databases(self, dbs_folder: str) -> Dict[str, List[Dict[str, str]]]:
        """
        Searches through all CodeQL databases in `dbs_folder`, collects issues
        from each DB, and groups them by issue name.

        Args:
            dbs_folder (str): The folder containing the language-specific databases.

        Returns:
            Dict[str, List[Dict[str, str]]]: All issues, grouped by issue name.
                Each issue dict is augmented with a "db_path" key pointing at
                the database it came from.

        Raises:
            CodeQLError: If database folder cannot be accessed or issues cannot be read.
        """
        issues_statistics: Dict[str, List[Dict[str, str]]] = {}
        # get_all_dbs() raises CodeQLError on errors
        dbs_path = get_all_dbs(dbs_folder)
        for curr_db in dbs_path:
            logger.info("Processing DB: %s", curr_db)
            function_tree_csv = os.path.join(curr_db, "FunctionTree.csv")
            issues_file = os.path.join(curr_db, "issues.csv")

            # Both files are produced by run_codeql_queries.py; a DB missing
            # either one is skipped (with an error log) rather than aborting.
            if os.path.exists(function_tree_csv) and os.path.exists(issues_file):
                # parse_issues_csv() raises CodeQLError on errors
                issues = self.parse_issues_csv(issues_file)
                for issue in issues:
                    if issue["name"] not in issues_statistics:
                        issues_statistics[issue["name"]] = []
                    issue["db_path"] = curr_db
                    issues_statistics[issue["name"]].append(issue)
            else:
                logger.error("Error: Execute run_codeql_queries.py first!")
                continue

        return issues_statistics

    # ----------------------------------------------------------------------
    # 2. Function and Snippet Extraction
    # ----------------------------------------------------------------------

    def find_function_by_line(self, function_tree_file: str, file_path: str, line: int) -> Optional[Dict[str, str]]:
        """
        Finds the most specific (smallest) function in the function tree file that includes the given file and line number.

        Args:
            function_tree_file (str): Path to the 'FunctionTree.csv' file.
            file_path (str): Partial or full file path to match in the CSV rows.
            line (int): The line number to check within function range.

        Returns:
            Optional[Dict[str, str]]: The best matching function dictionary, or None if not found.

        Raises:
            CodeQLError: If function tree file cannot be read (not found, permission denied, etc.).
        """
        keys = ["function_name", "file", "start_line", "function_id", "end_line", "caller_id"]
        best_function = None
        smallest_range = float('inf')

        try:
            with open(function_tree_file, "r", encoding="utf-8") as f:
                for row in f:
                    # Cheap substring pre-filter before the regex split.
                    if file_path in row:
                        # Split on commas outside double-quoted fields: the
                        # lookahead matches only when an even number of quotes
                        # remains ahead of the comma.
                        fields = re.split(r',(?=(?:[^"]*"[^"]*")*[^"]*$)', row.strip())
                        if len(fields) != len(keys):
                            continue  # Skip malformed rows

                        function = dict(zip(keys, fields))
                        try:
                            start_line = int(function["start_line"])
                            end_line = int(function["end_line"])
                        except ValueError:
                            continue  # Skip if lines aren't integers

                        # Keep the function with the narrowest line range that
                        # still contains `line` (i.e. the innermost match).
                        if start_line <= line <= end_line:
                            if file_path in function["file"]:
                                size = end_line - start_line
                                if size < smallest_range:
                                    best_function = function
                                    smallest_range = size
        except FileNotFoundError as e:
            raise CodeQLError(f"Function tree file not found: {function_tree_file}") from e
        except PermissionError as e:
            raise CodeQLError(f"Permission denied reading function tree file: {function_tree_file}") from e
        except OSError as e:
            raise CodeQLError(f"OS error while reading function tree file: {function_tree_file}") from e

        return best_function

    def extract_function_code(self, code_file: List[str], function_dict: Dict[str, str]) -> str:
        """
        Produces lines of the function's code from a list of lines.

        Args:
            code_file (List[str]): A list of lines for the entire file.
            function_dict (Dict[str, str]): The dictionary describing the function.

        Returns:
            str: A snippet string of code for the function, one line per source
                line, each prefixed with a line number and with tabs replaced
                by spaces. Empty string if function_dict is falsy.
        """
        if not function_dict:
            return ""
        start_line = int(function_dict["start_line"]) - 1
        end_line = int(function_dict["end_line"])
        snippet_lines = code_file[start_line:end_line]
        # NOTE(review): start_line is zero-based here, so the printed prefix is
        # one less than the 1-based source line number. build_prompt_by_template
        # applies the same -1 to its location string, so this appears
        # deliberate — confirm the prompt templates expect zero-based numbers.
        snippet = "\n".join(
            f"{start_line + i}: {s.replace(chr(9), ' ')}"
            for i, s in enumerate(snippet_lines)
        )
        return snippet

    # ----------------------------------------------------------------------
    # 3. Text Replacement & Prompt Building
    # ----------------------------------------------------------------------

    def create_bracket_reference_replacer(
        self,
        db_path: str,
        code_path: str
    ) -> Callable[[re.Match], str]:
        """
        Creates and returns a 'replacement' callback function that can be used with
        `re.sub` to transform bracketed references (like [[var|"file://path:line:..."]])
        into a more readable snippet inline with line references.

        Args:
            db_path (str): Path to the current CodeQL database.
            code_path (str): Base path to the code. May differ on Windows vs. Linux.

        Returns:
            Callable[[re.Match], str]: A function that can be used with `re.sub`.

        Note:
            The returned callback function may raise `CodeQLError` if ZIP file cannot be read.
        """
        def replacement(match):
            # Group layout comes from the bracket_pattern regex built in
            # process_issue_type(): variable, scheme, path, line, start, end.
            variable = match.group(1)
            path_type = match.group(2)
            file_path = match.group(3)
            line_number = match.group(4)
            start_offset = match.group(5)
            end_offset = match.group(6)

            # Read snippet from the code
            if path_type == "relative://":
                full_path = code_path + file_path
            else:
                # Handle 'file://' or something else by removing the leading slash
                full_path = file_path[1:] if file_path.startswith("/") else file_path

            code_text = read_file_lines_from_zip(
                os.path.join(db_path, "src.zip"),
                full_path
            )
            code_lines = code_text.split("\n")
            # Line number and start offset are 1-based in the reference, hence
            # the -1 adjustments when slicing.
            snippet = code_lines[int(line_number) - 1][int(start_offset) - 1:int(end_offset)]

            file_name = os.path.split(file_path)[1]
            return f"{variable} '{snippet}' ({file_name}:{int(line_number)})"

        return replacement

    def build_prompt_by_template(
        self,
        issue: Dict[str, str],
        message: str,
        snippet: str,
        code: str
    ) -> str:
        """
        Builds the final 'prompt' template to feed into an LLM, combining
        the code snippet, code content, and a set of hints.

        Args:
            issue (Dict[str, str]): The issue dictionary from parse_issues_csv.
            message (str): The processed "message" text to embed.
            snippet (str): The direct snippet from the code for the particular highlight.
            code (str): Additional code context (e.g. entire function).

        Returns:
            str: A final prompt string with the template + hints + snippet + code.

        Raises:
            VulnhallaError: If template files cannot be read (not found, permission denied, etc.).
        """
        # If language is 'c', many queries are stored under 'cpp'
        lang_folder = "cpp" if self.lang == "c" else self.lang

        # Try to read an existing template specific to the issue name
        hints_path = os.path.join("data/templates", lang_folder, issue["name"] + ".template")
        if not os.path.exists(hints_path):
            hints_path = os.path.join("data/templates", lang_folder, "general.template")

        hints = read_file_utf8(hints_path)

        # Read the larger general template
        template_path = os.path.join("data/templates", lang_folder, "template.template")
        template = read_file_utf8(template_path)

        # NOTE(review): start_line - 1 matches the zero-based numbering that
        # extract_function_code prints — confirm both stay in sync.
        location = "look at {file_line} with '{snippet}'".format(
            file_line=os.path.split(issue["file"])[1] + ":" + str(int(issue["start_line"]) - 1),
            snippet=snippet
        )

        # Special case for "Use of object after its lifetime has ended"
        if issue["name"] == "Use of object after its lifetime has ended":
            message = message.replace("here", f"here ({location})", 1)

        prompt = template.format(
            name=issue["name"],
            description=issue["help"],
            message=message,
            location=location,
            hints=hints,
            code=code
        )
        return prompt

    # ----------------------------------------------------------------------
    # 4. Saving LLM Results
    # ----------------------------------------------------------------------

    def ensure_directories_exist(self, dirs: List[str]) -> None:
        """
        Creates all directories in the given list if they do not already exist.

        Args:
            dirs (List[str]): A list of directory paths to create if missing.

        Raises:
            VulnhallaError: If directory creation fails (permission denied, etc.).
        """
        for d in dirs:
            if not os.path.exists(d):
                try:
                    os.makedirs(d, exist_ok=True)
                except PermissionError as e:
                    raise VulnhallaError(f"Permission denied creating directory: {d}") from e
                except OSError as e:
                    raise VulnhallaError(f"OS error creating directory: {d}") from e


    # ----------------------------------------------------------------------
    # 5. Main Analysis Routine
    # ----------------------------------------------------------------------

    def save_raw_input_data(
        self,
        prompt: str,
        function_tree_file: str,
        current_function: Dict[str, str],
        results_folder: str,
        issue_id: int
    ) -> None:
        """
        Saves the raw input data (prompt, function tree info, etc.) to a JSON file before
        sending it to the LLM.

        Args:
            prompt (str): The final prompt text sent to the LLM.
            function_tree_file (str): Path to 'FunctionTree.csv'.
            current_function (Dict[str, str]): The currently found function dict.
            results_folder (str): Folder path where we store the result files.
            issue_id (int): The numeric ID of the current issue.

        Raises:
            VulnhallaError: If file cannot be written (permission denied, etc.).
        """
        raw_data = json.dumps({
            "function_tree_file": function_tree_file,
            "current_function": current_function,
            "db_path": self.db_path,
            "code_path": self.code_path,
            "prompt": prompt
        }, ensure_ascii=False)

        raw_output_file = os.path.join(results_folder, f"{issue_id}_raw.json")
        write_file_ascii(raw_output_file, raw_data)

    def format_llm_messages(self, messages: List[str]) -> str:
        """
        Converts the list of messages returned by the LLM into a JSON-ish string to
        store as output.

        Args:
            messages (List[str]): The messages from the LLM.

        Returns:
            str: A string representation of LLM messages (somewhat JSON-formatted;
                multi-line messages are triple-quoted, single-line ones repr()'d,
                and escaped \\n / \\t sequences are expanded for readability —
                the result is NOT strict JSON).
        """
        gpt_result = "[\n " + ",\n ".join(
            f"'''{item}'''" if "\n" in item else repr(item) for item in messages).replace("\\n", "\n ").replace(
            "\\t", " ") + "\n]"
        return gpt_result

    def determine_issue_status(self, llm_content: str) -> str:
        """
        Checks the content returned by the LLM to see if it includes certain
        status codes that classify the issue as 'true' or 'false' or 'more'.

        Args:
            llm_content (str): The text content from the LLM's final response.

        Returns:
            str: "true" if content has '1337', "false" if content has '1007',
                otherwise "more".
        """
        # The sentinel codes are presumably requested from the LLM by the
        # prompt template (template.template) — verify they stay in sync.
        if "1337" in llm_content:
            return "true"
        elif "1007" in llm_content:
            return "false"
        else:
            return "more"

    def append_extra_functions(
        self,
        extra_lines: List[Tuple[str, str, str]],
        function_tree_file: str,
        src_zip_path: str,
        code: str,
        current_function: Dict[str, str]
    ) -> Tuple[str, List[Dict[str, str]]]:
        """
        Searches for additional functions (via bracket references) outside the current one
        and appends their code to the main snippet.

        Args:
            extra_lines (List[Tuple[str, str, str]]): All matches of additional
                references, as (path_type, file_ref, line_ref) tuples.
            function_tree_file (str): Path to 'FunctionTree.csv'.
            src_zip_path (str): Path to the DB's src.zip file.
            code (str): The existing code snippet.
            current_function (Dict[str, str]): The currently found function dict.

        Returns:
            Tuple[str, List[Dict[str, str]]]: The extended code snippet
                (possibly including multiple functions) and the list of all
                functions involved, starting with `current_function`.

        Raises:
            CodeQLError: If function tree file or ZIP file cannot be read.
        """
        functions = [current_function]
        for another_func_ref in extra_lines:
            path_type, file_ref, line_ref = another_func_ref
            file_ref = file_ref.strip()

            # Resolve the reference the same way the bracket replacer does:
            # relative:// paths are prefixed with code_path, others lose their
            # leading slash.
            if path_type == "relative://":
                file_ref = self.code_path + file_ref
            else:
                file_ref = file_ref[1:] if file_ref.startswith("/") else file_ref

            # If it's within the same function's line range, skip
            start_line_func = int(current_function["start_line"])
            end_line_func = int(current_function["end_line"])
            if start_line_func <= int(line_ref) <= end_line_func:
                continue

            # Attempt to find the new function
            new_function = self.find_function_by_line(function_tree_file, "/" + file_ref, int(line_ref))
            if new_function and new_function not in functions:
                functions.append(new_function)
                code_file2 = read_file_lines_from_zip(src_zip_path, file_ref).split("\n")
                code += (
                    "\n\nfile: " + file_ref + "\n" +
                    self.extract_function_code(code_file2, new_function)
                )

        return code, functions

    def process_issue_type(
        self,
        issue_type: str,
        issues_of_type: List[Dict[str, str]],
        llm_analyzer: LLMAnalyzer
    ) -> None:
        """
        Processes all issues of a single type. Builds file/folder paths, runs
        analysis, calls the LLM, and saves results.

        Args:
            issue_type (str): The name of the issue type.
            issues_of_type (List[Dict[str, str]]): All issues belonging to that type.
            llm_analyzer (LLMAnalyzer): The LLM analyzer instance to use for queries.

        Raises:
            CodeQLError: If database files cannot be read (YAML, ZIP, CSV, etc.).
            VulnhallaError: If result files cannot be written.
            LLMError: If LLM analysis fails.
        """
        # One results folder per issue type; spaces/slashes sanitized for the filesystem.
        results_folder = os.path.join("output/results", self.lang, issue_type.replace(" ", "_").replace("/", "-"))
        self.ensure_directories_exist([results_folder])

        issue_id = 0
        real_issues = []
        false_issues = []
        more_data = []

        logger.info("Found %d issues of type %s", len(issues_of_type), issue_type)
        logger.info("")
        for issue in issues_of_type:
            issue_id += 1
            self.db_path = issue["db_path"]
            db_yml_path = os.path.join(self.db_path, "codeql-database.yml")
            db_yml = read_yml(db_yml_path)
            self.code_path = db_yml["sourceLocationPrefix"]

            # Adjust Windows / Linux path references.
            # A drive letter ('C:\...') becomes 'C_/...'; a POSIX path just
            # loses its leading slash — assumed to match the layout inside
            # the DB's src.zip (TODO confirm).
            if ":" in self.code_path:
                self.code_path = self.code_path.replace(":", "_").replace("\\", "/")
            else:
                self.code_path = self.code_path[1:]

            function_tree_file = os.path.join(self.db_path, "FunctionTree.csv")
            src_zip_path = os.path.join(self.db_path, "src.zip")

            full_file_path = self.code_path + issue["file"]
            code_file_contents = read_file_lines_from_zip(src_zip_path, full_file_path).split("\n")

            current_function = self.find_function_by_line(
                function_tree_file,
                "/" + self.code_path + issue["file"],
                int(issue["start_line"])
            )
            if not current_function:
                logger.warning("issue %s: Can't find the function or function is too big!", issue_id)
                continue

            # CSV line/offset values are 1-based; Python indexing is 0-based.
            snippet = code_file_contents[int(issue["start_line"]) - 1][
                int(issue["start_offset"]) - 1:int(issue["end_offset"])
            ]

            code = (
                "file: " + self.code_path + issue["file"] + "\n" +
                self.extract_function_code(code_file_contents, current_function)
            )

            # Replace bracket references in the issue message
            bracket_pattern = r'\[\["(.*?)"\|"((?:relative://|file://))?(/.*?):(\d+):(\d+):\d+:(\d+)"\]\]'
            transform_func = self.create_bracket_reference_replacer(self.db_path, self.code_path)
            message = re.sub(bracket_pattern, transform_func, issue["message"])

            # Also check for lines referencing other code blocks
            extra_lines_pattern = r'\[\[".*?"\|"((?:relative://|file://)?)(/.*?):(\d+):\d+:\d+:\d+"\]\]'
            extra_lines = re.findall(extra_lines_pattern, issue["message"])
            functions = [current_function]

            if extra_lines:
                code, functions = self.append_extra_functions(
                    extra_lines, function_tree_file, src_zip_path, code, current_function
                )

            prompt = self.build_prompt_by_template(issue, message, snippet, code)

            # Save raw input to the LLM
            self.save_raw_input_data(prompt, function_tree_file, current_function, results_folder, issue_id)

            # Send to LLM
            messages, content = llm_analyzer.run_llm_security_analysis(
                prompt,
                function_tree_file,
                current_function,
                functions,
                self.db_path
            )
            gpt_result = self.format_llm_messages(messages)
            final_file = os.path.join(results_folder, f"{issue_id}_final.json")
            write_file_ascii(final_file, gpt_result)

            # Check status code in LLM content
            status = self.determine_issue_status(content)
            if status == "true":
                real_issues.append(issue_id)
                status = "True Positive"
            elif status == "false":
                false_issues.append(issue_id)
                status = "False Positive"
            else:
                more_data.append(issue_id)
                status = "LLM needs More Data"

            # Log issue status
            logger.info("Issue ID: %s, LLM decision: → %s", issue_id, status)

        # Per-type summary; empty info() calls act as blank separator lines.
        logger.info("")
        logger.info("Issue type: %s", issue_type)
        logger.info("Total issues: %d", len(issues_of_type))
        logger.info("True Positive: %d", len(real_issues))
        logger.info("False Positive: %d", len(false_issues))
        logger.info("LLM needs More Data: %d", len(more_data))
        logger.info("")

    def run(self) -> None:
        """
        Main analysis routine:

        1. Initializes the LLM.
        2. Finds all CodeQL DBs for the given language.
        3. Parses each DB's issues.csv, aggregates them by issue type.
        4. Asks the LLM for each issue's snippet context, saving final results
           in various directory structures.

        Raises:
            CodeQLError: If database files cannot be accessed or read.
            VulnhallaError: If directory creation or file writing fails.
            LLMError: If LLM initialization or analysis fails.
        """
        # Validate configuration before starting (only when no explicit config
        # was supplied to the constructor).
        if self.config is None:
            validate_and_exit_on_error()

        llm_analyzer = LLMAnalyzer()
        llm_analyzer.init_llm_client(config=self.config)

        dbs_folder = os.path.join("output/databases", self.lang)

        # Gather issues from all DBs
        issues_statistics = self.collect_issues_from_databases(dbs_folder)

        total_issues = 0
        for issue_type in issues_statistics:
            total_issues += len(issues_statistics[issue_type])
        logger.info("Total issues found: %d", total_issues)
        logger.info("")

        # Process all issues, type by type
        for issue_type in issues_statistics.keys():
            self.process_issue_type(issue_type, issues_statistics[issue_type], llm_analyzer)

if __name__ == '__main__':
    # Initialize logging
    from src.utils.logger import setup_logging
    setup_logging()

    # Loads configuration from .env file
    # Or use: analyzer = IssueAnalyzer(lang="c", config={...})
    analyzer = IssueAnalyzer(lang="c")
    analyzer.run()