├── .coveragerc ├── .env.example ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.yml │ ├── config.yml │ ├── documentation.yml │ └── enhancement.yml └── workflows │ └── ci.yml ├── .gitignore ├── .pre-commit-config.yaml ├── .readthedocs.yml ├── CHANGELOG.md ├── CONTRIBUTING.md ├── LICENSE ├── Makefile ├── README.md ├── configs ├── database.yml └── prompts │ └── default.yml ├── data └── examples │ ├── example.sql │ └── folders_with_sql_files │ ├── example1.sql │ ├── example2.sql │ ├── random_file.txt │ ├── subfolder1 │ ├── example3.sql │ └── example3_expected.json │ └── subfolder2 │ ├── random_file.txt │ └── subfolder2_1 │ ├── example4.sql │ ├── example5.sql │ └── random_file.yml ├── docs ├── .gitkeep ├── api-reference │ ├── app.md │ ├── cache.md │ ├── cli.md │ ├── config.md │ ├── database.md │ ├── llm-parsers.md │ ├── models.md │ ├── parallel.md │ ├── rate-limiter.md │ ├── utils.md │ └── visualization.md ├── assets │ └── images │ │ └── sqldeps_logo.png ├── authors.md ├── changelog.md ├── contributing.md ├── docs-requirements.txt ├── examples.md ├── getting-started │ ├── installation.md │ └── quick-start.md ├── index.md ├── stylesheets │ └── custom.css └── user-guide │ ├── api-usage.md │ ├── cli-usage.md │ ├── database-integration.md │ ├── visualization.md │ └── web-app.md ├── mkdocs.yml ├── notebooks ├── .gitkeep └── sqldeps_showcase.ipynb ├── pyproject.toml ├── scripts └── .gitkeep ├── sqldeps ├── __init__.py ├── app │ ├── __init__.py │ ├── assets │ │ └── images │ │ │ ├── sqldeps_gray.png │ │ │ └── sqldeps_white.png │ └── main.py ├── cache.py ├── cli.py ├── config.py ├── configs │ └── prompts │ │ ├── default.yml │ │ ├── default_v0.1.0.yml │ │ └── simplified.yml ├── database │ ├── __init__.py │ ├── base.py │ └── postgresql.py ├── llm_parsers │ ├── __init__.py │ ├── base.py │ ├── deepseek.py │ ├── groq.py │ ├── litellm.py │ └── openai.py ├── models.py ├── parallel.py ├── rate_limiter.py ├── utils.py └── visualization.py └── tests ├── conftest.py ├── data ├── expected_outputs │ ├── example10_expected.json │ ├── example1_expected.json │ ├── example2_expected.json │ ├── example3_expected.json │ ├── example4_expected.json │ ├── example5_expected.json │ ├── example6_expected.json │ ├── example7_expected.json │ ├── example8_expected.json │ └── example9_expected.json ├── oneshot.json ├── oneshot.sql └── sql │ ├── example1.sql │ ├── example10.sql │ ├── example2.sql │ ├── example3.sql │ ├── example4.sql │ ├── example5.sql │ ├── example6.sql │ ├── example7.sql │ ├── example8.sql │ └── example9.sql ├── functional └── test_sql.py ├── integration └── test_database.py └── unit ├── app ├── __init__.py └── test_main.py ├── database └── test_postgresql.py ├── llm_parsers ├── test_base.py ├── test_deepseek.py ├── test_groq.py ├── test_init.py └── test_openai.py ├── test_cache.py ├── test_cli.py ├── test_config.py ├── test_models.py ├── test_parallel.py ├── test_rate_limiter.py ├── test_utils.py └── test_visualization.py /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | omit = 3 | sqldeps/app/* 4 | -------------------------------------------------------------------------------- /.env.example: -------------------------------------------------------------------------------- 1 | # Database credentials 2 | DB_HOST = "host" 3 | DB_PORT = "5432" 4 | DB_NAME = "database" 5 | DB_USER = "username" 6 | DB_PASSWORD = "password" 7 | 8 | # Test database credentials 9 | TEST_DB_HOST = "host" 10 | TEST_DB_PORT = "5432" 11 | TEST_DB_NAME = "database" 12 | TEST_DB_USER 
= "username" 13 | TEST_DB_PASSWORD = "password" 14 | 15 | # API Keys 16 | GROQ_API_KEY = "groq_token" 17 | OPENAI_API_KEY = "openai_token" 18 | DEEPSEEK_API_KEY = "deepseek_token" 19 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.yml: -------------------------------------------------------------------------------- 1 | name: "🐛 Bug Report" 2 | description: Report a bug in SQLDeps. 3 | title: "[BUG]: " 4 | labels: ["bug"] 5 | 6 | body: 7 | - type: markdown 8 | attributes: 9 | value: | 10 | Thanks for taking the time to fill out this bug report! 11 | - type: textarea 12 | id: what-happened 13 | attributes: 14 | label: What happened? 15 | description: A clear and concise description of what the bug is. 16 | placeholder: Tell us what you see! 17 | validations: 18 | required: true 19 | - type: textarea 20 | id: reproduce 21 | attributes: 22 | label: Steps to reproduce 23 | description: How can we reproduce this issue? 24 | placeholder: | 25 | 1. Run `...` 26 | 2. See error 27 | validations: 28 | required: true 29 | - type: textarea 30 | id: expected 31 | attributes: 32 | label: Expected behavior 33 | description: What did you expect to happen? 34 | - type: textarea 35 | id: environment 36 | attributes: 37 | label: Environment 38 | description: Include relevant details about your environment 39 | placeholder: | 40 | - Python version: [e.g. 3.12] 41 | - SQLDeps version: [e.g. 0.0.10] 42 | - OS: [e.g. macOS, Windows, Linux] 43 | - type: textarea 44 | id: additional 45 | attributes: 46 | label: Additional context 47 | description: Add any other context or screenshots about the bug here. 48 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: true 2 | contact_links: 3 | - name: Questions 4 | about: Ask a question or ask about a problem in GitHub Discussions. 5 | url: https://github.com/glue-lab/sqldeps/discussions/categories/questions 6 | - name: Feature Request 7 | about: To suggest an idea or ask about a feature, please start with a question saying what you would like to achieve. 8 | url: https://github.com/glue-lab/sqldeps/discussions/categories/ideas 9 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/documentation.yml: -------------------------------------------------------------------------------- 1 | name: "🖹 Documentation" 2 | description: Report an issue (e.g., typo) related to the documentation. 3 | title: "[DOC]: " 4 | labels: [documentation] 5 | 6 | body: 7 | - type: markdown 8 | attributes: 9 | value: | 10 | Thanks for helping us improve the SQLDeps documentation! 11 | - type: dropdown 12 | id: type 13 | attributes: 14 | label: Type of documentation issue 15 | options: 16 | - Error/typo in existing documentation 17 | - Missing documentation 18 | - Confusing explanation 19 | - Other 20 | validations: 21 | required: true 22 | - type: textarea 23 | id: description 24 | attributes: 25 | label: Description 26 | description: What needs to be improved or fixed? 27 | placeholder: A clear description of what's wrong or missing in the documentation 28 | validations: 29 | required: true 30 | - type: textarea 31 | id: location 32 | attributes: 33 | label: Location 34 | description: Where can we find this documentation issue? 35 | placeholder: URLs, file paths, etc. 
36 | - type: textarea 37 | id: suggestion 38 | attributes: 39 | label: Suggested improvement 40 | description: Have suggestions for how to improve the documentation? 41 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/enhancement.yml: -------------------------------------------------------------------------------- 1 | name: "📈 Enhancement Request" 2 | description: Suggest an enhancement for SQLDeps 3 | title: "[Enhancement]: " 4 | labels: ["enhancement"] 5 | 6 | body: 7 | - type: markdown 8 | attributes: 9 | value: | 10 | Thanks for suggesting an enhancement to SQLDeps! 11 | - type: textarea 12 | id: problem 13 | attributes: 14 | label: Problem to solve 15 | description: What problem would this enhancement solve? 16 | placeholder: A clear description of what limitation or issue you're trying to address 17 | validations: 18 | required: true 19 | - type: textarea 20 | id: solution 21 | attributes: 22 | label: Proposed solution 23 | description: What solution would you like to see? 24 | placeholder: Describe how you'd like to see this implemented 25 | validations: 26 | required: true 27 | - type: textarea 28 | id: additional 29 | attributes: 30 | label: Additional details 31 | description: Add any other context, code examples, or references here 32 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: Test 2 | 3 | on: 4 | push: 5 | branches: [main] 6 | pull_request: 7 | 8 | jobs: 9 | lint: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v4 13 | 14 | - name: Install uv 15 | run: | 16 | curl -LsSf https://astral.sh/uv/install.sh | sh 17 | echo "$HOME/.local/bin" >> $GITHUB_PATH 18 | 19 | - name: Install core dependencies 20 | run: uv sync 21 | 22 | - name: Lint 23 | run: | 24 | uv run ruff check . 25 | uv run ruff format --check . 
26 | 27 | test: 28 | strategy: 29 | matrix: 30 | python-version: ["3.10", "3.11", "3.12", "3.13"] 31 | os: ["ubuntu-latest", "macos-latest", "windows-latest"] 32 | 33 | runs-on: ${{ matrix.os }} 34 | 35 | steps: 36 | - uses: actions/checkout@v4 37 | 38 | - name: Set up Python ${{ matrix.python-version }} 39 | uses: actions/setup-python@v5 40 | with: 41 | python-version: ${{ matrix.python-version }} 42 | 43 | - name: Install uv (Unix) 44 | if: runner.os != 'Windows' 45 | run: | 46 | curl -LsSf https://astral.sh/uv/install.sh | sh 47 | echo "$HOME/.local/bin" >> $GITHUB_PATH 48 | 49 | - name: Install uv (Windows) 50 | if: runner.os == 'Windows' 51 | run: | 52 | powershell -ExecutionPolicy ByPass -c "irm https://astral.sh/uv/install.ps1 | iex" 53 | 54 | - name: Install PostgreSQL (Ubuntu) 55 | if: matrix.os == 'ubuntu-latest' 56 | run: sudo apt-get update && sudo apt-get install -y libpq-dev 57 | 58 | - name: Install PostgreSQL (macOS) 59 | if: matrix.os == 'macos-latest' 60 | run: brew install postgresql 61 | 62 | - name: check python version 63 | run: uv run python --version 64 | 65 | - name: Install all dependencies (including optional) 66 | run: uv sync --all-extras 67 | 68 | - name: Run tests 69 | run: uv run pytest 70 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | # Pipfile.lock 96 | 97 | # UV 98 | # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. 
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | uv.lock 102 | 103 | # poetry 104 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 105 | # This is especially recommended for binary packages to ensure reproducibility, and is more 106 | # commonly ignored for libraries. 107 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 108 | # poetry.lock 109 | 110 | # pdm 111 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 112 | #pdm.lock 113 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 114 | # in version control. 115 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 116 | .pdm.toml 117 | .pdm-python 118 | .pdm-build/ 119 | 120 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 121 | __pypackages__/ 122 | 123 | # Celery stuff 124 | celerybeat-schedule 125 | celerybeat.pid 126 | 127 | # SageMath parsed files 128 | *.sage.py 129 | 130 | # Environments 131 | .env 132 | .venv 133 | env/ 134 | venv/ 135 | ENV/ 136 | env.bak/ 137 | venv.bak/ 138 | 139 | # Spyder project settings 140 | .spyderproject 141 | .spyproject 142 | 143 | # Rope project settings 144 | .ropeproject 145 | 146 | # mkdocs documentation 147 | /site 148 | 149 | # mypy 150 | .mypy_cache/ 151 | .dmypy.json 152 | dmypy.json 153 | 154 | # Pyre type checker 155 | .pyre/ 156 | 157 | # pytype static type analyzer 158 | .pytype/ 159 | 160 | # Cython debug symbols 161 | cython_debug/ 162 | 163 | # PyCharm 164 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 165 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 166 | # and can be added to the global gitignore or merged into this file. For a more nuclear 167 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 168 | .idea/ 169 | 170 | # Ruff stuff: 171 | .ruff_cache/ 172 | 173 | # SQLDeps-specific cache: 174 | .sqldeps_cache 175 | 176 | # PyPI configuration file 177 | .pypirc 178 | 179 | # Data - ignore all data execpt examples 180 | data/* 181 | !/data/examples 182 | 183 | # Ignore output artifacts 184 | outputs/ 185 | artifacts/ 186 | 187 | # Ignore notebooks except target ones 188 | notebooks/* 189 | !notebooks/.gitkeep 190 | !notebooks/sqldeps_showcase.ipynb 191 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/astral-sh/ruff-pre-commit 3 | # Ruff version. 4 | rev: v0.11.8 5 | hooks: 6 | # Run check for the linter. 7 | - id: ruff 8 | types_or: [ python, pyi ] 9 | # Run check for the formatter. 
10 | - id: ruff-format 11 | args: [ --check ] 12 | types_or: [ python, pyi ] 13 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | build: 4 | os: ubuntu-22.04 5 | tools: 6 | python: "3.12" 7 | 8 | mkdocs: 9 | configuration: mkdocs.yml 10 | fail_on_warning: false 11 | 12 | python: 13 | install: 14 | - requirements: docs/docs-requirements.txt 15 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | All notable changes to SQLDeps will be documented in this file. 4 | 5 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 6 | 7 | ## [0.1.1] - 2025-05-05 8 | 9 | ### Added 10 | - Added LiteLLM as the default proxy to LLM providers 11 | - Added pre-commit hook configuration 12 | - Added example for SQL database connection config file 13 | - Added optional PostgreSQL dependencies 14 | 15 | ### Changed 16 | - Updated and optimized prompts for better SQL dependency analysis 17 | - Improved documentation (README, installation, quick-start, user guides) 18 | - Updated and optimized tests with new test data 19 | - Updated SQLDeps version in package metadata 20 | - Updated web application 21 | 22 | ### Fixed 23 | - Fixed temperature parameter usage in OpenAI calls 24 | 25 | ## [0.1.0] - 2025-04-04 26 | 27 | ### Added 28 | 29 | 30 | - Added caching system (#3, #11) 31 | - Added rate limiter per minute (RPM) (#2, #11) 32 | - Added support to multiprocessing (#2, #11) 33 | - Added Streamlit-based app as part of optional package dependencies (#7) 34 | - Added GiHub Issue templates (#1). 35 | 36 | ### Changed 37 | 38 | - Improved CLI with new features and subcommands (#8, #11) 39 | - Improved unit test coverage and test structure (#5, #11) 40 | - Improved interactive visualization function input (#12, #14) 41 | - Added/improved comprehensive docstring for Python files (#9, #12) 42 | 43 | ### Fixed 44 | 45 | - Bug fixes (#6) 46 | 47 | ### Removed 48 | 49 | - Unused/outdated validation scripts 50 | 51 | ## [0.0.1] - 2025-03-20 52 | 53 | - Pre-release of SQLDeps! 54 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to SQLDeps 2 | 3 | Thank you for considering contributing to SQLDeps! This guide explains how to set up your development environment and contribute to the project. 4 | 5 | ## Questions and Discussion 6 | 7 | For questions or discussions, please check [SQLDeps Discussions](https://github.com/glue-lab/sqldeps/discussions) or reach out to the maintainers directly. 8 | 9 | ## Development Setup 10 | 11 | ### Prerequisites 12 | 13 | - [Git](https://git-scm.com/) 14 | - [UV](https://docs.astral.sh/uv/) 15 | 16 | ### Clone the Repository 17 | 18 | ```bash 19 | git clone https://github.com/glue-lab/sqldeps.git 20 | cd sqldeps 21 | ``` 22 | 23 | ### Install Development Dependencies 24 | 25 | SQLDeps uses [`uv`](https://github.com/astral-sh/uv) as the package manager for development. 
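If `uv` is not installed yet, it can be installed with the same commands used in the project's CI workflow (see the uv documentation for other installation methods):

```bash
# Install uv on Linux/macOS (same installer the CI workflow uses)
curl -LsSf https://astral.sh/uv/install.sh | sh

# Install uv on Windows (PowerShell)
powershell -ExecutionPolicy ByPass -c "irm https://astral.sh/uv/install.ps1 | iex"
```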
26 | 27 | After installing UV, run: 28 | 29 | ```bash 30 | uv sync 31 | ``` 32 | 33 | This will create a virtual environment with the correct Python version and all the required dependencies, including: 34 | 35 | - Core dependencies 36 | - Development tools (`pytest`, `ruff`, etc.) 37 | - Documentation tools (`mkdocs`, etc.) 38 | 39 | ### Environment Variables 40 | 41 | Create a `.env` file in the project root with your API keys: 42 | 43 | ``` 44 | # LLM API Keys 45 | GROQ_API_KEY=your_groq_api_key 46 | OPENAI_API_KEY=your_openai_api_key 47 | DEEPSEEK_API_KEY=your_deepseek_api_key 48 | ANTHROPIC_API_KEY=your_anthropic_api_key 49 | ``` 50 | 51 | For instance, [Groq](https://console.groq.com/keys) offers free tokens without requiring payment details, making it ideal for contributions. 52 | 53 | ## Development Workflow 54 | 55 | ### Code Style 56 | 57 | SQLDeps uses Ruff for code formatting and linting: 58 | 59 | ```bash 60 | # Format code 61 | uv run ruff format . 62 | 63 | # Fix linting issues 64 | uv run ruff check . --fix 65 | ``` 66 | 67 | Alternatively, you can easily check and apply formatting and linting with `make`: 68 | 69 | ```bash 70 | # Check code style without fixing 71 | make check 72 | 73 | # Apply fixes 74 | make fix 75 | ``` 76 | 77 | ### Running Tests 78 | 79 | The test suite is set up with markers to allow selective testing: 80 | 81 | ```bash 82 | # Run all tests except those marked as 'llm' or 'integration' 83 | # This is the default when running pytest without arguments 84 | uv run pytest 85 | 86 | # Run tests with a specific marker 87 | uv run pytest -m llm # Run LLM-dependent tests (requires API keys) 88 | uv run pytest -m integration # Run integration tests (requires database) 89 | 90 | # Run tests with a specific framework 91 | uv run pytest --framework=groq 92 | 93 | # Run specific test files 94 | uv run pytest tests/unit/test_models.py 95 | 96 | # Run with coverage report 97 | uv run pytest --cov=sqldeps 98 | ``` 99 | 100 | Note that by default tests marked with `llm` and `integration` are skipped to avoid requiring external dependencies during CI/CD. These tests require valid API keys and/or database connections. 101 | 102 | ### Building Documentation 103 | 104 | ```bash 105 | # Build and serve documentation locally 106 | uv run mkdocs serve 107 | ``` 108 | 109 | This will start a local server at `http://127.0.0.1:8000` where you can preview the documentation. 110 | 111 | ## Project Structure 112 | 113 | Here's the simplified project structure: 114 | 115 | ``` 116 | sqldeps/ 117 | ├── .github/ # GitHub configuration files 118 | ├── configs/ # External configuration files for experiments 119 | ├── docs/ # Documentation files 120 | ├── sqldeps/ # Main package source code 121 | │ ├── app/ # Streamlit web application 122 | │ ├── database/ # Database connector implementations 123 | │ ├── llm_parsers/ # LLM integration for SQL parsing 124 | │ └── ... # Other core modules 125 | └── tests/ # Test suite 126 | ``` 127 | 128 | ## Adding Features 129 | 130 | ### Adding a New LLM Provider 131 | 132 | 1. Create a new file in `sqldeps/llm_parsers/` following the pattern of existing providers 133 | 2. Implement the required methods from `BaseSQLExtractor` 134 | 3. Add the new provider to `__init__.py` and the `DEFAULTS` dictionary 135 | 4. Add tests in `tests/` (both unit and functional tests) 136 | 137 | ### Adding Database Support 138 | 139 | 1. Create a new file in `sqldeps/database/` following the pattern of existing connectors 140 | 2. 
Implement the required methods from `SQLBaseConnector` 141 | 3. Add the new connector to `__init__.py` 142 | 4. Add tests in `tests/` (both unit and integration tests) 143 | 144 | ## Pull Request Process 145 | 146 | 1. Fork the repository 147 | 2. Create a new branch for your feature or bug fix 148 | 3. Make your changes and add tests 149 | 4. Run the tests and linting checks 150 | 5. Update documentation if necessary 151 | 6. Submit a pull request to the `main` branch 152 | 153 | ## Package Versioning 154 | 155 | SQLDeps follows [Semantic Versioning](https://semver.org/): 156 | 157 | - **MAJOR** version when making incompatible API changes 158 | - **MINOR** version when adding functionality in a backward-compatible manner 159 | - **PATCH** version when making backward-compatible bug fixes 160 | 161 | ## Code of Conduct 162 | 163 | Please be respectful and inclusive when contributing to SQLDeps. We strive to maintain a welcoming environment for all contributors. 164 | 165 | ## License 166 | 167 | By contributing to SQLDeps, you agree that your contributions will be licensed under the project's MIT License. 168 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Global Land Use and Environment Lab 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: fix 2 | fix: 3 | uv run ruff format . 4 | uv run ruff check . --fix 5 | 6 | .PHONY: check 7 | check: 8 | -uv run ruff format . --check 9 | -uv run ruff check . 10 | 11 | .PHONY: clean 12 | clean: 13 | # Remove Python cache directories 14 | find . -type d \( \ 15 | -name "__pycache__" -o \ 16 | -name "*.egg-info" -o \ 17 | -name ".eggs" -o \ 18 | -name ".ipynb_checkpoints" \ 19 | \) -exec rm -rf {} + 20 | 21 | # Remove compiled Python files 22 | find . -name "*.pyc" -delete 23 | 24 | # Remove build, test, and cache directories 25 | rm -rf dist build htmlcov .pytest_cache .ruff_cache .sqldeps_cache .mypy_cache .tox 2>/dev/null || true 26 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SQLDeps: SQL Dependency Extractor 2 | 3 |

[SQLDeps logo]

[Badges: Test · Documentation · Supported Python versions · Package version · License]

24 | 25 | A tool that automatically extracts and maps SQL dependencies and outputs using Large Language Models (LLMs). 26 | 27 | --- 28 | 29 | - **Documentation**: [https://sqldeps.readthedocs.io/](https://sqldeps.readthedocs.io/) 30 | - **Code repository**: [https://github.com/glue-lab/sqldeps](https://github.com/glue-lab/sqldeps) 31 | 32 | 33 | 34 | --- 35 | 36 | ## Overview 37 | 38 | SQLDeps analyzes SQL scripts to identify: 39 | 40 | 1. **Dependencies**: Tables and columns that must exist BEFORE query execution 41 | 2. **Outputs**: Tables and columns permanently CREATED or MODIFIED by the query 42 | 43 | It intelligently filters out temporary constructs like CTEs and derived tables, focusing only on the real database objects that matter. 44 | 45 | ### Benefits 46 | 47 | - 🛠️ **Change Management:** Safely modify schemas by identifying true dependencies 48 | - 💾 **Storage Optimization:** Focus resources on essential tables and columns 49 | - 🚢 **Migration Planning:** Precisely determine what needs to be migrated 50 | - 📝 **Project Documentation:** Create comprehensive maps of database dependencies 51 | 52 | ## Installation 53 | 54 | ```bash 55 | pip install sqldeps 56 | ``` 57 | 58 | For additional functionality: 59 | 60 | ```bash 61 | # Install with web app dependencies 62 | pip install "sqldeps[app]" 63 | 64 | # Install with data visualization dependencies 65 | pip install "sqldeps[dataviz]" 66 | 67 | # Install all optional dependencies 68 | pip install "sqldeps[app,postgres,dataviz]" 69 | ``` 70 | 71 | ## Quick Start 72 | 73 | SQLDeps provides both API and CLI interfaces: 74 | - **API**: Flexible for Python developers to integrate into scripts, notebooks, or applications. 75 | - **CLI**: Fast and user-friendly for analyzing files or folders directly from the command line.
76 | 77 | ### API Usage 78 | 79 | ```python 80 | from sqldeps.llm_parsers import create_extractor 81 | 82 | # Create extractor with default settings (framework="litellm", model="openai/gpt-4.1") 83 | extractor = create_extractor() 84 | 85 | # Extract dependencies and outputs from a SQL query 86 | sql_query = """ 87 | WITH user_orders AS ( 88 | SELECT o.user_id, COUNT(*) AS order_count 89 | FROM orders o 90 | JOIN users u ON o.user_id = u.id 91 | WHERE u.status = 'active' 92 | GROUP BY o.user_id 93 | ) 94 | 95 | CREATE TABLE transactions.user_order_summary AS 96 | SELECT * FROM user_orders; 97 | """ 98 | result = extractor.extract_from_query(sql_query) 99 | 100 | # Print the results 101 | print("Dependencies:") 102 | print(result.dependencies) 103 | print("\nOutputs:") 104 | print(result.outputs) 105 | 106 | # Or extract from a file 107 | result = extractor.extract_from_file('path/to/query.sql') 108 | 109 | # Convert to dictionary or DataFrame 110 | dict_format = result.to_dict() 111 | df_format = result.to_dataframe() 112 | ``` 113 | 114 | ### CLI Usage 115 | 116 | ```bash 117 | # Basic example with default settings 118 | sqldeps extract path/to/query.sql 119 | 120 | # Specify framework and output format 121 | sqldeps extract path/to/query.sql --framework=openai --model=gpt-4.1-mini -o results.json 122 | 123 | # Scan a folder recursively with intelligent parallelization 124 | sqldeps extract \ 125 | data/sql_folder \ # Automatically detect if path is file or folder 126 | --recursive \ # Scan folder recursively 127 | --framework=deepseek \ # Specify framework/provider 128 | --rpm 50 # Maximum 50 requests per minute 129 | --n-workers -1 \ # Use all available processors 130 | -o results.csv # Output a dataframe as CSV instead of JSON 131 | ``` 132 | 133 | ```bash 134 | # Get help on available commands 135 | sqldeps --help 136 | 137 | # Get help on extract - the main command 138 | sqldeps extract --help 139 | ``` 140 | 141 | ### Web Application 142 | 143 | SQLDeps also includes a Streamlit-based web interface: 144 | 145 | ```bash 146 | # Run the web app 147 | sqldeps app 148 | ``` 149 | 150 | **Note**: The web application is designed for single-file extraction and demonstration purposes. For processing multiple files or entire folders, use the API or CLI instead. 151 | 152 | ## Example 153 | 154 | Given this SQL query: 155 | 156 | ```sql 157 | -- Common Table Expression (CTE) to count user orders for active users 158 | WITH user_orders AS ( 159 | SELECT o.user_id, COUNT(*) AS order_count 160 | FROM orders o 161 | JOIN users u ON o.user_id = u.id 162 | WHERE u.status = 'active' 163 | GROUP BY o.user_id 164 | ) 165 | 166 | -- Create a new table from the CTE 167 | CREATE TABLE transactions.user_order_summary AS 168 | SELECT * FROM user_orders; 169 | ``` 170 | 171 | SQLDeps will extract: 172 | 173 | ```json 174 | { 175 | "dependencies": { 176 | "orders": ["user_id"], 177 | "users": ["id", "status"] 178 | }, 179 | "outputs": { 180 | "transactions.user_order_summary": ["*"] 181 | } 182 | } 183 | ``` 184 | 185 | Notice how: 186 | 187 | - CTE (`user_orders`) is correctly excluded 188 | - Real source tables (`orders`, `users`) are included as dependencies 189 | - Target table (`transactions.user_order_summary`) is correctly identified as output 190 | 191 | ## Supported Models 192 | 193 | All models available on [Groq](https://console.groq.com/docs/models), [OpenAI](https://platform.openai.com/docs/models), and [DeepSeek](https://api-docs.deepseek.com/). 
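To target a specific provider, pass a `provider/model` identifier to `create_extractor`. A brief sketch using model names taken from other examples in this README (model names change over time, so check each provider's model list):

```python
from sqldeps.llm_parsers import create_extractor

# Illustrative model identifiers borrowed from other examples in this README;
# substitute any model supported by your provider.
groq_extractor = create_extractor(model="groq/llama-3.3-70b-versatile")
openai_extractor = create_extractor(model="openai/gpt-4.1-mini")

result = groq_extractor.extract_from_query("SELECT id, name FROM users")
print(result.dependencies)
```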
194 | For up-to-date pricing details, please check [Groq](https://groq.com/pricing/), [OpenAI](https://platform.openai.com/docs/pricing), [DeepSeek](https://api-docs.deepseek.com/quick_start/pricing). 195 | 196 | ## API Keys / Configuration 197 | 198 | You'll need to set up API keys for your chosen LLM provider. Create a `.env` file in your project root: 199 | 200 | ``` 201 | # LLM API Keys 202 | GROQ_API_KEY=your_groq_api_key 203 | OPENAI_API_KEY=your_openai_api_key 204 | DEEPSEEK_API_KEY=your_deepseek_api_key 205 | ANTHROPIC_API_KEY=your_anthropic_api_key 206 | 207 | # Database credentials (for schema validation) 208 | DB_HOST=localhost 209 | DB_PORT=5432 210 | DB_NAME=mydatabase 211 | DB_USER=username 212 | DB_PASSWORD=password 213 | ``` 214 | 215 | > **Tip:** [Groq](https://console.groq.com/keys) offers free tokens without requiring payment details, making it ideal for getting started quickly. 216 | 217 | ## Advanced Usage 218 | 219 | ### Database Schema Matching 220 | 221 | SQLDeps allows the user to match SQLDeps results (table/column dependencies and outputs) with database schemas to retrieve column data types. 222 | 223 | ```python 224 | from sqldeps.database import PostgreSQLConnector 225 | from sqldeps.llm_parsers import create_extractor 226 | 227 | # Extract dependencies 228 | extractor = create_extractor(model="openai/gpt-4.1-mini") 229 | result = extractor.extract_from_file('query.sql') 230 | 231 | # Connect to database and validate 232 | conn = PostgreSQLConnector( 233 | host="localhost", 234 | port=5432, 235 | database="mydatabase", 236 | username="username" 237 | ) 238 | 239 | # Match extracted dependencies against database schema 240 | matching_schema = extractor.match_database_schema( 241 | result, 242 | db_connection=conn, 243 | target_schemas=["public", "sales"] 244 | ) 245 | 246 | # View validation results as a pandas DataFrame 247 | print(matching_schema) 248 | ``` 249 | 250 | For custom database YAML configuration file (optional): 251 | 252 | ```yml 253 | # database.yml 254 | database: 255 | host: localhost 256 | port: 5432 257 | database: mydatabase 258 | username: username 259 | password: password 260 | ``` 261 | 262 | ### Using Custom Prompts 263 | 264 | You can customize the prompts used to instruct the LLM: 265 | 266 | ```python 267 | # Create extractor with custom prompt 268 | extractor = create_extractor( 269 | model="groq/llama-3.3-70b-versatile", 270 | prompt_path="path/to/custom_prompt.yml" 271 | ) 272 | ``` 273 | 274 | The custom prompt YAML should include: 275 | 276 | ```yaml 277 | system_prompt: | 278 | You are a SQL analyzer that extracts two key elements from SQL queries: 279 | 280 | 1. DEPENDENCIES: Tables and columns that must exist BEFORE query execution. 281 | 2. OUTPUTS: Tables and columns permanently CREATED or MODIFIED by the query. 282 | 283 | # Add detailed instructions for the LLM here... 
284 | 285 | user_prompt: | 286 | Extract SQL dependencies and outputs from this query: 287 | {sql} 288 | ``` 289 | 290 | ### Interactive Visualization of SQL Dependency Graphs 291 | 292 | SQLDeps provides built-in visualization capabilities to help you understand complex SQL dependencies: 293 | 294 | ```python 295 | from sqldeps.llm_parsers import create_extractor 296 | from sqldeps.visualization import visualize_sql_dependencies 297 | 298 | # Create an interactive network graph from multiple SQL files 299 | extractor = create_extractor() 300 | sql_profiles = extractor.extract_from_folder("path/to/folder", recursive=False) 301 | 302 | # Generate an interactive visualization (saving output to an HTML file) 303 | figure = visualize_sql_dependencies(sql_profiles, output_path="dependencies.html") 304 | 305 | # Show figure 306 | figure.show() 307 | ``` 308 | 309 | ## Documentation 310 | 311 | For comprehensive documentation, including API reference and examples, visit [https://sqldeps.readthedocs.io](https://sqldeps.readthedocs.io/). 312 | 313 | ## Contributing 314 | 315 | Contributions are welcome! 316 | 317 | - Found a bug? Please [open an issue](https://github.com/glue-lab/sqldeps/issues) with detailed information. 318 | - Missing a feature? Feel free to [suggest enhancements](https://github.com/glue-lab/sqldeps/discussions/categories/ideas) or submit a pull request. 319 | 320 | Please check out the [Contributing Guide](https://sqldeps.readthedocs.io/en/latest/contributing/) for details. 321 | 322 | 323 | ## License 324 | 325 | MIT 326 | -------------------------------------------------------------------------------- /configs/database.yml: -------------------------------------------------------------------------------- 1 | database: 2 | host: xx.xx.xx.xx 3 | port: 5432 4 | database: database_name 5 | username: username 6 | password: password 7 | -------------------------------------------------------------------------------- /configs/prompts/default.yml: -------------------------------------------------------------------------------- 1 | system_prompt: | 2 | You are a SQL analyzer that extracts two key elements from SQL queries: 3 | 4 | 1. DEPENDENCIES: Tables and columns that must exist BEFORE query execution. 5 | - Source tables in `FROM`, `JOIN`, CTEs, subqueries, etc. 6 | - ALL target tables in operations like `INSERT`, `UPDATE`, `DELETE`, `TRUNCATE` must be included in dependencies. 7 | - Referenced columns in `SELECT`, `WHERE`, `CASE`, `JOIN`, `GROUP BY`, `HAVING`, `ORDER BY`, etc. 8 | - Columns used in expressions, `CASE` statements, and aggregate functions. 9 | 10 | 2. OUTPUTS: Tables and columns permanently CREATED or MODIFIED by the query. 11 | - Tables modified with `INSERT`, `UPDATE`, `DELETE`, `TRUNCATE`. 12 | - Target columns in these operations. 13 | - Tables created with `CREATE TABLE`. 14 | 15 | KEY RULES (CRITICAL): 16 | - ALL target tables (INSERT, UPDATE, DELETE, TRUNCATE) MUST appear in BOTH dependencies AND outputs. 17 | - Example: For `INSERT INTO table_x (col1, col2) VALUES (1, 2)` 18 | → Dependencies: `{"table_x": ["col1", "col2"]}` 19 | → Outputs: `{"table_x": ["col1", "col2"]}` 20 | - Example: For `TRUNCATE TABLE table_x` 21 | → Dependencies: `{"table_x": []}` 22 | → Outputs: `{"table_x": []}` 23 | - This applies even inside CTEs, functions or stored procedures. 24 | - EXCEPTIONS: 25 | - If a table is CREATED in the same query (CREATE TABLE), it appears ONLY in outputs. 
26 | - If a table is used in INSERT statement after TRUNCATE, include the specified columns for both dependencies and outputs instead of an empty list. 27 | - ALWAYS include schema-qualified tables (e.g., `schema.table`) in both dependencies and outputs, preserving the schema name. 28 | 29 | COLUMN HANDLING: 30 | - Explicit `SELECT *` should return ["*"] in dependencies 31 | - Functions like COUNT(*) with specific names, do NOT use ["*"], only include explicitly named columns 32 | - Example: For `SELECT COUNT(*), name FROM users`, dependencies would include `{"users": ["name"]}` 33 | - INSERT without column list creates dependencies on all columns in the target table: ["*"] 34 | 35 | CTE HANDLING: 36 | - CTEs (WITH queries) are temporary structures and should NOT be included as dependencies or outputs themselves. 37 | - However, include tables and columns used within CTEs that originated outside, since they are dependencies. 38 | - Example: 39 | ```sql 40 | WITH cte AS (SELECT * FROM table_x) 41 | INSERT INTO table_y SELECT * FROM cte 42 | ``` 43 | → Dependencies: `{"table_x": ["*"], "table_y": ["*"]}` 44 | → Outputs: `{"table_y": ["*"]}` 45 | 46 | SCHEMA-QUALIFIED TABLES: 47 | - Always preserve schema names exactly as they appear in the query. 48 | - Example: For `INSERT INTO schema_a.table_x SELECT col1,col2 FROM schema_b.table_y` 49 | → Dependencies: `{"schema_a.table_x": ["*"], "schema_b.table_y": ["col1","col2"]}` 50 | → Outputs: `{"schema_a.table_x": ["*"]}` 51 | 52 | FUNCTION & PROCEDURE HANDLING: 53 | - Even inside functions or stored procedures, any `INSERT`, `UPDATE`, `DELETE`, or `TRUNCATE` statements affect real tables and must be included as dependencies and outputs. 54 | 55 | ADDITIONAL CONSIDERATIONS: 56 | - Resolve table aliases to real table names. 57 | - `CASE` expressions → dependencies on all examined columns. 58 | - `MERGE`/`UPSERT` → both dependencies and outputs. 59 | - Ignore variables and parameters as dependencies. 60 | 61 | OUTPUT JSON FORMAT: 62 | { 63 | "dependencies": {"table_name": ["column1", "column2"]}, 64 | "outputs": {"table_name": ["column1", "column2"]} 65 | } 66 | 67 | user_prompt: | 68 | Extract SQL dependencies (tables/columns needed BEFORE execution) and outputs (tables/columns CREATED or MODIFIED) from this query. 
69 | 70 | Respond ONLY with JSON in this exact format: 71 | {{ 72 | "dependencies": {{"table_name": ["column1", "column2"]}}, 73 | "outputs": {{"table_name": ["column1", "column2"]}} 74 | }} 75 | 76 | SQL query to analyze: 77 | {sql} 78 | -------------------------------------------------------------------------------- /data/examples/example.sql: -------------------------------------------------------------------------------- 1 | -- Common Table Expression (CTE) to count user orders for active users 2 | WITH user_orders AS ( 3 | SELECT o.user_id, COUNT(*) AS order_count 4 | FROM orders o 5 | JOIN users u ON o.user_id = u.id 6 | WHERE u.status = 'active' 7 | GROUP BY o.user_id 8 | ) 9 | 10 | -- Create a new table from the CTE 11 | CREATE TABLE transactions.user_order_summary AS 12 | SELECT * FROM user_orders; 13 | 14 | -- Truncate an existing table before repopulating 15 | TRUNCATE TABLE order_summary; 16 | -------------------------------------------------------------------------------- /data/examples/folders_with_sql_files/example1.sql: -------------------------------------------------------------------------------- 1 | -- Simple query selecting a subset of columns 2 | SELECT id, name FROM users -------------------------------------------------------------------------------- /data/examples/folders_with_sql_files/example2.sql: -------------------------------------------------------------------------------- 1 | -- Simple query selecting all columns 2 | SELECT * FROM users LIMIT 100 -------------------------------------------------------------------------------- /data/examples/folders_with_sql_files/random_file.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/glue-lab/sqldeps/66ae7e1313f767417f2dc83e7d766ff478168082/data/examples/folders_with_sql_files/random_file.txt -------------------------------------------------------------------------------- /data/examples/folders_with_sql_files/subfolder1/example3.sql: -------------------------------------------------------------------------------- 1 | -- Query with table alias, with and without database specification, and join 2 | SELECT u.id, u.name, o.order_id 3 | FROM my_db.users u 4 | JOIN orders AS o ON u.id = o.user_id -------------------------------------------------------------------------------- /data/examples/folders_with_sql_files/subfolder1/example3_expected.json: -------------------------------------------------------------------------------- 1 | { 2 | "tables": ["my_db.users", "orders"], 3 | "columns": { 4 | "my_db.users": ["id", "name"], 5 | "orders": ["order_id", "user_id"] 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /data/examples/folders_with_sql_files/subfolder2/random_file.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/glue-lab/sqldeps/66ae7e1313f767417f2dc83e7d766ff478168082/data/examples/folders_with_sql_files/subfolder2/random_file.txt -------------------------------------------------------------------------------- /data/examples/folders_with_sql_files/subfolder2/subfolder2_1/example4.sql: -------------------------------------------------------------------------------- 1 | -- Query with table alias, with and without database specification, and join, and where clauses 2 | SELECT u.id, u.name, o.order_id 3 | FROM my_db.users u 4 | JOIN orders AS o ON u.id = o.user_id 5 | WHERE u.status = 'active' 6 | AND o.order_date >= '2024-01-01' 7 | 
AND o.total_amount > 100.00 8 | AND u.email LIKE '%@company.com' 9 | AND o.order_type IN ('retail', 'wholesale') 10 | AND ( 11 | o.shipping_status = 'pending' 12 | OR (o.shipping_status = 'processed' AND o.priority_level = 'high') 13 | ); -------------------------------------------------------------------------------- /data/examples/folders_with_sql_files/subfolder2/subfolder2_1/example5.sql: -------------------------------------------------------------------------------- 1 | -- Simple CTE 2 | WITH user_orders AS ( 3 | SELECT user_id, COUNT(*) as order_count 4 | FROM orders 5 | GROUP BY user_id 6 | ) 7 | SELECT u.name, uo.order_count 8 | FROM users u 9 | JOIN user_orders uo ON u.id = uo.user_id; -------------------------------------------------------------------------------- /data/examples/folders_with_sql_files/subfolder2/subfolder2_1/random_file.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/glue-lab/sqldeps/66ae7e1313f767417f2dc83e7d766ff478168082/data/examples/folders_with_sql_files/subfolder2/subfolder2_1/random_file.yml -------------------------------------------------------------------------------- /docs/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/glue-lab/sqldeps/66ae7e1313f767417f2dc83e7d766ff478168082/docs/.gitkeep -------------------------------------------------------------------------------- /docs/api-reference/app.md: -------------------------------------------------------------------------------- 1 | # Web App Reference 2 | 3 | ::: sqldeps.app 4 | -------------------------------------------------------------------------------- /docs/api-reference/cache.md: -------------------------------------------------------------------------------- 1 | # Cache Reference 2 | 3 | ::: sqldeps.cache 4 | -------------------------------------------------------------------------------- /docs/api-reference/cli.md: -------------------------------------------------------------------------------- 1 | # Command Line Interface 2 | 3 | ::: sqldeps.cli 4 | -------------------------------------------------------------------------------- /docs/api-reference/config.md: -------------------------------------------------------------------------------- 1 | # Config Reference 2 | 3 | ::: sqldeps.config 4 | -------------------------------------------------------------------------------- /docs/api-reference/database.md: -------------------------------------------------------------------------------- 1 | # Database Reference 2 | 3 | ::: sqldeps.database 4 | -------------------------------------------------------------------------------- /docs/api-reference/llm-parsers.md: -------------------------------------------------------------------------------- 1 | # LLM Parsers Reference 2 | 3 | ::: sqldeps.llm_parsers 4 | -------------------------------------------------------------------------------- /docs/api-reference/models.md: -------------------------------------------------------------------------------- 1 | # Models Reference 2 | 3 | ::: sqldeps.models 4 | -------------------------------------------------------------------------------- /docs/api-reference/parallel.md: -------------------------------------------------------------------------------- 1 | # Parallelization Reference 2 | 3 | ::: sqldeps.parallel 4 | -------------------------------------------------------------------------------- /docs/api-reference/rate-limiter.md: 
-------------------------------------------------------------------------------- 1 | # Rate Limiter Reference 2 | 3 | ::: sqldeps.rate_limiter 4 | -------------------------------------------------------------------------------- /docs/api-reference/utils.md: -------------------------------------------------------------------------------- 1 | # Utils Reference 2 | 3 | ::: sqldeps.utils 4 | -------------------------------------------------------------------------------- /docs/api-reference/visualization.md: -------------------------------------------------------------------------------- 1 | # Interactive Graphs of SQL Dependency 2 | 3 | ::: sqldeps.visualization 4 | -------------------------------------------------------------------------------- /docs/assets/images/sqldeps_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/glue-lab/sqldeps/66ae7e1313f767417f2dc83e7d766ff478168082/docs/assets/images/sqldeps_logo.png -------------------------------------------------------------------------------- /docs/authors.md: -------------------------------------------------------------------------------- 1 | # Authors and Maintainers 2 | 3 | ## Core Team 4 | 5 | ### Primary Author 6 | - **Cainã Max Couto da Silva** - *Initial work and primary developer* 7 | - [:fontawesome-brands-github:](https://github.com/cmcouto-silva) [GitHub](https://github.com/cmcouto-silva) | [:fontawesome-brands-linkedin:](https://www.linkedin.com/in/cmcouto-silva/) [LinkedIn](https://www.linkedin.com/in/cmcouto-silva/) | [:material-email:](mailto:coutodasilva@wisc.edu) coutodasilva@wisc.edu 8 | - [:material-school:](https://gibbs-lab.wisc.edu/) Global Land Use and Environment Lab, UW-Madison 9 | 10 | ### Project Lead 11 | - **Matt Christie** - *Project supervision and code review* 12 | - [:fontawesome-brands-github:](https://github.com/mjchristie) [GitHub](https://github.com/mjchristie) | [:material-email:](mailto:mjchristie@wisc.edu) mjchristie@wisc.edu 13 | - [:material-school:](https://gibbs-lab.wisc.edu/) Global Land Use and Environment Lab, UW-Madison 14 | 15 | ## Institutional Support 16 | 17 | SQLDeps was developed as part of research and development work at the University of Wisconsin-Madison's Nelson Institute for Environmental Studies, specifically within the [Global Land Use and Environment (GLUE) Lab](https://gibbs-lab.wisc.edu/). 18 | 19 | The project aims to support data analysis workflows and improve database management practices. 20 | 21 | ## Contributors 22 | 23 | We appreciate all contributions to SQLDeps! 24 | If you contribute to this project, your name will be added here. 25 | 26 | ## Becoming a Contributor 27 | 28 | Interested in contributing to SQLDeps? 29 | Please check the [Contributing Guide](contributing.md) for details on how to get started. 30 | 31 | ## Contact 32 | 33 | For questions about the project, please open an issue on GitHub or contact the maintainers directly via email. 
34 | -------------------------------------------------------------------------------- /docs/changelog.md: -------------------------------------------------------------------------------- 1 | --8<-- "CHANGELOG.md" 2 | -------------------------------------------------------------------------------- /docs/contributing.md: -------------------------------------------------------------------------------- 1 | --8<-- "CONTRIBUTING.md" 2 | -------------------------------------------------------------------------------- /docs/docs-requirements.txt: -------------------------------------------------------------------------------- 1 | mkdocs==1.6.1 2 | mkdocs-material==9.6.9 3 | pymdown-extensions==10.14.3 4 | mkdocstrings[python]==0.29.1 5 | -------------------------------------------------------------------------------- /docs/examples.md: -------------------------------------------------------------------------------- 1 | # Examples 2 | 3 | Here are some practical examples of using SQLDeps for different use cases. 4 | 5 | ## Basic Dependency Extraction 6 | 7 | ### Example 1: Simple SELECT Query 8 | 9 | ```sql 10 | -- example1.sql 11 | SELECT u.id, u.name, o.order_id, o.amount 12 | FROM users u 13 | JOIN orders o ON u.id = o.user_id 14 | WHERE u.status = 'active' 15 | ``` 16 | 17 | Extracted dependencies: 18 | 19 | ```json 20 | { 21 | "dependencies": { 22 | "users": ["id", "name", "status"], 23 | "orders": ["order_id", "amount", "user_id"] 24 | }, 25 | "outputs": {} 26 | } 27 | ``` 28 | 29 | ### Example 2: CTEs and Table Creation 30 | 31 | ```sql 32 | -- example2.sql 33 | WITH user_orders AS ( 34 | SELECT o.user_id, COUNT(*) AS order_count 35 | FROM orders o 36 | JOIN users u ON o.user_id = u.id 37 | WHERE u.status = 'active' 38 | GROUP BY o.user_id 39 | ) 40 | 41 | CREATE TABLE transactions.user_order_summary AS 42 | SELECT * FROM user_orders; 43 | ``` 44 | 45 | Extracted dependencies: 46 | 47 | ```json 48 | { 49 | "dependencies": { 50 | "orders": ["user_id"], 51 | "users": ["id", "status"] 52 | }, 53 | "outputs": { 54 | "transactions.user_order_summary": ["*"] 55 | } 56 | } 57 | ``` 58 | 59 | ### Example 3: UPDATE Operation 60 | 61 | ```sql 62 | -- example3.sql 63 | UPDATE users 64 | SET status = 'inactive' 65 | WHERE last_login < CURRENT_DATE - INTERVAL '90 days' 66 | AND status = 'active'; 67 | ``` 68 | 69 | Extracted dependencies: 70 | 71 | ```json 72 | { 73 | "dependencies": { 74 | "users": ["status", "last_login"] 75 | }, 76 | "outputs": { 77 | "users": ["status"] 78 | } 79 | } 80 | ``` 81 | 82 | ### Example 4: INSERT Operation 83 | 84 | ```sql 85 | -- example4.sql 86 | INSERT INTO sales.order_summary (date, total_orders, total_amount) 87 | SELECT 88 | DATE_TRUNC('day', order_date) as date, 89 | COUNT(*) as total_orders, 90 | SUM(amount) as total_amount 91 | FROM orders 92 | GROUP BY DATE_TRUNC('day', order_date); 93 | ``` 94 | 95 | Extracted dependencies: 96 | 97 | ```json 98 | { 99 | "dependencies": { 100 | "orders": ["order_date", "amount"], 101 | "sales.order_summary": ["date", "total_orders", "total_amount"] 102 | }, 103 | "outputs": { 104 | "sales.order_summary": ["date", "total_orders", "total_amount"] 105 | } 106 | } 107 | ``` -------------------------------------------------------------------------------- /docs/getting-started/installation.md: -------------------------------------------------------------------------------- 1 | # Installation 2 | 3 | ## Prerequisites 4 | 5 | - Python 3.10 or higher 6 | - API keys for your preferred LLM provider (Groq, OpenAI, or DeepSeek) 7 | 8 | ## 
Install from PyPI 9 | 10 | The simplest way to install SQLDeps is via pip: 11 | 12 | ```bash 13 | pip install sqldeps 14 | ``` 15 | 16 | For additional functionality, you can install optional dependencies: 17 | 18 | ```bash 19 | # Install with web app dependencies 20 | pip install "sqldeps[app]" 21 | 22 | # Install with data visualization dependencies 23 | pip install "sqldeps[dataviz]" 24 | 25 | # Install all optional dependencies 26 | pip install "sqldeps[app,postgres,dataviz]" 27 | ``` 28 | 29 | ## Setup API Keys 30 | 31 | SQLDeps requires API keys for the LLM providers you want to use. These keys are set through environment variables. 32 | 33 | ### Environment Variables 34 | 35 | Create a `.env` file in your project root with your API keys: 36 | 37 | ``` 38 | # LLM API Keys 39 | GROQ_API_KEY=your_groq_api_key 40 | OPENAI_API_KEY=your_openai_api_key 41 | DEEPSEEK_API_KEY=your_deepseek_api_key 42 | ANTHROPIC_API_KEY=your_anthropic_api_key 43 | 44 | # Optional: Database credentials (for schema validation) 45 | DB_HOST=localhost 46 | DB_PORT=5432 47 | DB_NAME=mydatabase 48 | DB_USER=username 49 | DB_PASSWORD=password 50 | ``` 51 | 52 | SQLDeps will automatically load variables from the .env file when you import the package. 53 | 54 | > **Tip:** [Groq](https://console.groq.com/keys) offers free tokens without requiring payment details, making it ideal for getting started quickly. 55 | 56 | ## Database Configuration (Optional) 57 | 58 | If you plan to use the database features, you can set up your database credentials in several ways: 59 | 60 | ### YAML Configuration File 61 | 62 | ```yaml 63 | # database.yml 64 | database: 65 | host: localhost 66 | port: 5432 67 | database: mydatabase 68 | username: username 69 | password: password 70 | ``` 71 | 72 | ### Environment Variables 73 | 74 | ``` 75 | DB_HOST=localhost 76 | DB_PORT=5432 77 | DB_NAME=mydatabase 78 | DB_USER=username 79 | DB_PASSWORD=password 80 | ``` 81 | 82 | ### PostgreSQL Password File 83 | 84 | SQLDeps also supports reading credentials from the standard PostgreSQL password file (`~/.pgpass`). 85 | 86 | ## Verify Installation 87 | 88 | You can verify your installation by running: 89 | 90 | ```bash 91 | sqldeps --help 92 | ``` 93 | 94 | This should display the command-line help information for SQLDeps. 95 | -------------------------------------------------------------------------------- /docs/getting-started/quick-start.md: -------------------------------------------------------------------------------- 1 | # Quick Start 2 | 3 | SQLDeps provides both API and CLI interfaces for extracting dependencies from SQL queries. 
4 | 5 | ## API Usage 6 | 7 | ```python 8 | from sqldeps.llm_parsers import create_extractor 9 | 10 | # Create extractor with default settings (framework="litellm", model="openai/gpt-4.1") 11 | extractor = create_extractor() 12 | 13 | # Extract dependencies and outputs from a SQL query 14 | sql_query = """ 15 | WITH user_orders AS ( 16 | SELECT o.user_id, COUNT(*) AS order_count 17 | FROM orders o 18 | JOIN users u ON o.user_id = u.id 19 | WHERE u.status = 'active' 20 | GROUP BY o.user_id 21 | ) 22 | 23 | CREATE TABLE transactions.user_order_summary AS 24 | SELECT * FROM user_orders; 25 | """ 26 | result = extractor.extract_from_query(sql_query) 27 | 28 | # Print the results 29 | print("Dependencies:") 30 | print(result.dependencies) 31 | print("\nOutputs:") 32 | print(result.outputs) 33 | 34 | # Or extract from a file 35 | result = extractor.extract_from_file('path/to/query.sql') 36 | 37 | # Convert to dictionary or DataFrame 38 | dict_format = result.to_dict() 39 | df_format = result.to_dataframe() 40 | ``` 41 | 42 | ## CLI Usage 43 | 44 | ```bash 45 | # Basic example with default settings 46 | sqldeps extract path/to/query.sql 47 | 48 | # Specify framework and output format 49 | sqldeps extract path/to/query.sql --framework=litellm --model=gpt-4.1-mini -o results.json 50 | 51 | # Scan a folder recursively with intelligent parallelization 52 | sqldeps extract \ 53 | data/sql_folder \ # Automatically detect if path is file or folder 54 | --recursive \ # Scan folder recursively 55 | --framework=deepseek \ # Specify framework/provider 56 | --rpm 50 # Maximum 50 requests per minute 57 | --n-workers -1 \ # Use all available processors 58 | -o results.csv # Output a dataframe as CSV instead of JSON 59 | ``` 60 | 61 | ```bash 62 | # Get help on available commands 63 | sqldeps --help 64 | 65 | # Get help on extract - the main command 66 | sqldeps extract --help 67 | ``` 68 | 69 | ## Web Application 70 | 71 | SQLDeps includes a Streamlit-based web interface: 72 | 73 | ```bash 74 | # Run the web app 75 | sqldeps app 76 | ``` 77 | 78 | **Note**: The web application is designed for single-file extraction and demonstration purposes. For processing multiple files or entire folders, use the API or CLI instead. 
79 | 80 | ## Example 81 | 82 | Given this SQL query: 83 | 84 | ```sql 85 | -- Common Table Expression (CTE) to count user orders for active users 86 | WITH user_orders AS ( 87 | SELECT o.user_id, COUNT(*) AS order_count 88 | FROM orders o 89 | JOIN users u ON o.user_id = u.id 90 | WHERE u.status = 'active' 91 | GROUP BY o.user_id 92 | ) 93 | 94 | -- Create a new table from the CTE 95 | CREATE TABLE transactions.user_order_summary AS 96 | SELECT * FROM user_orders; 97 | ``` 98 | 99 | SQLDeps will extract: 100 | 101 | ```json 102 | { 103 | "dependencies": { 104 | "orders": ["user_id"], 105 | "users": ["id", "status"] 106 | }, 107 | "outputs": { 108 | "transactions.user_order_summary": ["*"] 109 | } 110 | } 111 | ``` 112 | 113 | Notice how: 114 | 115 | - CTE (`user_orders`) is correctly excluded 116 | - Real source tables (`orders`, `users`) are included as dependencies 117 | - Target table (`transactions.user_order_summary`) is correctly identified as output 118 | 119 | ## Next Steps 120 | 121 | - Read the [API Usage](../user-guide/api-usage.md) guide for detailed API options 122 | - Read the [CLI Usage](../user-guide/cli-usage.md) for easy-to-use command-line features 123 | - Explore [Database Integration](../user-guide/database-integration.md) for schema validation and data type retrieval 124 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | # SQLDeps: SQL Dependency Extractor 2 | 3 |

4 | SQLDeps Logo 5 |

6 | 7 | 12 | 13 | A powerful tool that automatically extracts and maps SQL dependencies and outputs using Large Language Models (LLMs). 14 | 15 | ## Overview 16 | 17 | SQLDeps analyzes SQL scripts to identify: 18 | 19 | 1. **Dependencies**: Tables and columns that must exist BEFORE query execution 20 | 2. **Outputs**: Tables and columns permanently CREATED or MODIFIED by the query 21 | 22 | It intelligently filters out temporary constructs like CTEs and derived tables, focusing only on the real database objects that matter. 23 | 24 | ## Key Benefits 25 | 26 | - 🛠️ **Change Management:** Safely modify schemas by identifying true dependencies 27 | - 💾 **Storage Optimization:** Focus resources on essential tables and columns 28 | - 🚢 **Migration Planning:** Precisely determine what needs to be migrated 29 | - 📝 **Project Documentation:** Create comprehensive maps of database dependencies 30 | 31 | ## Why SQLDeps? 32 | 33 | Traditional approaches to SQL dependency tracking: 34 | 35 | - **Manual Inspection**: Time-consuming and error-prone 36 | - **Traditional Parsers**: Lack context and intelligence for complex SQL 37 | 38 | SQLDeps leverages the power of Large Language Models to provide intelligent, accurate dependency extraction that works across SQL dialects and complexity levels. 39 | 40 | ## Supported LLM Providers 41 | 42 | All models available on [LiteLLM](https://docs.litellm.ai/docs/providers), including [Groq](https://console.groq.com/docs/models), [OpenAI](https://platform.openai.com/docs/models), and [DeepSeek](https://api-docs.deepseek.com/). 43 | 44 | ## Getting Started 45 | 46 | ```bash 47 | # Install the package 48 | pip install sqldeps 49 | 50 | # Basic usage 51 | sqldeps extract path/to/query.sql 52 | ``` 53 | 54 | Check out the [Quick Start](getting-started/quick-start.md) guide for more details. 55 | 56 | -------------------------------------------------------------------------------- /docs/stylesheets/custom.css: -------------------------------------------------------------------------------- 1 | /* Primary theme color customization */ 2 | :root { 3 | --md-primary-fg-color: #bf050b; 4 | --md-primary-fg-color--light: #e01b22; 5 | --md-primary-fg-color--dark: #8f0408; 6 | 7 | /* Ensure text has sufficient contrast */ 8 | --md-primary-bg-color: #ffffff; 9 | --md-primary-bg-color--light: #ffffff; 10 | } 11 | 12 | /* Make sure text on primary color background is visible */ 13 | .md-header { 14 | color: var(--md-primary-bg-color); 15 | } 16 | 17 | /* Direct link color overrides */ 18 | .md-typeset a { 19 | color: #0066cc !important; 20 | } 21 | .md-typeset a:hover { 22 | color: #111111 !important; 23 | } 24 | 25 | .md-typeset a:hover { 26 | text-decoration: underline; 27 | } 28 | -------------------------------------------------------------------------------- /docs/user-guide/api-usage.md: -------------------------------------------------------------------------------- 1 | # API Usage 2 | 3 | SQLDeps provides a comprehensive Python API for extracting SQL dependencies and validating them against database schemas.
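At a glance, a typical workflow looks like the sketch below (connection details are placeholders); each step is covered in detail in the sections that follow:

```python
from sqldeps.database import PostgreSQLConnector
from sqldeps.llm_parsers import create_extractor

# 1. Create an extractor (default framework and model)
extractor = create_extractor()

# 2. Extract dependencies and outputs from a SQL file (illustrative path)
result = extractor.extract_from_file("path/to/query.sql")

# 3. Optionally validate the extraction against a live database schema
db_conn = PostgreSQLConnector(
    host="localhost",
    port=5432,
    database="mydatabase",
    username="username",  # password resolved from .pgpass or environment variables
)
validated_schema = extractor.match_database_schema(result, db_connection=db_conn)
```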
4 | 5 | ## Creating an Extractor 6 | 7 | The main entry point for using SQLDeps is the `create_extractor()` function: 8 | 9 | ```python 10 | from sqldeps.llm_parsers import create_extractor 11 | 12 | # Create extractor with default settings (framework="litellm", model="openai/gpt-4.1") 13 | extractor = create_extractor() 14 | 15 | # Specify a different framework and model 16 | extractor = create_extractor( 17 | framework="openai", 18 | model="gpt-4o" 19 | ) 20 | 21 | # Specify additional parameters for the LLM 22 | extractor = create_extractor( 23 | framework="litellm", 24 | model="openai/gpt-4.1-mini", 25 | params={"temperature": 0.1} 26 | ) 27 | 28 | # Use a custom prompt template 29 | extractor = create_extractor( 30 | framework="deepseek", 31 | model="deepseek-chat", 32 | prompt_path="path/to/custom_prompt.yml" 33 | ) 34 | ``` 35 | 36 | Note that the API keys should be set through environment variables as explained in the [Installation](../getting-started/installation.md) guide. 37 | 38 | ## Extracting Dependencies 39 | 40 | Once you have an extractor, you can use it to extract dependencies from SQL queries, files, or folders: 41 | 42 | ### From a Query String 43 | 44 | ```python 45 | # Extract from a SQL query string 46 | sql_query = """ 47 | SELECT u.id, u.name, o.order_id, o.amount 48 | FROM users u 49 | JOIN orders o ON u.id = o.user_id 50 | WHERE u.status = 'active' 51 | """ 52 | 53 | result = extractor.extract_from_query(sql_query) 54 | ``` 55 | 56 | ### From a File 57 | 58 | ```python 59 | # Extract from a SQL file 60 | result = extractor.extract_from_file("path/to/query.sql") 61 | ``` 62 | 63 | ### From a Folder 64 | 65 | ```python 66 | # Extract from all SQL files in a folder 67 | result = extractor.extract_from_folder("path/to/sql_folder") 68 | 69 | # Extract recursively from all SQL files in a folder and subfolders 70 | result = extractor.extract_from_folder("path/to/sql_folder", recursive=True) 71 | 72 | # Extract from files with specific extensions 73 | result = extractor.extract_from_folder( 74 | "path/to/sql_folder", 75 | recursive=True, 76 | valid_extensions={"sql", "pgsql", "tsql"} 77 | ) 78 | 79 | # Process with parallel workers (uses all available CPUs) 80 | result = extractor.extract_from_folder( 81 | "path/to/sql_folder", 82 | recursive=True, 83 | n_workers=-1, # -1 means all available workers 84 | rpm=100 # Rate limit to 100 requests per minute 85 | ) 86 | 87 | # Merge results into a single SQLProfile 88 | result = extractor.extract_from_folder( 89 | "path/to/sql_folder", 90 | recursive=True, 91 | merge_sql_profiles=True 92 | ) 93 | ``` 94 | 95 | ## Working with Results 96 | 97 | The `extract_*` methods return a `SQLProfile` object that contains the extracted dependencies and outputs: 98 | 99 | ```python 100 | # Access dependencies and outputs as dictionaries 101 | dependencies = result.dependencies # Dict of tables and their columns 102 | outputs = result.outputs # Dict of tables and columns created or modified 103 | 104 | # Get a list of all referenced tables 105 | tables = result.dependency_tables 106 | 107 | # Get a list of all output tables 108 | output_tables = result.outcome_tables 109 | 110 | # Convert to a dictionary 111 | result_dict = result.to_dict() 112 | 113 | # Convert to a DataFrame for easier analysis 114 | result_df = result.to_dataframe() 115 | ``` 116 | 117 | ## Database Schema Validation 118 | 119 | You can validate the extracted dependencies against a real database schema: 120 | 121 | ```python 122 | from sqldeps.database import 
PostgreSQLConnector 123 | 124 | # Connect to the database 125 | db_conn = PostgreSQLConnector( 126 | host="localhost", 127 | port=5432, 128 | database="mydatabase", 129 | username="user" 130 | # Password from .pgpass or environment variables 131 | ) 132 | 133 | # Match extracted dependencies against database schema 134 | validated_schema = extractor.match_database_schema( 135 | result, # The SQLProfile from extraction 136 | db_connection=db_conn, 137 | target_schemas=["public", "sales"] # Optional: schemas to validate against 138 | ) 139 | 140 | # The result is a DataFrame with database schema information 141 | print(validated_schema) 142 | 143 | # Filter for exact matches 144 | exact_matches = validated_schema[validated_schema["exact_match"]] 145 | 146 | # Filter for schema-agnostic matches or cross-schema matches 147 | missing_deps = validated_schema[~validated_schema["exact_match"]] 148 | ``` 149 | 150 | ## Custom Prompts 151 | 152 | You can create custom prompts to guide the LLM extraction process: 153 | 154 | ```yaml 155 | # custom_prompt.yml 156 | system_prompt: | 157 | You are a SQL analyzer that extracts two key elements from SQL queries: 158 | 159 | 1. DEPENDENCIES: Tables and columns that must exist BEFORE query execution. 160 | 2. OUTPUTS: Tables and columns permanently CREATED or MODIFIED by the query. 161 | 162 | # Add detailed instructions for the LLM here... 163 | 164 | user_prompt: | 165 | Extract SQL dependencies (tables/columns needed BEFORE execution) and outputs 166 | (tables/columns CREATED or MODIFIED) from this query. 167 | 168 | Respond ONLY with JSON in this exact format: 169 | {{ 170 | "dependencies": {{"table_name": ["column1", "column2"]}}, 171 | "outputs": {{"table_name": ["column1", "column2"]}} 172 | }} 173 | 174 | SQL query to analyze: 175 | {sql} 176 | ``` 177 | 178 | Use the custom prompt with: 179 | 180 | ```python 181 | extractor = create_extractor(prompt_path="path/to/custom_prompt.yml") 182 | ``` 183 | 184 | ## Using Cache 185 | 186 | SQLDeps can cache extraction results to avoid reprocessing the same files: 187 | 188 | ```python 189 | # Enable cache (default: True) 190 | result = extractor.extract_from_folder( 191 | "path/to/sql_folder", 192 | recursive=True, 193 | use_cache=True 194 | ) 195 | 196 | # Clear cache after processing 197 | result = extractor.extract_from_folder( 198 | "path/to/sql_folder", 199 | recursive=True, 200 | use_cache=True, 201 | clear_cache=True 202 | ) 203 | ``` 204 | 205 | The cache is stored in the `.sqldeps_cache` directory. 206 | -------------------------------------------------------------------------------- /docs/user-guide/cli-usage.md: -------------------------------------------------------------------------------- 1 | # CLI Usage 2 | 3 | SQLDeps includes a powerful command-line interface for extracting SQL dependencies. 4 | 5 | ## Basic Usage 6 | 7 | The basic command syntax is: 8 | 9 | ```bash 10 | sqldeps extract PATH [OPTIONS] 11 | ``` 12 | 13 | Where `PATH` is the path to a SQL file or directory containing SQL files. 
14 | 15 | ## Common Options 16 | 17 | | Option | Description | 18 | |--------|-------------| 19 | | `--framework` | LLM framework to use (litellm, groq, openai, deepseek) | 20 | | `--model` | Model name within the selected framework | 21 | | `--prompt` | Path to custom prompt YAML file | 22 | | `-r, --recursive` | Recursively scan folder for SQL files | 23 | | `-o, --output` | Output file path (.json or .csv) | 24 | | `--n-workers` | Number of workers for parallel processing (-1 for all CPUs) | 25 | | `--rpm` | Maximum requests per minute for API rate limiting | 26 | | `--use-cache` | Use local cache for SQL extraction results | 27 | | `--clear-cache` | Clear local cache after processing | 28 | 29 | ## Basic Examples 30 | 31 | ```bash 32 | # Basic usage with default settings (litellm openai/gpt-4.1) 33 | sqldeps extract path/to/query.sql 34 | 35 | # Specify a different framework and model 36 | sqldeps extract path/to/query.sql --framework=openai --model=gpt-4.1-mini 37 | 38 | # Process all SQL files in a directory 39 | sqldeps extract path/to/sql_folder 40 | 41 | # Process recursively with a specific output file 42 | sqldeps extract path/to/sql_folder --recursive -o results.csv 43 | 44 | # Use a custom prompt 45 | sqldeps extract path/to/query.sql --prompt=path/to/custom_prompt.yml 46 | 47 | # Use parallel processing with rate limiting 48 | sqldeps extract path/to/sql_folder --recursive --n-workers=-1 --rpm=50 49 | ``` 50 | 51 | ## Database Validation 52 | 53 | SQLDeps can validate extracted dependencies against a real database schema: 54 | 55 | ```bash 56 | # Validate against a database 57 | sqldeps extract path/to/query.sql \ 58 | --db-match-schema \ 59 | --db-target-schemas public,sales \ 60 | --db-credentials path/to/database.yml 61 | ``` 62 | 63 | Database validation options: 64 | 65 | | Option | Description | 66 | |--------|-------------| 67 | | `--db-match-schema` | Enable database schema validation | 68 | | `--db-target-schemas` | Comma-separated list of target schemas | 69 | | `--db-credentials` | Path to database credentials YAML file | 70 | | `--db-dialect` | Database dialect (default: postgresql) | 71 | 72 | ## Output Formats 73 | 74 | SQLDeps supports both JSON and CSV output formats: 75 | 76 | ```bash 77 | # Output as JSON (default) 78 | sqldeps extract path/to/query.sql -o results.json 79 | 80 | # Output as CSV 81 | sqldeps extract path/to/query.sql -o results.csv 82 | ``` 83 | 84 | ## Managing Cache 85 | 86 | SQLDeps provides commands to manage the extraction cache: 87 | 88 | ```bash 89 | # Clear the cache 90 | sqldeps cache clear 91 | ``` 92 | 93 | ## Running the Web App 94 | 95 | SQLDeps includes a Streamlit-based web application: 96 | 97 | ```bash 98 | # Start the web app 99 | sqldeps app 100 | ``` 101 | 102 | ## Advanced Examples 103 | 104 | ```bash 105 | # Complete example with all options 106 | sqldeps extract data/sql_folder \ 107 | --recursive \ 108 | --framework=deepseek \ 109 | --model=deepseek-chat \ 110 | --prompt=configs/prompts/custom.yml \ 111 | --db-match-schema \ 112 | --db-target-schemas public,sales,reporting \ 113 | --db-credentials configs/database.yml \ 114 | --n-workers=10 \ 115 | --rpm=100 \ 116 | --use-cache \ 117 | -o folder_deps.csv 118 | ``` 119 | 120 | ## Help Command 121 | 122 | For a complete list of options, use the help command: 123 | 124 | ```bash 125 | sqldeps --help 126 | 127 | # View help for a specific command 128 | sqldeps extract --help 129 | ``` 130 | 131 | ## Exit Codes 132 | 133 | The CLI will return the following exit codes: 134 | 
135 | - `0`: Success 136 | - `1`: Error (file not found, connection error, extraction failed, etc.) 137 | 138 | ## Integration with Shell Scripts 139 | 140 | SQLDeps can be easily integrated into shell scripts: 141 | 142 | ```bash 143 | #!/bin/bash 144 | 145 | # Process all SQL files in a directory 146 | sqldeps extract sql_files/ --recursive -o results.json 147 | 148 | # Check exit code 149 | if [ $? -eq 0 ]; then 150 | echo "Dependencies extracted successfully." 151 | else 152 | echo "Failed to extract dependencies." 153 | exit 1 154 | fi 155 | 156 | # Process results 157 | cat results.json | jq '.dependencies' 158 | ``` -------------------------------------------------------------------------------- /docs/user-guide/database-integration.md: -------------------------------------------------------------------------------- 1 | # Database Integration 2 | 3 | SQLDeps provides robust database integration for matching extracted dependencies against actual database schemas. 4 | 5 | ## Supported Databases 6 | 7 | Currently, SQLDeps supports: 8 | 9 | - PostgreSQL (primary support) 10 | 11 | ## Database Connection 12 | 13 | ### Using the PostgreSQLConnector 14 | 15 | The `PostgreSQLConnector` class provides a secure way to connect to PostgreSQL databases: 16 | 17 | ```python 18 | from sqldeps.database import PostgreSQLConnector 19 | 20 | # Create a connection using direct parameters 21 | conn = PostgreSQLConnector( 22 | host="localhost", 23 | port=5432, 24 | database="mydatabase", 25 | username="username", 26 | password="password" # Optional, can use .pgpass 27 | ) 28 | 29 | # Alternative: load from YAML config file 30 | conn = PostgreSQLConnector( 31 | config_path="path/to/database.yml" 32 | ) 33 | 34 | # Alternative: use environment variables 35 | # DB_HOST, DB_PORT, DB_NAME, DB_USER, DB_PASSWORD 36 | conn = PostgreSQLConnector() 37 | ``` 38 | 39 | ### Connection Priority 40 | 41 | The connector uses the following priority for connection parameters: 42 | 43 | 1. Direct parameters in constructor 44 | 2. YAML config file 45 | 3. Environment variables 46 | 4. 
.pgpass file (for password only) 47 | 48 | ### Database Configuration YAML 49 | 50 | ```yaml 51 | # database.yml 52 | database: 53 | host: localhost 54 | port: 5432 55 | database: mydatabase 56 | username: username 57 | password: password # Optional 58 | ``` 59 | 60 | ### Environment Variables 61 | 62 | ``` 63 | DB_HOST=localhost 64 | DB_PORT=5432 65 | DB_NAME=mydatabase 66 | DB_USER=username 67 | DB_PASSWORD=password 68 | ``` 69 | 70 | ### PostgreSQL Password File 71 | 72 | SQLDeps supports standard PostgreSQL password file (`~/.pgpass`) format: 73 | 74 | ``` 75 | hostname:port:database:username:password 76 | ``` 77 | 78 | ## Schema Retrieval 79 | 80 | You can directly access database schema information: 81 | 82 | ```python 83 | # Get all schemas 84 | db_schema = conn.get_schema() 85 | 86 | # Get specific schemas 87 | db_schema = conn.get_schema(schemas=["public", "sales"]) 88 | 89 | # Export schema to CSV 90 | conn.export_schema_csv("schema.csv") 91 | ``` 92 | 93 | ## Schema Matching 94 | 95 | ### Using the API 96 | 97 | ```python 98 | from sqldeps.llm_parsers import create_extractor 99 | from sqldeps.database import PostgreSQLConnector 100 | 101 | # Create extractor and extract dependencies 102 | extractor = create_extractor() 103 | dependencies = extractor.extract_from_file("query.sql") 104 | 105 | # Connect to database 106 | conn = PostgreSQLConnector( 107 | host="localhost", 108 | port=5432, 109 | database="mydatabase", 110 | username="username" 111 | ) 112 | 113 | # Match extracted dependencies against database schema 114 | matching_results = extractor.match_database_schema( 115 | dependencies, 116 | db_connection=conn, 117 | target_schemas=["public", "sales"] 118 | ) 119 | 120 | # Analyze database-matching results 121 | exact_matches = matching_results[matching_results["exact_match"]] 122 | agnostic_matches = matching_results[~matching_results["exact_match"]] 123 | 124 | print(f"Found {len(exact_matches)} exact matches.") 125 | print(f"Found {len(agnostic_matches)} schema-agnostic matches.") 126 | ``` 127 | 128 | ### Using the CLI 129 | 130 | ```bash 131 | sqldeps extract path/to/query.sql \ 132 | --db-match-schema \ 133 | --db-target-schemas public,sales \ 134 | --db-credentials configs/database.yml \ 135 | -o db_matching_results.csv 136 | ``` 137 | 138 | ## Matching Results 139 | 140 | The matching results are returned as a pandas DataFrame with these columns: 141 | 142 | | Column | Description | 143 | |--------|-------------| 144 | | `schema` | Database schema name | 145 | | `table` | Table name | 146 | | `column` | Column name | 147 | | `data_type` | Database data type | 148 | | `exact_match` | Boolean indicating if schema name matched exactly | 149 | 150 | ### Interpreting Results 151 | 152 | - `exact_match=True`: The table/column was found in the specified schema 153 | - `exact_match=False`: The table/column does not have a specified schema 154 | - Missing entries: Dependencies that weren't found in the database 155 | 156 | ## Using Schema Information in Applications 157 | 158 | The schema matching results can be used to: 159 | 160 | 1. Identify missing dependencies before executing SQL 161 | 2. Generate data type-aware documentation 162 | 3. Create migration scripts 163 | 4. Highlight potential issues in SQL queries 164 | 5. 
Ensure referential integrity across schemas 165 | 166 | ## Security Considerations 167 | 168 | SQLDeps follows security best practices for database connections: 169 | 170 | - No hardcoded credentials in code 171 | - Support for PostgreSQL password file 172 | - Environment variable support 173 | - Secure parameter handling (parameters are cleared after use) 174 | - Connection timeouts to prevent hanging 175 | -------------------------------------------------------------------------------- /docs/user-guide/visualization.md: -------------------------------------------------------------------------------- 1 | # Interactive Visualization of SQL Dependencies 2 | 3 | SQLDeps provides powerful visualization capabilities to help you understand and explore SQL dependencies across your projects. 4 | 5 | ## Interactive Dependency Graphs 6 | 7 | The `visualize_sql_dependencies()` function creates an interactive network graph that shows the relationships between SQL files, tables, and their dependencies. 8 | 9 | ### Basic Usage 10 | 11 | ```python 12 | from sqldeps.llm_parsers import create_extractor 13 | from sqldeps.visualization import visualize_sql_dependencies 14 | 15 | # Create an interactive network graph from multiple SQL files 16 | extractor = create_extractor() 17 | sql_profiles = extractor.extract_from_folder("path/to/folder", recursive=False) 18 | 19 | # Generate an interactive visualization (saving output to an HTML file) 20 | figure = visualize_sql_dependencies(sql_profiles, output_path="dependencies.html") 21 | 22 | # Show figure on IDE 23 | figure.show() 24 | ``` 25 | 26 | ### Visualization Options 27 | 28 | The `visualize_sql_dependencies()` function offers extensive customization: 29 | 30 | ```python 31 | # See API documentation for more options 32 | # https://sqldeps.readthedocs.io/en/latest/api-reference/visualization 33 | 34 | figure = visualize_sql_dependencies( 35 | dependencies, 36 | output_path="dependencies.html", # Optional: Save to HTML file 37 | show_columns=True, # Show column details in hover text 38 | layout_algorithm="spring", # Layout options: 'spring', 'circular', 'kamada_kawai' 39 | highlight_common_tables=True, # Highlight tables used by multiple files 40 | show_file_text=True, # Show file names 41 | show_table_text=False, # Show table names 42 | color_gradient=True, # Color intensity based on usage frequency 43 | min_file_size=20, # Minimum node size for files 44 | max_file_size=40, # Maximum node size for files 45 | show_text_buttons=True, # Add buttons to toggle text visibility 46 | show_layout_buttons=True # Add buttons to change graph layout 47 | ) 48 | ``` 49 | 50 | ### Visualization Features 51 | 52 | - **Interactive Exploration**: Hover over nodes to see detailed information 53 | - **Dynamic Layout**: Change graph layout with built-in buttons 54 | - **Text Toggle**: Show/hide labels for files and tables 55 | - **Usage Visualization**: 56 | - Node sizes indicate usage frequency 57 | - Color intensity represents how many files use a particular table 58 | - **Dependency Insights**: 59 | - Visualize connections between SQL files and tables 60 | - Identify common tables across multiple files 61 | 62 | ### Example Use Cases 63 | 64 | 1. **Project Dependency Mapping** 65 | ```python 66 | # Map dependencies across an entire project 67 | project_deps = extractor.extract_from_folder( 68 | "path/to/project/sql", 69 | recursive=True 70 | ) 71 | # Plot dependencies 72 | visualize_sql_dependencies(project_deps, output_path="project_deps.html") 73 | ``` 74 | 75 | 2. 
**Focused Analysis** 76 | ```python 77 | # Analyze dependencies for a specific subset of files 78 | subset_deps = extractor.extract_from_folder( 79 | "path/to/specific/sql/folder", 80 | recursive=False 81 | ) 82 | # Plot dependencies 83 | visualize_sql_dependencies(subset_deps, output_path="subset_deps.html") 84 | ``` 85 | 86 | ## Use cases 87 | 88 | You can use the visualization to identify: 89 | 90 | - Shared tables across different files 91 | - Potential refactoring opportunities 92 | - Complex dependency relationships 93 | 94 | > Note: The visualization is best suited for projects with a moderate number of SQL files 95 | -------------------------------------------------------------------------------- /docs/user-guide/web-app.md: -------------------------------------------------------------------------------- 1 | # Web Application 2 | 3 | SQLDeps includes a Streamlit-based web interface for interactive SQL dependency exploration. 4 | 5 | ## Installation 6 | 7 | To use the web application, install SQLDeps with the app dependencies: 8 | 9 | ```bash 10 | pip install "sqldeps[app]" 11 | ``` 12 | 13 | ## Starting the App 14 | 15 | Start the web application with: 16 | 17 | ```bash 18 | # Using the CLI command 19 | sqldeps app 20 | 21 | # Or directly with streamlit 22 | streamlit run -m sqldeps.app.main 23 | ``` 24 | 25 | This will launch the Streamlit app in your default web browser, typically at `http://localhost:8501`. 26 | 27 | ## Using the Web Interface 28 | 29 | The web interface provides an intuitive way to analyze SQL dependencies: 30 | 31 | ### Configuration Panel 32 | 33 | On the left sidebar, you'll find configuration options: 34 | 35 | 1. **Framework Selection**: Choose between Groq, OpenAI, or DeepSeek 36 | 2. **Model Selection**: Select the specific model to use 37 | 3. **Custom Prompt**: Optionally upload a custom prompt YAML file 38 | 4. **Database Connection**: Configure database connection for schema validation 39 | 5. **SQL Input**: Upload a SQL file or enter SQL directly 40 | 41 | ### Analysis Results 42 | 43 | After clicking "Extract Dependencies", the main panel displays: 44 | 45 | 1. **SQL Query**: The formatted SQL query that was analyzed 46 | 2. **Extracted Dependencies**: 47 | - Tables listed in a clear format 48 | - Columns organized by table 49 | - Database schema validation results (if enabled) 50 | - DataFrame representation 51 | - Raw JSON output 52 | 53 | ### Download Options 54 | 55 | The app provides options to download the results as: 56 | 57 | - CSV file 58 | - JSON file 59 | - Data types for dependencies matching database (when enabled) 60 | 61 | ## Database Matching 62 | 63 | To enable database schema matching: 64 | 65 | 1. Check the "Enable Database Schema Validation" option 66 | 2. Enter database connection details: 67 | - Host 68 | - Port 69 | - Database name 70 | - Username 71 | - Target schemas (comma-separated) 72 | 73 | When database matching is enabled, the app will: 74 | 75 | 1. Connect to the specified database 76 | 2. Retrieve schema information for the target schemas 77 | 3. Match extracted dependencies against the actual schema 78 | 4. Display dependency data types showing exact matches and schema-agnostic dependencies 79 | 80 | ## Example Workflow 81 | 82 | 1. Select your preferred framework and model 83 | 2. Either upload a SQL file or enter a SQL query 84 | 3. Optionally configure database schema validation 85 | 4. Click "Extract Dependencies" to analyze 86 | 5. Explore the results in the main panel 87 | 6. 
Download the results in your preferred format 88 | 89 | ## Notes 90 | 91 | The web application is designed for demonstration and exploration of single SQL files. For processing multiple files or entire folders, use the CLI or API interfaces. 92 | -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: SQLDeps 2 | site_description: SQL Dependency Extractor using Large Language Models 3 | site_author: Cainã Max Couto da Silva 4 | repo_url: https://github.com/glue-lab/sqldeps 5 | repo_name: glue-lab/sqldeps 6 | 7 | extra_css: 8 | - stylesheets/custom.css 9 | 10 | theme: 11 | name: material 12 | favicon: assets/images/sqldeps_logo.png 13 | palette: 14 | primary: custom 15 | accent: custom 16 | features: 17 | - navigation.tabs 18 | - navigation.sections 19 | - navigation.top 20 | - search.suggest 21 | - search.highlight 22 | 23 | icon: 24 | repo: fontawesome/brands/github 25 | icons: 26 | - material/ 27 | - fontawesome/brands/ 28 | 29 | plugins: 30 | - search 31 | - mkdocstrings: 32 | handlers: 33 | python: 34 | options: 35 | show_root_heading: true 36 | show_root_full_name: true 37 | show_source: true 38 | show_if_no_docstring: false 39 | docstring_style: google 40 | 41 | markdown_extensions: 42 | - pymdownx.highlight: 43 | anchor_linenums: true 44 | - pymdownx.superfences 45 | - pymdownx.inlinehilite 46 | - admonition 47 | - pymdownx.details 48 | - pymdownx.snippets 49 | - pymdownx.tabbed: 50 | alternate_style: true 51 | - tables 52 | - footnotes 53 | - pymdownx.emoji: 54 | emoji_index: !!python/name:material.extensions.emoji.twemoji 55 | emoji_generator: !!python/name:material.extensions.emoji.to_svg 56 | 57 | nav: 58 | - Home: index.md 59 | - Getting Started: 60 | - Installation: getting-started/installation.md 61 | - Quick Start: getting-started/quick-start.md 62 | - User Guide: 63 | - API Usage: user-guide/api-usage.md 64 | - CLI Usage: user-guide/cli-usage.md 65 | - Web Application: user-guide/web-app.md 66 | - Database Integration: user-guide/database-integration.md 67 | - Interactive Graph Visualization: user-guide/visualization.md 68 | - API Reference: 69 | - Core features: 70 | - Models: api-reference/models.md 71 | - LLM Parsers: api-reference/llm-parsers.md 72 | - Database: api-reference/database.md 73 | - Visualization: api-reference/visualization.md 74 | - Advanced features: 75 | - Config: api-reference/config.md 76 | - Utils: api-reference/utils.md 77 | - Cache: api-reference/cache.md 78 | - Rate Limiter: api-reference/rate-limiter.md 79 | - Parallelization: api-reference/parallel.md 80 | # - Interfaces: # No need to document these interfaces 81 | # - CLI: api-reference/cli.md 82 | # - Web Application: api-reference/app.md 83 | 84 | # - Examples: examples.md 85 | - Development: 86 | - Contributing: contributing.md 87 | - ChangeLog: changelog.md 88 | - Team: 89 | - Authors: authors.md 90 | # - Contributors: team/contributors.md # ToDo: add script to show contributors 91 | -------------------------------------------------------------------------------- /notebooks/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/glue-lab/sqldeps/66ae7e1313f767417f2dc83e7d766ff478168082/notebooks/.gitkeep -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | 
[build-system] 2 | requires = ["hatchling"] 3 | build-backend = "hatchling.build" 4 | 5 | [project] 6 | name = "sqldeps" 7 | version = "0.1.1" 8 | description = "SQL Dependency Extractor" 9 | requires-python = ">=3.10" 10 | classifiers = [ 11 | "Programming Language :: Python :: 3", 12 | "Programming Language :: Python :: 3.10", 13 | "Programming Language :: Python :: 3.11", 14 | "Programming Language :: Python :: 3.12", 15 | "Programming Language :: Python :: 3.13", 16 | "Operating System :: OS Independent", 17 | ] 18 | authors = [ 19 | {name = "Cainã Silva", email = "coutodasilva@wisc.edu"}, 20 | {name = "Matt Christie", email = "mjchristie@wisc.edu"} 21 | ] 22 | readme = "README.md" 23 | keywords = ["sql", "dependency", "parser", "extractor", "llm"] 24 | dependencies = [ 25 | "groq>=0.17.0", 26 | "litellm>=1.67.6", 27 | "loguru>=0.7.3", 28 | "openai>=1.59.5", 29 | "pandas>=2.2.3", 30 | "python-dotenv>=1.0.1", 31 | "pyyaml>=6.0.2", 32 | "sqlalchemy>=2.0.37", 33 | "sqlparse>=0.5.3", 34 | "tenacity>=9.0.0", 35 | "typer>=0.15.1", 36 | ] 37 | 38 | [project.optional-dependencies] 39 | app = [ 40 | "streamlit>=1.42.1", 41 | ] 42 | postgres = [ 43 | "psycopg2>=2.9.10", 44 | ] 45 | dataviz = [ 46 | "nbformat>=5.10.4", 47 | "networkx>=3.4.2", 48 | "plotly>=6.0.0", 49 | "scipy>=1.15.2", 50 | ] 51 | 52 | [project.urls] 53 | Repository = "https://github.com/glue-lab/sqldeps" 54 | Documentation = "https://sqldeps.readthedocs.io" 55 | Questions = "https://github.com/glue-lab/sqldeps/discussions/categories/questions" 56 | Issues = "https://github.com/glue-lab/sqldeps/issues" 57 | 58 | [project.scripts] 59 | sqldeps = "sqldeps.cli:app" 60 | 61 | [tool.pytest.ini_options] 62 | pythonpath = "." 63 | testpaths = ["tests"] 64 | markers = [ 65 | "llm: marks tests that require LLM API calls (skipped by default)", 66 | "slow: marks tests that are slow to execute", 67 | ] 68 | addopts = "-m 'not llm and not integration and not slow' --ignore=sqldeps/app" 69 | 70 | [tool.hatch.metadata] 71 | allow-direct-references = false 72 | 73 | [tool.hatch.build] 74 | packages = ["sqldeps"] 75 | 76 | [dependency-groups] 77 | dev = [ 78 | "pytest-cov>=6.0.0", 79 | "pytest>=8.3.4", 80 | "ruff>=0.9.7", 81 | "pre-commit>=4.2.0", 82 | ] 83 | docs = [ 84 | "mkdocs>=1.6.1", 85 | "mkdocs-material>=9.6.9", 86 | "mkdocstrings[python]>=0.29.1", 87 | "pymdown-extensions>=10.14.3", 88 | ] 89 | analysis = [ 90 | "ipykernel>=6.29.5", 91 | "seaborn>=0.13.2", 92 | "tabulate>=0.9.0", 93 | ] 94 | 95 | [tool.uv] 96 | default-groups = ["dev", "docs", "analysis"] 97 | 98 | [tool.ruff] 99 | line-length = 88 # Like Black, use 88 characters per line. 100 | indent-width = 4 # Like Black, use 4 spaces per indentation level. 101 | exclude = ["*.ipynb"] # Exclude Jupyter notebooks from linting. 102 | 103 | [tool.ruff.lint] 104 | select = [ 105 | "F", # Pyflakes 106 | "E", "W", # pycodestyle 107 | "I", # isort 108 | "N", # pep8-naming 109 | "Q", # flake8-quotes 110 | "UP", # pyupgrade 111 | "D", # pydocstyle 112 | "RUF", # Ruff-specific rules 113 | "B", # flake8-bugbear 114 | "T20", # flake8-print 115 | "C90", # mccabe (complex structures) 116 | "SIM", # flake8-simplify 117 | "ANN", # flake8-annotations 118 | "TID", # flake8-tidy-imports 119 | ] 120 | ignore = [] # ignore specific rules here 121 | 122 | # Allow fix for all enabled rules (when `--fix`) is provided. 
123 | fixable = ["ALL"] 124 | unfixable = [] 125 | 126 | [tool.ruff.lint.pydocstyle] 127 | convention = "google" 128 | 129 | [tool.ruff.lint.isort] 130 | combine-as-imports = true 131 | force-single-line = false 132 | 133 | [tool.ruff.format] 134 | # Like Black, use double quotes for strings. 135 | quote-style = "double" 136 | 137 | # Like Black, indent with spaces, rather than tabs. 138 | indent-style = "space" 139 | 140 | # Like Black, respect magic trailing commas. 141 | skip-magic-trailing-comma = false 142 | 143 | # Like Black, automatically detect the appropriate line ending. 144 | line-ending = "auto" 145 | -------------------------------------------------------------------------------- /scripts/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/glue-lab/sqldeps/66ae7e1313f767417f2dc83e7d766ff478168082/scripts/.gitkeep -------------------------------------------------------------------------------- /sqldeps/__init__.py: -------------------------------------------------------------------------------- 1 | """SQLDeps: SQL Dependency Extractor using Large Language Models. 2 | 3 | SQLDeps provides tools to automatically extract and map table and column dependencies 4 | from SQL scripts using LLMs. It identifies both dependencies (tables/columns needed 5 | before execution) and outputs (tables/columns created or modified by the query). 6 | """ 7 | 8 | from importlib.metadata import version 9 | 10 | __version__ = version("sqldeps") 11 | -------------------------------------------------------------------------------- /sqldeps/app/__init__.py: -------------------------------------------------------------------------------- 1 | """SQLDeps web application. 2 | 3 | This package provides a Streamlit-based web interface for the SQLDeps tool, 4 | allowing users to interactively extract and visualize SQL dependencies. 5 | """ 6 | -------------------------------------------------------------------------------- /sqldeps/app/assets/images/sqldeps_gray.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/glue-lab/sqldeps/66ae7e1313f767417f2dc83e7d766ff478168082/sqldeps/app/assets/images/sqldeps_gray.png -------------------------------------------------------------------------------- /sqldeps/app/assets/images/sqldeps_white.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/glue-lab/sqldeps/66ae7e1313f767417f2dc83e7d766ff478168082/sqldeps/app/assets/images/sqldeps_white.png -------------------------------------------------------------------------------- /sqldeps/cache.py: -------------------------------------------------------------------------------- 1 | """Caching utilities for SQL dependency extraction. 2 | 3 | This module provides functions for caching extraction results to avoid 4 | repeatedly processing the same SQL files, which can save API calls, cost, and time. 5 | """ 6 | 7 | import hashlib 8 | import json 9 | from pathlib import Path 10 | 11 | from loguru import logger 12 | 13 | from sqldeps.models import SQLProfile 14 | 15 | CACHE_DIR = ".sqldeps_cache" 16 | 17 | 18 | def get_cache_path(file_path: str | Path, cache_dir: str | Path = CACHE_DIR) -> Path: 19 | """Generate a consistent cache file path based on SQL file content. 20 | 21 | Creates a unique cache filename by hashing the SQL file's content. 22 | Includes the original filename in the cache name for easier debugging.
23 | 24 | Args: 25 | file_path: Path to the SQL file to be processed 26 | cache_dir: Directory where cache files will be stored. 27 | Defaults to ".sqldeps_cache" 28 | 29 | Returns: 30 | Path object pointing to the cache file location 31 | 32 | Raises: 33 | FileNotFoundError: If the SQL file doesn't exist 34 | PermissionError: If the SQL file can't be read 35 | """ 36 | file_path = Path(file_path).resolve() 37 | 38 | # Read file content and create hash 39 | with open(file_path, "rb") as f: 40 | content = f.read() 41 | 42 | # Hash the content 43 | content_hash = hashlib.md5(content).hexdigest()[:16] 44 | 45 | # Use a combination of filename and content hash for better readability/debugging 46 | cache_name = f"{file_path.stem}_{content_hash}" 47 | 48 | # Ensure a valid filename 49 | cache_name = "".join(c if c.isalnum() or c in "_-." else "_" for c in cache_name) 50 | 51 | return Path(cache_dir) / f"{cache_name}.json" 52 | 53 | 54 | def save_to_cache( 55 | result: SQLProfile, file_path: Path, cache_dir: Path = Path(CACHE_DIR) 56 | ) -> bool: 57 | """Save extraction result to cache. 58 | 59 | Args: 60 | result: The SQLProfile to save 61 | file_path: The original SQL file path 62 | cache_dir: The cache directory 63 | 64 | Returns: 65 | True if saved successfully, False otherwise 66 | """ 67 | cache_dir.mkdir(parents=True, exist_ok=True) 68 | cache_file = get_cache_path(file_path, cache_dir) 69 | 70 | try: 71 | with open(cache_file, "w") as f: 72 | json.dump(result.to_dict(), f) 73 | return True 74 | except Exception as e: 75 | logger.warning(f"Failed to save cache for {file_path}: {e}") 76 | return False 77 | 78 | 79 | def load_from_cache( 80 | file_path: Path, cache_dir: Path = Path(CACHE_DIR) 81 | ) -> SQLProfile | None: 82 | """Load extraction result from cache. 83 | 84 | Args: 85 | file_path: The original SQL file path 86 | cache_dir: The cache directory 87 | 88 | Returns: 89 | SQLProfile if loaded successfully, None otherwise 90 | """ 91 | cache_file = get_cache_path(file_path, cache_dir) 92 | 93 | if not cache_file.exists(): 94 | return None 95 | 96 | try: 97 | with open(cache_file) as f: 98 | cached_data = json.load(f) 99 | logger.info(f"Loading from cache: {file_path}") 100 | return SQLProfile(**cached_data) 101 | except Exception as e: 102 | logger.warning(f"Failed to load cache for {file_path}: {e}") 103 | return None 104 | 105 | 106 | def cleanup_cache(cache_dir: Path = Path(CACHE_DIR)) -> bool: 107 | """Clean up cache directory. 108 | 109 | Args: 110 | cache_dir: The cache directory to clean up 111 | 112 | Returns: 113 | True if cleaned up successfully, False otherwise 114 | """ 115 | if not cache_dir.exists(): 116 | return True 117 | 118 | try: 119 | # Remove all JSON files 120 | for cache_file in cache_dir.glob("*.json"): 121 | cache_file.unlink() 122 | 123 | # Try to remove directory if empty 124 | if not any(cache_dir.iterdir()): 125 | cache_dir.rmdir() 126 | logger.info(f"Removed cache directory: {cache_dir}") 127 | else: 128 | logger.info( 129 | "Cache directory cleaned but not removed (contains other files)" 130 | ) 131 | return True 132 | except Exception as e: 133 | logger.warning(f"Failed to clean up cache: {e}") 134 | return False 135 | -------------------------------------------------------------------------------- /sqldeps/config.py: -------------------------------------------------------------------------------- 1 | """Configuration utilities for SQLDeps. 2 | 3 | This module provides functions for loading configuration from YAML files. 
4 | """ 5 | 6 | import yaml 7 | 8 | 9 | def load_config(config_path: str) -> dict: 10 | """Load configuration from a YAML file. 11 | 12 | Args: 13 | config_path: Path to the YAML configuration file 14 | 15 | Returns: 16 | dict: Parsed configuration dictionary 17 | """ 18 | with open(config_path) as config_file: 19 | config = yaml.safe_load(config_file) 20 | return config 21 | -------------------------------------------------------------------------------- /sqldeps/configs/prompts/default_v0.1.0.yml: -------------------------------------------------------------------------------- 1 | system_prompt: | 2 | You are a SQL analyzer that extracts two key elements from SQL queries: 3 | 4 | 1. DEPENDENCIES: Tables and columns that must exist BEFORE query execution. 5 | - Source tables in `FROM`, `JOIN`, CTEs, subqueries, etc. 6 | - ALL target tables in operations like `INSERT`, `UPDATE`, `DELETE`, `TRUNCATE` must be included in dependencies. 7 | - Referenced columns in `SELECT`, `WHERE`, `CASE`, `JOIN`, `GROUP BY`, `HAVING`, `ORDER BY`, etc. 8 | - Columns used in expressions, `CASE` statements, and aggregate functions. 9 | 10 | 2. OUTPUTS: Tables and columns permanently CREATED or MODIFIED by the query. 11 | - Tables modified with `INSERT`, `UPDATE`, `DELETE`, `TRUNCATE`. 12 | - Target columns in these operations. 13 | - Tables created with `CREATE TABLE`. 14 | 15 | KEY RULES (CRITICAL): 16 | - ALL target tables (INSERT, UPDATE, DELETE, TRUNCATE) MUST appear in BOTH dependencies AND outputs. 17 | - Example: For `INSERT INTO table_x (col1, col2) VALUES (1, 2)` 18 | → Dependencies: `{"table_x": ["col1", "col2"]}` 19 | → Outputs: `{"table_x": ["col1", "col2"]}` 20 | - Example: For `TRUNCATE TABLE table_x` 21 | → Dependencies: `{"table_x": []}` 22 | → Outputs: `{"table_x": []}` 23 | - This applies even inside CTEs, functions or stored procedures. 24 | - EXCEPTIONS: 25 | - If a table is CREATED in the same query (CREATE TABLE), it appears ONLY in outputs. 26 | - If a table is used in INSERT statement after TRUNCATE, include the specified columns for both dependencies and outputs instead of an empty list. 27 | - ALWAYS include schema-qualified tables (e.g., `schema.table`) in both dependencies and outputs, preserving the schema name. 28 | 29 | COLUMN HANDLING: 30 | - Explicit `SELECT *` should return ["*"] in dependencies 31 | - Functions like COUNT(*) with specific names, do NOT use ["*"], only include explicitly named columns 32 | - Example: For `SELECT COUNT(*), name FROM users`, dependencies would include `{"users": ["name"]}` 33 | - INSERT without column list creates dependencies on all columns in the target table: ["*"] 34 | 35 | CTE HANDLING: 36 | - CTEs (WITH queries) are temporary structures and should NOT be included as dependencies or outputs themselves. 37 | - However, include tables and columns used within CTEs that originated outside, since they are dependencies. 38 | - Example: 39 | ```sql 40 | WITH cte AS (SELECT * FROM table_x) 41 | INSERT INTO table_y SELECT * FROM cte 42 | ``` 43 | → Dependencies: `{"table_x": ["*"], "table_y": ["*"]}` 44 | → Outputs: `{"table_y": ["*"]}` 45 | 46 | SCHEMA-QUALIFIED TABLES: 47 | - Always preserve schema names exactly as they appear in the query. 
48 | - Example: For `INSERT INTO schema_a.table_x SELECT col1,col2 FROM schema_b.table_y` 49 | → Dependencies: `{"schema_a.table_x": ["*"], "schema_b.table_y": ["col1","col2"]}` 50 | → Outputs: `{"schema_a.table_x": ["*"]}` 51 | 52 | FUNCTION & PROCEDURE HANDLING: 53 | - Even inside functions or stored procedures, any `INSERT`, `UPDATE`, `DELETE`, or `TRUNCATE` statements affect real tables and must be included as dependencies and outputs. 54 | 55 | ADDITIONAL CONSIDERATIONS: 56 | - Resolve table aliases to real table names. 57 | - `CASE` expressions → dependencies on all examined columns. 58 | - `MERGE`/`UPSERT` → both dependencies and outputs. 59 | - Ignore variables and parameters as dependencies. 60 | 61 | OUTPUT JSON FORMAT: 62 | { 63 | "dependencies": {"table_name": ["column1", "column2"]}, 64 | "outputs": {"table_name": ["column1", "column2"]} 65 | } 66 | 67 | user_prompt: | 68 | Extract SQL dependencies (tables/columns needed BEFORE execution) and outputs (tables/columns CREATED or MODIFIED) from this query. 69 | 70 | Respond ONLY with JSON in this exact format: 71 | {{ 72 | "dependencies": {{"table_name": ["column1", "column2"]}}, 73 | "outputs": {{"table_name": ["column1", "column2"]}} 74 | }} 75 | 76 | SQL query to analyze: 77 | {sql} 78 | -------------------------------------------------------------------------------- /sqldeps/configs/prompts/simplified.yml: -------------------------------------------------------------------------------- 1 | system_prompt: | 2 | You are a SQL analyzer that extracts two key elements from SQL queries: 3 | 4 | 1. DEPENDENCIES: All tables and columns that must exist BEFORE query execution so that the query can run without error. 5 | 2. OUTPUTS: All tables and columns permanently CREATED or MODIFIED by the query. 6 | 7 | OUTPUT JSON FORMAT: 8 | { 9 | "dependencies": {"table_name": ["column1", "column2"]}, 10 | "outputs": {"table_name": ["column1", "column2"]} 11 | } 12 | 13 | user_prompt: | 14 | Extract SQL dependencies and outputs for this query: 15 | {sql} 16 | -------------------------------------------------------------------------------- /sqldeps/database/__init__.py: -------------------------------------------------------------------------------- 1 | """Database connectors for SQLDeps. 2 | 3 | This package provides database connectors for schema retrieval and validation. 4 | """ 5 | 6 | from .postgresql import PostgreSQLConnector 7 | 8 | __all__ = ["PostgreSQLConnector"] 9 | -------------------------------------------------------------------------------- /sqldeps/database/base.py: -------------------------------------------------------------------------------- 1 | """Base class for database connections. 2 | 3 | This module defines the abstract base class for SQL database connections 4 | and schema inspection. 5 | """ 6 | 7 | from abc import ABC, abstractmethod 8 | from pathlib import Path 9 | from typing import Any 10 | 11 | import pandas as pd 12 | from dotenv import load_dotenv 13 | from sqlalchemy.engine.base import Engine 14 | 15 | load_dotenv() 16 | 17 | 18 | class SQLBaseConnector(ABC): 19 | """Abstract base class for SQL database connections and schema inspection. 
20 | 21 | Provides interface for: 22 | - Database connection with multiple configuration sources 23 | - Schema inspection and export 24 | - Engine-specific connection handling 25 | """ 26 | 27 | @abstractmethod 28 | def __init__( 29 | self, 30 | host: str | None = None, 31 | port: int | None = None, 32 | database: str | None = None, 33 | username: str | None = None, 34 | password: str | None = None, 35 | config_path: Path | None = None, 36 | ) -> None: 37 | """Initialize database connection. 38 | 39 | Args: 40 | host: Database host address 41 | port: Database port 42 | database: Database name 43 | username: Database username 44 | password: Database password 45 | config_path: Path to configuration file 46 | """ 47 | pass 48 | 49 | @abstractmethod 50 | def _create_engine(self, params: dict[str, Any]) -> Engine: 51 | """Create database engine with given parameters. 52 | 53 | Args: 54 | params: Dictionary of connection parameters 55 | 56 | Returns: 57 | Database engine 58 | """ 59 | pass 60 | 61 | @abstractmethod 62 | def _load_config(self, config_path: Path | None) -> dict[str, Any]: 63 | """Load configuration from file. 64 | 65 | Args: 66 | config_path: Path to configuration file 67 | 68 | Returns: 69 | Dictionary with configuration parameters 70 | """ 71 | pass 72 | 73 | @abstractmethod 74 | def _get_env_vars(self) -> dict[str, Any]: 75 | """Get environment variables for connection. 76 | 77 | Returns: 78 | Dictionary with environment variables 79 | """ 80 | pass 81 | 82 | @abstractmethod 83 | def _resolve_params( 84 | self, 85 | host: str | None, 86 | port: int | None, 87 | database: str | None, 88 | username: str | None, 89 | password: str | None, 90 | config_path: Path | None, 91 | **kwargs: dict[str, Any], 92 | ) -> dict[str, Any]: 93 | """Resolve connection parameters from all sources. 94 | 95 | Args: 96 | host: Database host address 97 | port: Database port 98 | database: Database name 99 | username: Database username 100 | password: Database password 101 | config_path: Path to configuration file 102 | **kwargs: Additional parameters 103 | 104 | Returns: 105 | Dictionary with resolved connection parameters 106 | """ 107 | pass 108 | 109 | @abstractmethod 110 | def get_schema(self, schemas: str | list[str] | None = None) -> pd.DataFrame: 111 | """Get database schema information. 112 | 113 | Args: 114 | schemas: Optional schema name or list of schema names to filter results 115 | 116 | Returns: 117 | DataFrame with schema information 118 | """ 119 | pass 120 | 121 | def export_schema_csv( 122 | self, 123 | path: str, 124 | schemas: str | list[str] | None = None, 125 | ) -> None: 126 | """Export schema to CSV file. 127 | 128 | Args: 129 | path: Path to output CSV file 130 | schemas: Optional schema name or list of schema names to filter results 131 | 132 | Returns: 133 | None 134 | """ 135 | df = self.get_schema(schemas) 136 | df.to_csv(path, index=False) 137 | -------------------------------------------------------------------------------- /sqldeps/llm_parsers/__init__.py: -------------------------------------------------------------------------------- 1 | """LLM-based SQL parsers for dependency extraction. 2 | 3 | This package provides integrations with various LLM providers for extracting 4 | SQL dependencies, with a common interface and factory function. 
5 | """ 6 | 7 | from pathlib import Path 8 | 9 | from dotenv import load_dotenv 10 | 11 | from .base import BaseSQLExtractor 12 | from .deepseek import DeepseekExtractor 13 | from .groq import GroqExtractor 14 | from .litellm import LiteLlmExtractor 15 | from .openai import OpenaiExtractor 16 | 17 | load_dotenv() 18 | 19 | DEFAULTS = { 20 | "litellm": {"class": LiteLlmExtractor, "model": "openai/gpt-4.1"}, 21 | "groq": {"class": GroqExtractor, "model": "llama-3.3-70b-versatile"}, 22 | "openai": {"class": OpenaiExtractor, "model": "gpt-4.1"}, 23 | "deepseek": {"class": DeepseekExtractor, "model": "deepseek-chat"}, 24 | } 25 | 26 | 27 | def create_extractor( 28 | framework: str = "litellm", 29 | model: str | None = None, 30 | params: dict | None = None, 31 | prompt_path: Path | None = None, 32 | ) -> BaseSQLExtractor: 33 | """Create an appropriate SQL extractor based on the specified framework. 34 | 35 | Args: 36 | framework: The LLM framework to use ("litellm", "groq", "openai", or "deepseek") 37 | Note: Direct framework options are maintained for backward compatibility, 38 | but "litellm" is recommended as it provides integrations for all models 39 | from multiple providers 40 | model: The model name within the selected framework (uses default if None) 41 | params: Additional parameters to pass to the LLM API 42 | prompt_path: Path to a custom prompt YAML file 43 | 44 | Returns: 45 | An instance of the appropriate SQL extractor 46 | 47 | Raises: 48 | ValueError: If an unsupported framework is specified 49 | """ 50 | framework = framework.lower() 51 | if framework not in DEFAULTS: 52 | raise ValueError( 53 | f"Unsupported framework: {framework}. " 54 | f"Must be one of: {', '.join(DEFAULTS.keys())}" 55 | ) 56 | 57 | config = DEFAULTS[framework] 58 | extractor_class = config["class"] 59 | model_name = model or config["model"] 60 | 61 | return extractor_class(model=model_name, params=params, prompt_path=prompt_path) 62 | 63 | 64 | __all__ = [ 65 | "DeepseekExtractor", 66 | "GroqExtractor", 67 | "LiteLlmExtractor", 68 | "OpenaiExtractor", 69 | "create_extractor", 70 | ] 71 | -------------------------------------------------------------------------------- /sqldeps/llm_parsers/deepseek.py: -------------------------------------------------------------------------------- 1 | """DeepSeek-based SQL parser implementation. 2 | 3 | This module provides the DeepSeek-specific implementation of the BaseSQLExtractor 4 | for using DeepSeek's models to extract SQL dependencies. 5 | """ 6 | 7 | import os 8 | from pathlib import Path 9 | 10 | from openai import OpenAI 11 | 12 | from sqldeps.llm_parsers.base import BaseSQLExtractor 13 | 14 | 15 | class DeepseekExtractor(BaseSQLExtractor): 16 | """DeepSeek-based SQL dependency extractor. 17 | 18 | Attributes: 19 | ENV_VAR_NAME: Environment variable name for the API key 20 | client: OpenAI client instance configured for DeepSeek API 21 | """ 22 | 23 | # Expected environmental variable with the DeepSeek key 24 | ENV_VAR_NAME = "DEEPSEEK_API_KEY" 25 | 26 | def __init__( 27 | self, 28 | model: str = "deepseek-chat", 29 | params: dict | None = None, 30 | api_key: str | None = None, 31 | prompt_path: Path | None = None, 32 | ) -> None: 33 | """Initialize DeepSeek extractor. 
34 | 35 | Args: 36 | model: DeepSeek model name to use 37 | params: Additional parameters for the API 38 | api_key: DeepSeek API key (defaults to environment variable) 39 | prompt_path: Path to custom prompt YAML file 40 | 41 | Raises: 42 | ValueError: If API key is not provided 43 | """ 44 | super().__init__(model, params, prompt_path=prompt_path) 45 | 46 | api_key = api_key or os.getenv(self.ENV_VAR_NAME) 47 | if not api_key: 48 | raise ValueError( 49 | "No API key provided. Either pass api_key parameter or set " 50 | f"{self.ENV_VAR_NAME} environment variable." 51 | ) 52 | 53 | self.client = OpenAI(api_key=api_key, base_url="https://api.deepseek.com") 54 | 55 | def _query_llm(self, user_prompt: str) -> str: 56 | """Query the DeepSeek LLM with the generated prompt. 57 | 58 | Args: 59 | user_prompt: Generated prompt to send to DeepSeek 60 | 61 | Returns: 62 | Response content from DeepSeek 63 | """ 64 | response = self.client.chat.completions.create( 65 | model=self.model, 66 | messages=[ 67 | {"role": "system", "content": self.prompts["system_prompt"]}, 68 | {"role": "user", "content": user_prompt}, 69 | ], 70 | response_format={"type": "json_object"}, 71 | stream=False, 72 | **self.params, 73 | ) 74 | 75 | return response.choices[0].message.content 76 | -------------------------------------------------------------------------------- /sqldeps/llm_parsers/groq.py: -------------------------------------------------------------------------------- 1 | """Groq-based SQL parser implementation. 2 | 3 | This module provides the Groq-specific implementation of the BaseSQLExtractor 4 | for using Groq's models to extract SQL dependencies. 5 | """ 6 | 7 | import os 8 | from pathlib import Path 9 | 10 | from groq import Groq 11 | 12 | from sqldeps.llm_parsers.base import BaseSQLExtractor 13 | 14 | 15 | class GroqExtractor(BaseSQLExtractor): 16 | """Groq-based SQL dependency extractor. 17 | 18 | Attributes: 19 | ENV_VAR_NAME: Environment variable name for the API key 20 | client: Groq client instance 21 | """ 22 | 23 | ENV_VAR_NAME = "GROQ_API_KEY" 24 | 25 | def __init__( 26 | self, 27 | model: str = "llama-3.3-70b-versatile", 28 | params: dict | None = None, 29 | api_key: str | None = None, 30 | prompt_path: Path | None = None, 31 | ) -> None: 32 | """Initialize Groq extractor.""" 33 | super().__init__(model, params, prompt_path=prompt_path) 34 | 35 | api_key = api_key or os.getenv(self.ENV_VAR_NAME) 36 | if not api_key: 37 | raise ValueError( 38 | "No API key provided. Either pass api_key parameter or set " 39 | f"{self.ENV_VAR_NAME} environment variable." 40 | ) 41 | 42 | self.client = Groq(api_key=api_key) 43 | 44 | def _query_llm(self, user_prompt: str) -> str: 45 | """Query the Groq LLM with the generated prompt. 46 | 47 | Args: 48 | user_prompt: Generated prompt to send to Groq 49 | 50 | Returns: 51 | Response content from Groq 52 | """ 53 | response = self.client.chat.completions.create( 54 | model=self.model, 55 | messages=[ 56 | {"role": "system", "content": self.prompts["system_prompt"]}, 57 | {"role": "user", "content": user_prompt}, 58 | ], 59 | response_format={"type": "json_object"}, 60 | **self.params, 61 | ) 62 | 63 | return response.choices[0].message.content 64 | -------------------------------------------------------------------------------- /sqldeps/llm_parsers/litellm.py: -------------------------------------------------------------------------------- 1 | """LiteLLM-based SQL parser implementation. 
2 | 3 | This module provides the LiteLLM-specific implementation of the BaseSQLExtractor 4 | for using various LLM models to extract SQL dependencies. 5 | """ 6 | 7 | import os 8 | from pathlib import Path 9 | 10 | from litellm import UnsupportedParamsError, completion 11 | 12 | from sqldeps.llm_parsers.base import BaseSQLExtractor 13 | 14 | 15 | class LiteLlmExtractor(BaseSQLExtractor): 16 | """LiteLLM-based SQL dependency extractor. 17 | 18 | This extractor supports multiple LLM providers through LiteLLM. 19 | Authentication is handled by LiteLLM, which supports various methods 20 | depending on the provider (API keys, tokens, or no authentication). 21 | 22 | API keys can be provided as a dictionary mapping environment variable names 23 | to their values. For example: 24 | { 25 | "OPENAI_API_KEY": "sk-...", 26 | "ANTHROPIC_API_KEY": "sk-...", 27 | } 28 | """ 29 | 30 | def __init__( 31 | self, 32 | model: str = "openai/gpt-4.1", 33 | params: dict | None = None, 34 | api_key: dict[str, str] | None = None, 35 | prompt_path: Path | None = None, 36 | ) -> None: 37 | """Initialize LiteLLM extractor. 38 | 39 | Args: 40 | model: LLM model name to use (supports various providers through LiteLLM) 41 | params: Additional parameters for the API 42 | api_key: Optional dictionary mapping environment variable names to 43 | API key values. For example: {"OPENAI_API_KEY": "sk-..."} 44 | prompt_path: Path to custom prompt YAML file 45 | """ 46 | super().__init__(model, params, prompt_path=prompt_path) 47 | 48 | if api_key: 49 | for env_var, key_value in api_key.items(): 50 | os.environ[env_var] = key_value 51 | 52 | def _query_llm(self, user_prompt: str) -> str: 53 | """Query the LLM with the generated prompt using LiteLLM. 54 | 55 | Args: 56 | user_prompt: Generated prompt to send to the LLM 57 | 58 | Returns: 59 | Response content from the LLM 60 | """ 61 | messages = [ 62 | {"role": "system", "content": self.prompts["system_prompt"]}, 63 | {"role": "user", "content": user_prompt}, 64 | ] 65 | 66 | try: 67 | response = completion( 68 | model=self.model, 69 | messages=messages, 70 | response_format={"type": "json_object"}, 71 | **self.params, 72 | ) 73 | except UnsupportedParamsError: 74 | response = completion( 75 | model=self.model, 76 | messages=messages, 77 | response_format={"type": "json_object"}, 78 | ) 79 | 80 | return response.choices[0].message.content 81 | -------------------------------------------------------------------------------- /sqldeps/llm_parsers/openai.py: -------------------------------------------------------------------------------- 1 | """OpenAI-based SQL parser implementation. 2 | 3 | This module provides the OpenAI-specific implementation of the BaseSQLExtractor 4 | for using OpenAI's models to extract SQL dependencies. 5 | """ 6 | 7 | import os 8 | from pathlib import Path 9 | 10 | from openai import BadRequestError, OpenAI 11 | 12 | from sqldeps.llm_parsers.base import BaseSQLExtractor 13 | 14 | 15 | class OpenaiExtractor(BaseSQLExtractor): 16 | """OpenAI-based SQL dependency extractor. 17 | 18 | Attributes: 19 | ENV_VAR_NAME: Environment variable name for the API key 20 | client: OpenAI client instance 21 | """ 22 | 23 | # Expected environmental variable with the OpenAI key 24 | ENV_VAR_NAME = "OPENAI_API_KEY" 25 | 26 | def __init__( 27 | self, 28 | model: str = "gpt-4o", 29 | params: dict | None = None, 30 | api_key: str | None = None, 31 | prompt_path: Path | None = None, 32 | ) -> None: 33 | """Initialize OpenAI extractor. 
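
        Example (a minimal sketch; assumes OPENAI_API_KEY is set in the
        environment):

            from sqldeps.llm_parsers import OpenaiExtractor

            extractor = OpenaiExtractor(model="gpt-4o")
            profile = extractor.extract_from_query("SELECT * FROM orders")
            print(profile.dependencies)  # e.g. {"orders": ["*"]}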
34 | 35 | Args: 36 | model: OpenAI model name to use 37 | params: Additional parameters for the API 38 | api_key: OpenAI API key (defaults to environment variable) 39 | prompt_path: Path to custom prompt YAML file 40 | 41 | Raises: 42 | ValueError: If API key is not provided 43 | """ 44 | super().__init__(model, params, prompt_path=prompt_path) 45 | 46 | api_key = api_key or os.getenv(self.ENV_VAR_NAME) 47 | if not api_key: 48 | raise ValueError( 49 | "No API key provided. Either pass api_key parameter or set " 50 | f"{self.ENV_VAR_NAME} environment variable." 51 | ) 52 | 53 | self.client = OpenAI(api_key=api_key) 54 | 55 | def _query_llm(self, user_prompt: str) -> str: 56 | """Query the OpenAI LLM with the generated prompt. 57 | 58 | Args: 59 | user_prompt: Generated prompt to send to OpenAI 60 | 61 | Returns: 62 | Response content from OpenAI 63 | """ 64 | messages = [ 65 | {"role": "system", "content": self.prompts["system_prompt"]}, 66 | {"role": "user", "content": user_prompt}, 67 | ] 68 | 69 | try: 70 | response = self.client.chat.completions.create( 71 | model=self.model, 72 | messages=messages, 73 | response_format={"type": "json_object"}, 74 | **self.params, 75 | ) 76 | except BadRequestError as e: 77 | if any(param in str(e) for param in ["temperature", "unsupported"]): 78 | response = self.client.chat.completions.create( 79 | model=self.model, 80 | messages=messages, 81 | response_format={"type": "json_object"}, 82 | ) 83 | else: 84 | raise 85 | 86 | return response.choices[0].message.content 87 | -------------------------------------------------------------------------------- /sqldeps/models.py: -------------------------------------------------------------------------------- 1 | """Data models for SQLDeps. 2 | 3 | This module defines the core data structures used by SQLDeps for 4 | representing SQL dependencies and outputs. 5 | """ 6 | 7 | from dataclasses import dataclass 8 | 9 | import pandas as pd 10 | 11 | 12 | @dataclass 13 | class SQLProfile: 14 | """Data class to hold both SQL dependencies and outputs.""" 15 | 16 | # Dependencies (input tables/columns required by the query) 17 | dependencies: dict[str, list[str]] 18 | 19 | # Outputs (tables/columns created or modified by the query) 20 | outputs: dict[str, list[str]] 21 | 22 | def __post_init__(self) -> None: 23 | """Sort tables and columns for consistent output.""" 24 | self.dependencies = { 25 | table: sorted(set(cols)) 26 | for table, cols in sorted(self.dependencies.items()) 27 | } 28 | self.outputs = { 29 | table: sorted(set(cols)) for table, cols in sorted(self.outputs.items()) 30 | } 31 | 32 | @property 33 | def dependency_tables(self) -> list[str]: 34 | """Get list of dependency tables. 35 | 36 | Returns: 37 | list[str]: Sorted list of table names referenced as dependencies 38 | """ 39 | return sorted(self.dependencies.keys()) 40 | 41 | @property 42 | def outcome_tables(self) -> list[str]: 43 | """Get list of outcome tables. 44 | 45 | Returns: 46 | list[str]: Sorted list of table names referenced as outputs 47 | """ 48 | return sorted(self.outputs.keys()) 49 | 50 | def to_dict(self) -> dict: 51 | """Convert to dictionary format. 52 | 53 | Returns: 54 | dict: Dictionary with dependencies and outputs 55 | """ 56 | return {"dependencies": self.dependencies, "outputs": self.outputs} 57 | 58 | def to_dataframe(self) -> pd.DataFrame: 59 | """Convert to a DataFrame with type column indicating dependency or outcome. 
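
        Example (a small, self-contained illustration of the expected shape):

            from sqldeps.models import SQLProfile

            profile = SQLProfile(
                dependencies={"public.users": ["id", "name"]},
                outputs={"reports.summary": []},
            )
            df = profile.to_dataframe()
            # Rows: ("dependency", "public", "users", "id"),
            #       ("dependency", "public", "users", "name"),
            #       ("outcome", "reports", "summary", None)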
60 | 61 | Returns: 62 | pd.DataFrame: DataFrame with columns for type, schema, table, and column 63 | """ 64 | records = [] 65 | 66 | # Add dependencies 67 | for table, columns in self.dependencies.items(): 68 | schema, table_name = table.split(".") if "." in table else (None, table) 69 | if columns: 70 | for column in columns: 71 | records.append( 72 | { 73 | "type": "dependency", 74 | "schema": schema, 75 | "table": table_name, 76 | "column": column, 77 | } 78 | ) 79 | else: 80 | records.append( 81 | { 82 | "type": "dependency", 83 | "schema": schema, 84 | "table": table_name, 85 | "column": None, 86 | } 87 | ) 88 | 89 | # Add outputs 90 | for table, columns in self.outputs.items(): 91 | schema, table_name = table.split(".") if "." in table else (None, table) 92 | if columns: 93 | for column in columns: 94 | records.append( 95 | { 96 | "type": "outcome", 97 | "schema": schema, 98 | "table": table_name, 99 | "column": column, 100 | } 101 | ) 102 | else: 103 | records.append( 104 | { 105 | "type": "outcome", 106 | "schema": schema, 107 | "table": table_name, 108 | "column": None, 109 | } 110 | ) 111 | 112 | return pd.DataFrame(records) 113 | -------------------------------------------------------------------------------- /sqldeps/parallel.py: -------------------------------------------------------------------------------- 1 | """Parallel processing utilities for SQL dependency extraction. 2 | 3 | This module provides functions for extracting SQL dependencies in parallel 4 | using multiple worker processes, with shared rate limiting. 5 | """ 6 | 7 | from concurrent.futures import ProcessPoolExecutor, as_completed 8 | from functools import partial 9 | from multiprocessing import Manager, cpu_count 10 | from pathlib import Path 11 | 12 | import numpy as np 13 | from loguru import logger 14 | from tenacity import retry, stop_after_attempt, wait_exponential 15 | 16 | from sqldeps.cache import load_from_cache, save_to_cache 17 | from sqldeps.models import SQLProfile 18 | from sqldeps.rate_limiter import MultiprocessingRateLimiter 19 | 20 | 21 | def resolve_workers(n_workers: int) -> int: 22 | """Resolve the number of worker processes to use. 23 | 24 | Args: 25 | n_workers: Requested number of workers (-1 for all, >0 for specific count) 26 | 27 | Returns: 28 | int: Actual number of worker processes to use 29 | 30 | Raises: 31 | ValueError: If n_workers is invalid (not -1, or not between 1 and cpu_count) 32 | """ 33 | max_workers = cpu_count() 34 | 35 | if n_workers == -1: 36 | return max_workers 37 | if 1 <= n_workers <= max_workers: 38 | return n_workers 39 | 40 | raise ValueError( 41 | f"Invalid worker count: {n_workers}. " 42 | f"Must be -1 (all), 1 (single), or up to {max_workers}." 43 | ) 44 | 45 | 46 | def _extract_from_file( 47 | file_path: Path, 48 | rate_limiter: MultiprocessingRateLimiter, 49 | framework: str, 50 | model: str, 51 | prompt_path: Path | None = None, 52 | use_cache: bool = True, 53 | ) -> tuple[Path, object]: 54 | """Process a single file with rate limiting and extraction. 
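
    Example (a sketch; assumes an API key for the chosen framework and a shared
    limiter built from a multiprocessing Manager; the file path is illustrative):

        from multiprocessing import Manager
        from pathlib import Path

        from sqldeps.rate_limiter import MultiprocessingRateLimiter

        limiter = MultiprocessingRateLimiter(Manager(), rpm=60)
        path, profile = _extract_from_file(
            Path("data/examples/example.sql"),
            limiter,
            framework="litellm",
            model="openai/gpt-4.1",
        )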
55 | 56 | Args: 57 | file_path: Path to SQL file 58 | rate_limiter: Rate limiter instance 59 | framework: LLM framework to use 60 | model: Model name within the framework 61 | prompt_path: Optional path to custom prompt 62 | use_cache: Whether to use cache 63 | 64 | Returns: 65 | Tuple of (file_path, result) or (file_path, None) on failure 66 | """ 67 | from sqldeps.llm_parsers import create_extractor 68 | 69 | # Check cache if enabled 70 | if use_cache: 71 | result = load_from_cache(file_path) 72 | if result: 73 | return file_path, result 74 | 75 | try: 76 | # Create extractor 77 | extractor = create_extractor( 78 | framework=framework, model=model, prompt_path=prompt_path 79 | ) 80 | 81 | # Apply rate limiting and extract with retry 82 | @retry(stop=stop_after_attempt(3), wait=wait_exponential(min=2, max=10)) 83 | def extract_with_rate_limit() -> SQLProfile: 84 | rate_limiter.wait_if_needed() 85 | logger.debug(f"Extracting from file: {file_path}") 86 | return extractor.extract_from_file(file_path) 87 | 88 | result = extract_with_rate_limit() 89 | 90 | # Save to cache if enabled 91 | if use_cache: 92 | save_to_cache(result, file_path) 93 | 94 | return file_path, result 95 | except Exception as e: 96 | logger.error(f"Failed to process {file_path}: {e}") 97 | return file_path, None 98 | 99 | 100 | def _process_batch_files( 101 | batch_files: list[Path], 102 | rate_limiter: MultiprocessingRateLimiter, 103 | framework: str, 104 | model: str, 105 | prompt_path: Path | None = None, 106 | use_cache: bool = True, 107 | ) -> dict: 108 | """Process a batch of files with shared rate limiting. 109 | 110 | Args: 111 | batch_files: List of file paths to process 112 | rate_limiter: Shared rate limiter 113 | framework: LLM framework to use 114 | model: Model name 115 | prompt_path: Optional path to custom prompt 116 | use_cache: Whether to use cache 117 | 118 | Returns: 119 | Dictionary mapping file paths to results 120 | """ 121 | results = {} 122 | 123 | for file_path in batch_files: 124 | path, result = _extract_from_file( 125 | file_path, rate_limiter, framework, model, prompt_path, use_cache 126 | ) 127 | if result: 128 | results[str(path)] = result 129 | 130 | return results 131 | 132 | 133 | def process_files_in_parallel( 134 | sql_files: list[Path], 135 | framework: str = "groq", 136 | model: str | None = None, 137 | prompt_path: Path | None = None, 138 | n_workers: int = 1, 139 | rpm: int = 100, 140 | use_cache: bool = True, 141 | ) -> dict: 142 | """Extract SQL dependencies from SQL files in parallel with rate limiting. 
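
    Example (a sketch; assumes a GROQ_API_KEY for the default framework and at
    least one local SQL file):

        from pathlib import Path

        from sqldeps.parallel import process_files_in_parallel

        files = [Path("data/examples/example.sql")]
        results = process_files_in_parallel(files, n_workers=1, rpm=30)
        for path, profile in results.items():
            print(path, profile.dependency_tables)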
143 | 144 | Args: 145 | sql_files: List of Paths to SQL files to process 146 | framework: LLM framework to use (e.g., groq, openai, deepseek) 147 | model: Model name within the selected framework 148 | prompt_path: Path to custom prompt YAML file 149 | n_workers: Number of worker processes to use (-1 for all) 150 | rpm: Requests per minute limit across all workers 151 | use_cache: Whether to use cached results 152 | 153 | Returns: 154 | Dictionary mapping file paths to SQLProfile objects 155 | 156 | Raises: 157 | ValueError: If no SQL files provided or no dependencies extracted 158 | """ 159 | # Resolve number of workers 160 | n_workers = resolve_workers(n_workers) 161 | 162 | # Ensure we have a list of Path objects 163 | sql_files = [Path(f) for f in sql_files] 164 | 165 | if not sql_files: 166 | raise ValueError("No SQL files provided") 167 | 168 | logger.info(f"Processing {len(sql_files)} SQL files") 169 | logger.info( 170 | f"Using {n_workers} workers with global rate limit of {rpm} requests per minute" 171 | ) 172 | logger.info(f"Cache usage: {'enabled' if use_cache else 'disabled'}") 173 | 174 | # Calculate optimal number of workers (don't use more workers than files) 175 | n_workers = min(n_workers, len(sql_files)) 176 | 177 | # Split files into batches 178 | batches = np.array_split(sql_files, n_workers) 179 | batches = [list(batch) for batch in batches if len(batch) > 0] 180 | 181 | all_results = {} 182 | 183 | # Create shared rate limiter 184 | with Manager() as manager: 185 | rate_limiter = MultiprocessingRateLimiter(manager, rpm) 186 | 187 | # Process batches in parallel 188 | with ProcessPoolExecutor(max_workers=n_workers) as executor: 189 | process_func = partial( 190 | _process_batch_files, 191 | rate_limiter=rate_limiter, 192 | framework=framework, 193 | model=model, 194 | prompt_path=prompt_path, 195 | use_cache=use_cache, 196 | ) 197 | 198 | futures = { 199 | executor.submit(process_func, batch): i 200 | for i, batch in enumerate(batches) 201 | } 202 | 203 | for future in as_completed(futures): 204 | batch_idx = futures[future] 205 | try: 206 | batch_results = future.result() 207 | all_results.update(batch_results) 208 | logger.info( 209 | f"Completed batch {batch_idx + 1}/{len(batches)} with " 210 | f"{len(batch_results)} results" 211 | ) 212 | except Exception as e: 213 | logger.error(f"Batch {batch_idx + 1} failed: {e}") 214 | 215 | # If no results were extracted 216 | if not all_results: 217 | raise ValueError("No dependencies could be extracted from any SQL file") 218 | 219 | return all_results 220 | -------------------------------------------------------------------------------- /sqldeps/rate_limiter.py: -------------------------------------------------------------------------------- 1 | """Rate limiting utilities for API calls. 2 | 3 | This module provides classes for limiting the rate of API calls to stay 4 | within provider limits, in both single-process and multi-process contexts. 5 | """ 6 | 7 | import time 8 | from collections import deque 9 | from multiprocessing.managers import SyncManager 10 | 11 | from loguru import logger 12 | 13 | 14 | class RateLimiter: 15 | """Rate limiter to prevent exceeding API rate limits. 16 | 17 | Tracks API call timestamps and enforces waiting periods 18 | to respect the specified requests per minute (RPM) limit. 
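
    Example (single-process usage; `queries` and `call_llm` are placeholders
    for the caller's own iterable and API call):

        limiter = RateLimiter(rpm=60)
        for query in queries:
            limiter.wait_if_needed()  # blocks once 60 calls land within a minute
            call_llm(query)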
19 | 20 | Attributes: 21 | rpm: Maximum requests per minute allowed 22 | call_times: Deque storing timestamps of recent API calls 23 | window: Time window in seconds (default: 60 seconds = 1 minute) 24 | """ 25 | 26 | def __init__(self, rpm: int) -> None: 27 | """Initialize the rate limiter with an RPM limit. 28 | 29 | Args: 30 | rpm: Maximum number of API requests allowed per minute 31 | """ 32 | self.rpm = rpm 33 | self.call_times = deque() 34 | self.window = 60 # 60 seconds = 1 minute window 35 | 36 | def wait_if_needed(self) -> None: 37 | """Ensures that calls do not exceed the rate limit. 38 | 39 | If the limit is reached, it waits until a slot is available. 40 | """ 41 | if self.rpm <= 0: # Disable rate limiting if rpm is 0 42 | return 43 | 44 | now = time.time() 45 | 46 | # Remove timestamps older than our time window (60 seconds) 47 | cutoff = now - self.window 48 | while self.call_times and self.call_times[0] < cutoff: 49 | self.call_times.popleft() 50 | 51 | # If we've reached the RPM limit, wait until the oldest timestamp expires 52 | if len(self.call_times) >= self.rpm: 53 | wait_time = max(0, self.call_times[0] + self.window - now) 54 | if wait_time > 0: 55 | logger.debug(f"Rate limit reached. Waiting {wait_time:.2f} seconds") 56 | time.sleep(wait_time) 57 | 58 | # After waiting, recalculate current time and clean up again 59 | now = time.time() 60 | cutoff = now - self.window 61 | while self.call_times and self.call_times[0] < cutoff: 62 | self.call_times.popleft() 63 | 64 | # Record this API call's timestamp 65 | self.call_times.append(now) 66 | 67 | 68 | class MultiprocessingRateLimiter: 69 | """A shared rate limiter for multiprocessing environments. 70 | 71 | Uses a manager to share state between processes, ensuring 72 | all processes collectively respect the RPM limit. 73 | 74 | Attributes: 75 | call_times: A shared list of API call timestamps 76 | lock: A shared lock for thread-safe operations 77 | rpm: Maximum requests per minute allowed 78 | window: Time window in seconds 79 | """ 80 | 81 | def __init__(self, manager: SyncManager, rpm: int) -> None: 82 | """Initialize with a multiprocessing manager and RPM limit. 83 | 84 | Args: 85 | manager: A multiprocessing.Manager instance 86 | rpm: Maximum requests per minute allowed 87 | """ 88 | self.call_times = manager.list() 89 | self.lock = manager.RLock() 90 | self.rpm = rpm 91 | self.window = 60 92 | 93 | def wait_if_needed(self) -> None: 94 | """Ensures calls don't exceed the rate limit across processes.""" 95 | if self.rpm <= 0: 96 | return 97 | 98 | with self.lock: 99 | now = time.time() 100 | cutoff = now - self.window 101 | 102 | # Remove old timestamps 103 | while self.call_times and self.call_times[0] < cutoff: 104 | self.call_times.pop(0) 105 | 106 | # Wait if at RPM limit 107 | if len(self.call_times) >= self.rpm: 108 | wait_time = max(0, self.call_times[0] + self.window - now) 109 | if wait_time > 0: 110 | logger.debug(f"Rate limit reached. Waiting {wait_time:.2f} seconds") 111 | time.sleep(wait_time) 112 | 113 | # Recalculate after waiting 114 | now = time.time() 115 | cutoff = now - self.window 116 | while self.call_times and self.call_times[0] < cutoff: 117 | self.call_times.pop(0) 118 | 119 | # Record this call 120 | self.call_times.append(now) 121 | -------------------------------------------------------------------------------- /sqldeps/utils.py: -------------------------------------------------------------------------------- 1 | """Utility functions for SQLDeps. 
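
Example (merging two extraction results; a small, self-contained sketch):

    from sqldeps.models import SQLProfile
    from sqldeps.utils import merge_profiles

    a = SQLProfile(dependencies={"users": ["id"]}, outputs={})
    b = SQLProfile(dependencies={"users": ["name"], "orders": ["*"]}, outputs={})
    merged = merge_profiles([a, b])
    # merged.dependencies == {"orders": ["*"], "users": ["id", "name"]}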
2 | 3 | This module provides helper functions for finding SQL files, merging SQL profiles, 4 | and performing schema validation and comparison. 5 | """ 6 | 7 | from pathlib import Path 8 | 9 | import pandas as pd 10 | 11 | from sqldeps.models import SQLProfile 12 | 13 | 14 | def find_sql_files( 15 | folder_path: str | Path, 16 | recursive: bool = False, 17 | valid_extensions: set[str] | None = None, 18 | ) -> list[Path]: 19 | """Find SQL files in a folder. 20 | 21 | Args: 22 | folder_path: Path to the folder 23 | recursive: Whether to search recursively 24 | valid_extensions: Set of valid file extensions (default: {'sql'}) 25 | 26 | Returns: 27 | List of file paths 28 | 29 | Raises: 30 | FileNotFoundError: If folder doesn't exist 31 | NotADirectoryError: If path is not a directory 32 | ValueError: If no SQL files are found 33 | """ 34 | folder_path = Path(folder_path) 35 | 36 | # Validate folder 37 | if not folder_path.exists(): 38 | raise FileNotFoundError(f"Folder not found: {folder_path}") 39 | 40 | if not folder_path.is_dir(): 41 | raise NotADirectoryError(f"Path is not a directory: {folder_path}") 42 | 43 | # Default extensions if not provided 44 | valid_extensions = valid_extensions or {"sql"} 45 | valid_extensions = {ext.lower().lstrip(".") for ext in valid_extensions} 46 | 47 | # Find matching files 48 | pattern = "**/*" if recursive else "*" 49 | sql_files = [ 50 | f 51 | for f in folder_path.glob(pattern) 52 | if f.is_file() and f.suffix.lower().lstrip(".") in valid_extensions 53 | ] 54 | 55 | if not sql_files: 56 | raise ValueError(f"No SQL files found in {folder_path}") 57 | 58 | return sql_files 59 | 60 | 61 | def merge_profiles(analyses: list[SQLProfile]) -> SQLProfile: 62 | """Merges multiple SQLProfile objects into a single one. 63 | 64 | Args: 65 | analyses: List of SQLProfile objects to merge 66 | 67 | Returns: 68 | A new SQLProfile with merged dependencies and outputs 69 | """ 70 | merged_dependencies = {} 71 | merged_outputs = {} 72 | 73 | for analysis in analyses: 74 | # Merge dependencies 75 | for table, columns in analysis.dependencies.items(): 76 | if "*" in columns: 77 | merged_dependencies[table] = {"*"} 78 | else: 79 | merged_dependencies.setdefault(table, set()).update(columns) 80 | 81 | # Merge outputs 82 | for table, columns in analysis.outputs.items(): 83 | if "*" in columns: 84 | merged_outputs[table] = {"*"} 85 | else: 86 | merged_outputs.setdefault(table, set()).update(columns) 87 | 88 | return SQLProfile( 89 | dependencies={ 90 | table: list(columns) for table, columns in merged_dependencies.items() 91 | }, 92 | outputs={table: list(columns) for table, columns in merged_outputs.items()}, 93 | ) 94 | 95 | 96 | def merge_schemas( 97 | df_extracted_schema: pd.DataFrame, df_db_schema: pd.DataFrame 98 | ) -> pd.DataFrame: 99 | """Matches extracted SQL dependencies with the actual database schema. 100 | 101 | Handles both exact schema matches and schema-agnostic matches. 102 | Expands wildcards ('*') to match all columns from the relevant table(s). 103 | Handles tables with no columns (None). 
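
    Example (a minimal sketch):

        import pandas as pd

        extracted = pd.DataFrame(
            [{"schema": None, "table": "users", "column": "id"}]
        )
        db = pd.DataFrame(
            [
                {
                    "schema": "public",
                    "table": "users",
                    "column": "id",
                    "data_type": "integer",
                }
            ]
        )
        merged = merge_schemas(extracted, db)
        # One row: public.users.id with exact_match == False (no schema was given)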
104 | 105 | Args: 106 | df_extracted_schema: Extracted table-column dependencies 107 | df_db_schema: Actual database schema information 108 | 109 | Returns: 110 | Merged schema with an `exact_match` flag indicating whether 111 | the schema name matched exactly 112 | """ 113 | # Create copy to avoid modifying input 114 | df_extracted = df_extracted_schema.copy() 115 | df_extracted["exact_match"] = pd.Series(dtype="boolean") 116 | 117 | # Initialize empty DataFrame with correct dtypes 118 | df_no_columns = pd.DataFrame( 119 | { 120 | "schema": pd.Series(dtype="object"), 121 | "table": pd.Series(dtype="object"), 122 | "column": pd.Series(dtype="object"), 123 | "data_type": pd.Series(dtype="object"), 124 | "exact_match": pd.Series(dtype="boolean"), 125 | } 126 | ) 127 | 128 | # Handle tables with no columns (None) 129 | if (no_columns_mask := df_extracted["column"].isna()).any(): 130 | no_columns_deps = df_extracted.loc[no_columns_mask, ["schema", "table"]] 131 | df_extracted = df_extracted.loc[~no_columns_mask] 132 | 133 | # Exact schema match 134 | exact_matches = ( 135 | no_columns_deps.dropna(subset=["schema"]) 136 | .merge(df_db_schema[["schema", "table"]], on=["schema", "table"]) 137 | .assign(column=None, data_type=None, exact_match=True) 138 | ) 139 | 140 | # Schema-agnostic match 141 | schema_agnostic = no_columns_deps[no_columns_deps["schema"].isna()] 142 | matching_schemas = df_db_schema.merge(schema_agnostic[["table"]], on="table")[ 143 | ["schema", "table"] 144 | ] 145 | schema_agnostic_matches = matching_schemas.assign( 146 | column=None, data_type=None, exact_match=False 147 | ) 148 | 149 | # Combine results 150 | df_no_columns = pd.concat( 151 | [exact_matches, schema_agnostic_matches], ignore_index=True 152 | ) 153 | 154 | # Expand wildcards (*) to include all relevant columns 155 | if (wildcard_mask := df_extracted["column"] == "*").any(): 156 | regular_deps = df_extracted[~wildcard_mask] 157 | wildcard_deps = df_extracted[wildcard_mask] 158 | expanded_wildcard_deps = [] 159 | 160 | for _, row in wildcard_deps.iterrows(): 161 | mask = df_db_schema["table"] == row["table"] 162 | if pd.notna(row["schema"]): 163 | mask &= df_db_schema["schema"] == row["schema"] 164 | wildcard_schema = df_db_schema[mask][ 165 | ["schema", "table", "column"] 166 | ].assign(exact_match=True) 167 | else: 168 | wildcard_schema = df_db_schema[mask][ 169 | ["schema", "table", "column"] 170 | ].assign(exact_match=False) 171 | expanded_wildcard_deps.append(wildcard_schema) 172 | 173 | df_extracted = pd.concat( 174 | [regular_deps, *expanded_wildcard_deps], ignore_index=True 175 | ) 176 | 177 | # Exact schema matches 178 | exact_matches = ( 179 | df_extracted[df_extracted["schema"].notna()] 180 | .merge(df_db_schema, how="inner") 181 | .fillna({"exact_match": True}) 182 | ) 183 | 184 | # Schema-agnostic matches (ignoring schema column) 185 | schemaless_matches = ( 186 | df_extracted[df_extracted["schema"].isna()] 187 | .drop(columns="schema") 188 | .merge(df_db_schema, how="inner") 189 | .fillna({"exact_match": False}) 190 | ) 191 | 192 | # Combine all results & remove duplicates with priority to exact matches 193 | df_merged_schemas = ( 194 | pd.concat([exact_matches, schemaless_matches, df_no_columns], ignore_index=True) 195 | .reindex(columns=["schema", "table", "column", "data_type", "exact_match"]) 196 | # Sort values to give priority to exact matches 197 | .sort_values( 198 | by=["schema", "table", "column", "data_type", "exact_match"], 199 | ascending=[True, True, True, True, False], 200 | 
na_position="last", 201 | ) 202 | # Drop duplicates (keep exact matches) 203 | .drop_duplicates(subset=["schema", "table", "column", "data_type"]) 204 | .reset_index(drop=True) 205 | ) 206 | 207 | return df_merged_schemas 208 | 209 | 210 | def schema_diff( 211 | df_extracted_schema: pd.DataFrame, df_db_schema: pd.DataFrame, copy: bool = True 212 | ) -> pd.DataFrame: 213 | """Checks if extracted schema entries exist in the database schema. 214 | 215 | Args: 216 | df_extracted_schema: Extracted table-column dependencies 217 | df_db_schema: Actual database schema information 218 | copy: Whether to create a copy of the input DataFrame 219 | 220 | Returns: 221 | The extracted schema with an added `match_db` flag 222 | """ 223 | # Copy dataframe to avoid in-place update 224 | if copy: 225 | df_extracted_schema = df_extracted_schema.copy() 226 | 227 | # Create sets for quick lookup 228 | db_exact_matches = set( 229 | zip( 230 | df_db_schema["schema"], 231 | df_db_schema["table"], 232 | df_db_schema["column"], 233 | strict=False, 234 | ) 235 | ) 236 | db_table_matches = set( 237 | zip(df_db_schema["schema"], df_db_schema["table"], strict=False) 238 | ) 239 | db_schema_agnostic = set( 240 | zip(df_db_schema["table"], df_db_schema["column"], strict=False) 241 | ) 242 | db_table_agnostic = set(df_db_schema["table"]) 243 | 244 | def check_existence(row: pd.Series) -> bool: 245 | """Helper function to determine if a row exists in the DB schema.""" 246 | if pd.notna(row["schema"]): 247 | if row["column"] == "*": 248 | return (row["schema"], row["table"]) in db_table_matches 249 | return (row["schema"], row["table"], row["column"]) in db_exact_matches 250 | else: 251 | if row["column"] == "*": 252 | return row["table"] in db_table_agnostic 253 | return (row["table"], row["column"]) in db_schema_agnostic 254 | 255 | # Apply vectorized check 256 | df_extracted_schema["match_db"] = df_extracted_schema.apply(check_existence, axis=1) 257 | 258 | return df_extracted_schema 259 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | """Test configuration and fixtures for all tests. 2 | 3 | This module provides pytest configuration, custom command-line options, 4 | and fixtures shared across test modules. 5 | """ 6 | 7 | from pathlib import Path 8 | 9 | import pytest 10 | 11 | from sqldeps.llm_parsers import BaseSQLExtractor, create_extractor 12 | 13 | # Base paths 14 | TEST_DATA_DIR = Path(__file__).parent / "data" 15 | SQL_DIR = TEST_DATA_DIR / "sql" 16 | EXPECTED_OUTPUT_DIR = TEST_DATA_DIR / "expected_outputs" 17 | 18 | 19 | def pytest_addoption(parser: pytest.Parser) -> None: 20 | """Register custom pytest command-line options. 21 | 22 | Args: 23 | parser: Pytest command-line parser 24 | """ 25 | parser.addoption( 26 | "--framework", 27 | action="store", 28 | default="litellm", 29 | help="Specify the framework to use (litellm, openai, groq, deepseek)", 30 | ) 31 | parser.addoption( 32 | "--model", 33 | action="store", 34 | default=None, 35 | help="Specify the model to use within the selected framework", 36 | ) 37 | parser.addoption( 38 | "--prompt", 39 | action="store", 40 | default=None, 41 | help="Specify the path to the prompt yml file to use a custom prompt", 42 | ) 43 | 44 | 45 | def pytest_configure(config: pytest.Config) -> None: 46 | """Register custom markers. 
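
    Example (how a test might opt into these markers; illustrative only):

        import pytest

        @pytest.mark.llm
        @pytest.mark.slow
        def test_full_extraction(extractor):
            profile = extractor.extract_from_query("SELECT id FROM users")
            assert "users" in profile.dependencies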
47 | 48 | Args: 49 | config: Pytest configuration object 50 | """ 51 | config.addinivalue_line( 52 | "markers", 53 | "llm: mark tests that require LLM API calls (typically skipped in CI/CD)", 54 | ) 55 | config.addinivalue_line( 56 | "markers", "integration: mark tests that integrate with external services" 57 | ) 58 | config.addinivalue_line("markers", "slow: mark tests that are slow to execute") 59 | 60 | 61 | def pytest_collection_modifyitems( 62 | items: list[pytest.Item], config: pytest.Config 63 | ) -> None: 64 | """Skip slow tests when only llm marker is specified.""" 65 | # Get the value of -m if specified 66 | markexpr = config.getoption("-m", default="") 67 | 68 | # Check if "llm" is specified but "slow" is not 69 | if "llm" in markexpr and "slow" not in markexpr: 70 | skip_marker = pytest.mark.skip( 71 | reason=( 72 | "Slow tests are skipped by default. Use -m 'llm and slow' to run them." 73 | ) 74 | ) 75 | for item in items: 76 | # If the test has both llm and slow markers, skip it 77 | if "slow" in item.keywords and "llm" in item.keywords: 78 | item.add_marker(skip_marker) 79 | 80 | 81 | @pytest.fixture 82 | def extractor(request: pytest.FixtureRequest) -> BaseSQLExtractor: 83 | """Create an extractor based on command-line options. 84 | 85 | Args: 86 | request: Pytest request object 87 | 88 | Returns: 89 | A configured SQLDeps extractor 90 | """ 91 | framework = request.config.getoption("--framework") 92 | model = request.config.getoption("--model") 93 | prompt = request.config.getoption("--prompt") 94 | 95 | return create_extractor(framework, model, prompt_path=prompt) 96 | -------------------------------------------------------------------------------- /tests/data/expected_outputs/example10_expected.json: -------------------------------------------------------------------------------- 1 | { 2 | "dependencies": { 3 | "customers": ["customer_name", "id"], 4 | "employees": [], 5 | "logs": [], 6 | "my_db.orders": ["order_date", "order_id", "total_amount"], 7 | "products": ["product_id", "product_name"], 8 | "reports.sales_report": ["customer_name", "product_name", "sale_id"], 9 | "sales": ["amount", "customer_id", "id"] 10 | }, 11 | "outputs": { 12 | "logs": [], 13 | "reports.sales_report": ["customer_name", "product_name", "sale_id"] 14 | } 15 | } -------------------------------------------------------------------------------- /tests/data/expected_outputs/example1_expected.json: -------------------------------------------------------------------------------- 1 | { 2 | "dependencies": { 3 | "users": ["id", "name"] 4 | }, 5 | "outputs": { 6 | 7 | } 8 | } -------------------------------------------------------------------------------- /tests/data/expected_outputs/example2_expected.json: -------------------------------------------------------------------------------- 1 | { 2 | "dependencies": { 3 | "users": [] 4 | }, 5 | "outputs": { 6 | 7 | } 8 | } -------------------------------------------------------------------------------- /tests/data/expected_outputs/example3_expected.json: -------------------------------------------------------------------------------- 1 | { 2 | "dependencies": { 3 | "my_db.users": ["id", "name"], 4 | "orders": ["order_id", "user_id"] 5 | }, 6 | "outputs": { 7 | 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /tests/data/expected_outputs/example4_expected.json: -------------------------------------------------------------------------------- 1 | { 2 | "dependencies": { 3 | "my_db.users": ["email", "id", 
"name", "status"], 4 | "orders": ["order_date", "order_id", "order_type", "priority_level", "shipping_status", "total_amount", "user_id"] 5 | }, 6 | "outputs": { 7 | 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /tests/data/expected_outputs/example5_expected.json: -------------------------------------------------------------------------------- 1 | { 2 | "dependencies": { 3 | "orders": ["user_id"], 4 | "users": ["id", "name"] 5 | }, 6 | "outputs": { 7 | 8 | } 9 | } -------------------------------------------------------------------------------- /tests/data/expected_outputs/example6_expected.json: -------------------------------------------------------------------------------- 1 | { 2 | "dependencies": { 3 | "orders": ["user_id"], 4 | "users": ["id", "name"] 5 | }, 6 | "outputs": { 7 | 8 | } 9 | } -------------------------------------------------------------------------------- /tests/data/expected_outputs/example7_expected.json: -------------------------------------------------------------------------------- 1 | { 2 | "dependencies": { 3 | "orders": ["user_id"], 4 | "users": ["id", "name"] 5 | }, 6 | "outputs": { 7 | 8 | } 9 | } -------------------------------------------------------------------------------- /tests/data/expected_outputs/example8_expected.json: -------------------------------------------------------------------------------- 1 | { 2 | "dependencies": { 3 | "build_public.Property_Shape": ["PropertyId", "ShapeId"], 4 | "build_spatial.Shape_Defor": ["ShapeId", "Year", "areaha"], 5 | "web_import.Api_Property_Defor": ["Ha", "PropertyId", "Year"] 6 | }, 7 | "outputs": { 8 | "web_import.Api_Property_Defor": ["Ha", "PropertyId", "Year"] 9 | } 10 | } -------------------------------------------------------------------------------- /tests/data/expected_outputs/example9_expected.json: -------------------------------------------------------------------------------- 1 | { 2 | "dependencies": { 3 | "orders": ["user_id"], 4 | "pgi_shape_clusters": ["PropertyGroupId", "ShapeCluster", "ShapeGroupId"], 5 | "spatial.Shape": ["ShapeId", "geom"], 6 | "users": ["id", "name"] 7 | }, 8 | "outputs": { 9 | "pgi_shape_geom_clusters": ["PropertyGroupId", "ShapeCluster", "ShapeGroupId", "geom"] 10 | } 11 | } -------------------------------------------------------------------------------- /tests/data/oneshot.json: -------------------------------------------------------------------------------- 1 | { 2 | "dependencies": { 3 | "schema1.users": ["user_id", "username", "status", "*", "registration_date"], 4 | "schema1.orders": ["user_id", "order_id", "order_date", "total_amount", "customer_id"], 5 | "schema1.order_items": ["order_id", "product_id", "quantity", "order_date"], 6 | "schema1.products": ["product_id", "product_name", "category", "current_stock", "stock_status", "last_updated"], 7 | "schema2.customer_metrics": ["customer_id", "monthly_order_count", "monthly_spend", "last_updated"], 8 | "schema2.audit_logs": [], 9 | "schema2.daily_summary": [] 10 | }, 11 | "outputs": { 12 | "schema2.customer_metrics": ["customer_id", "monthly_order_count", "monthly_spend", "last_updated"], 13 | "schema2.audit_logs": [], 14 | "schema2.daily_summary": ["date", "total_orders", "total_revenue", "avg_order_value"], 15 | "schema1.products": ["stock_status", "last_updated"], 16 | "schema2.monthly_report": ["month", "category", "order_count", "customer_count", "total_items_sold", "total_revenue"] 17 | } 18 | } 
-------------------------------------------------------------------------------- /tests/data/oneshot.sql: -------------------------------------------------------------------------------- 1 | -- This oneshot example demonstrates key SQL dependency extraction scenarios 2 | -- including table dependencies, outputs, temporary artifacts, truncates, and more 3 | 4 | -- Scenario 1: Simple SELECT with JOIN and WHERE clauses 5 | SELECT 6 | u.user_id, 7 | u.username, 8 | o.order_date, 9 | o.total_amount, 10 | p.product_name 11 | FROM 12 | schema1.users u 13 | JOIN 14 | schema1.orders o ON u.user_id = o.user_id 15 | JOIN 16 | schema1.order_items oi ON o.order_id = oi.order_id 17 | JOIN 18 | schema1.products p ON oi.product_id = p.product_id 19 | WHERE 20 | o.order_date > '2023-01-01' 21 | AND p.category = 'Electronics' 22 | AND u.status = 'active'; 23 | 24 | -- Scenario 2: CTE and INSERT operation with columns 25 | WITH recent_orders AS ( 26 | SELECT 27 | customer_id, 28 | COUNT(*) as order_count, 29 | SUM(total_amount) as total_spent 30 | FROM 31 | schema1.orders 32 | WHERE 33 | order_date > CURRENT_DATE - INTERVAL '30 days' 34 | GROUP BY 35 | customer_id 36 | ) 37 | INSERT INTO schema2.customer_metrics (customer_id, monthly_order_count, monthly_spend, last_updated) 38 | SELECT 39 | customer_id, 40 | order_count, 41 | total_spent, 42 | CURRENT_TIMESTAMP 43 | FROM 44 | recent_orders 45 | WHERE 46 | order_count > 0; 47 | 48 | -- Scenario 3: TRUNCATE alone (should appear in both dependencies and outputs) 49 | TRUNCATE TABLE schema2.audit_logs; 50 | 51 | -- Scenario 4: TRUNCATE followed by population of specific columns 52 | TRUNCATE TABLE schema2.daily_summary; 53 | 54 | INSERT INTO schema2.daily_summary (date, total_orders, total_revenue, avg_order_value) 55 | SELECT 56 | CURRENT_DATE, 57 | COUNT(*), 58 | SUM(total_amount), 59 | AVG(total_amount) 60 | FROM 61 | schema1.orders 62 | WHERE 63 | order_date = CURRENT_DATE; 64 | 65 | -- Scenario 5: UPDATE with subquery 66 | UPDATE schema1.products 67 | SET 68 | stock_status = 69 | CASE 70 | WHEN current_stock = 0 THEN 'Out of Stock' 71 | WHEN current_stock < 10 THEN 'Low Stock' 72 | ELSE 'In Stock' 73 | END, 74 | last_updated = CURRENT_TIMESTAMP 75 | WHERE 76 | product_id IN ( 77 | SELECT 78 | product_id 79 | FROM 80 | schema1.order_items 81 | WHERE 82 | order_date > CURRENT_DATE - INTERVAL '7 days' 83 | ); 84 | 85 | -- Scenario 6: CREATE TABLE and immediate population 86 | CREATE TABLE schema2.monthly_report AS 87 | SELECT 88 | DATE_TRUNC('month', o.order_date) AS month, 89 | p.category, 90 | COUNT(DISTINCT o.order_id) AS order_count, 91 | COUNT(DISTINCT o.user_id) AS customer_count, 92 | SUM(oi.quantity) AS total_items_sold, 93 | SUM(o.total_amount) AS total_revenue 94 | FROM 95 | schema1.orders o 96 | JOIN 97 | schema1.order_items oi ON o.order_id = oi.order_id 98 | JOIN 99 | schema1.products p ON oi.product_id = p.product_id 100 | GROUP BY 101 | DATE_TRUNC('month', o.order_date), 102 | p.category; 103 | 104 | -- Scenario 7: SELECT * (should generate ["*"] in dependencies) 105 | SELECT * 106 | FROM schema1.users 107 | WHERE registration_date > CURRENT_DATE - INTERVAL '90 days'; -------------------------------------------------------------------------------- /tests/data/sql/example1.sql: -------------------------------------------------------------------------------- 1 | -- Simple query selecting a subset of columns 2 | SELECT id, name FROM users -------------------------------------------------------------------------------- 
/tests/data/sql/example10.sql: -------------------------------------------------------------------------------- 1 | -- PostgreSQL function that uses CTEs and creates a table 2 | CREATE OR REPLACE FUNCTION generate_sales_report() 3 | RETURNS void AS $$ 4 | BEGIN 5 | -- Use CTEs to process data 6 | WITH cte_sales AS ( 7 | SELECT 8 | s.id AS sale_id, 9 | s.amount, 10 | c.customer_name 11 | FROM sales s 12 | JOIN customers c ON s.customer_id = c.id 13 | ), 14 | cte_products AS ( 15 | SELECT 16 | p.product_id, 17 | p.product_name 18 | FROM products p 19 | ) 20 | -- Insert the processed data into a new table 21 | INSERT INTO reports.sales_report (sale_id, customer_name, product_name) 22 | SELECT 23 | cte_sales.sale_id, 24 | cte_sales.customer_name, 25 | cte_products.product_name 26 | FROM cte_sales 27 | JOIN cte_products ON cte_sales.sale_id = cte_products.product_id; 28 | END; 29 | $$ LANGUAGE plpgsql; 30 | 31 | -- Truncate a table 32 | TRUNCATE TABLE logs; 33 | 34 | -- Query from a specific database 35 | SELECT 36 | my_db.orders.order_id, 37 | my_db.orders.order_date, 38 | my_db.orders.total_amount 39 | FROM my_db.orders; 40 | 41 | -- Select all columns from a table 42 | SELECT * 43 | FROM employees 44 | LIMIT 10; 45 | -------------------------------------------------------------------------------- /tests/data/sql/example2.sql: -------------------------------------------------------------------------------- 1 | -- Simple query selecting all columns 2 | SELECT * FROM users LIMIT 100 -------------------------------------------------------------------------------- /tests/data/sql/example3.sql: -------------------------------------------------------------------------------- 1 | -- Query with table alias, with and without database specification, and join 2 | SELECT u.id, u.name, o.order_id 3 | FROM my_db.users u 4 | JOIN orders AS o ON u.id = o.user_id -------------------------------------------------------------------------------- /tests/data/sql/example4.sql: -------------------------------------------------------------------------------- 1 | -- Query with table alias, with and without database specification, and join, and where clauses 2 | SELECT u.id, u.name, o.order_id 3 | FROM my_db.users u 4 | JOIN orders AS o ON u.id = o.user_id 5 | WHERE u.status = 'active' 6 | AND o.order_date >= '2024-01-01' 7 | AND o.total_amount > 100.00 8 | AND u.email LIKE '%@company.com' 9 | AND o.order_type IN ('retail', 'wholesale') 10 | AND ( 11 | o.shipping_status = 'pending' 12 | OR (o.shipping_status = 'processed' AND o.priority_level = 'high') 13 | ); -------------------------------------------------------------------------------- /tests/data/sql/example5.sql: -------------------------------------------------------------------------------- 1 | -- Simple CTE 2 | WITH user_orders AS ( 3 | SELECT user_id, COUNT(*) as order_count 4 | FROM orders 5 | GROUP BY user_id 6 | ) 7 | SELECT u.name, uo.order_count 8 | FROM users u 9 | JOIN user_orders uo ON u.id = uo.user_id; -------------------------------------------------------------------------------- /tests/data/sql/example6.sql: -------------------------------------------------------------------------------- 1 | -- Simple Subquery 1 2 | SELECT 3 | u.name, 4 | ( 5 | SELECT COUNT(*) 6 | FROM orders o 7 | WHERE o.user_id = u.id 8 | GROUP BY o.user_id 9 | ) as order_count 10 | FROM users u; -------------------------------------------------------------------------------- /tests/data/sql/example7.sql: 
-------------------------------------------------------------------------------- 1 | -- Simple Subquery 2 2 | SELECT 3 | u.name, 4 | uo.order_count 5 | FROM users u 6 | JOIN ( 7 | SELECT 8 | user_id, 9 | COUNT(*) as order_count 10 | FROM orders 11 | GROUP BY user_id 12 | ) uo ON u.id = uo.user_id; -------------------------------------------------------------------------------- /tests/data/sql/example8.sql: -------------------------------------------------------------------------------- 1 | -- Postgres Function 2 | CREATE OR REPLACE FUNCTION web_import."build_Api_Property_Defor"() 3 | RETURNS void 4 | LANGUAGE plpgsql 5 | AS $function$BEGIN 6 | TRUNCATE TABLE web_import."Api_Property_Defor"; 7 | 8 | INSERT INTO web_import."Api_Property_Defor"( 9 | "PropertyId", "Year", "Ha" 10 | ) 11 | SELECT ps."PropertyId", d."Year", avg("Defor") AS "Ha" 12 | FROM build_public."Property_Shape" ps 13 | INNER JOIN ( 14 | SELECT "ShapeId", "Year"::INTEGER, SUM("areaha") AS "Defor" 15 | FROM build_spatial."Shape_Defor" 16 | WHERE "Year"::text ~ '^[0-9]+$' 17 | -- and "areaha">6.25 18 | GROUP BY "ShapeId", "Year"::INTEGER 19 | ) d 20 | ON 21 | d."ShapeId" = ps."ShapeId" 22 | WHERE ps."PropertyId" IS NOT NULL 23 | GROUP BY ps."PropertyId", d."Year"; 24 | 25 | END 26 | $function$ -------------------------------------------------------------------------------- /tests/data/sql/example9.sql: -------------------------------------------------------------------------------- 1 | -- Multiple queries with CTEs & function 2 | CREATE OR REPLACE FUNCTION make_pgi_shape_geom_clusters() 3 | RETURNS VOID 4 | LANGUAGE plpgsql 5 | AS $function$ 6 | BEGIN 7 | 8 | -- Build table with cluster + geom data 9 | DROP TABLE IF EXISTS pgi_shape_geom_clusters CASCADE; 10 | CREATE TABLE pgi_shape_geom_clusters AS 11 | SELECT 12 | pgic."PropertyGroupId", 13 | pgic."ShapeGroupId", 14 | sh.geom, 15 | pgic."ShapeCluster" 16 | FROM 17 | pgi_shape_clusters pgic 18 | LEFT JOIN 19 | spatial."Shape" sh 20 | ON 21 | pgic."PropertyGroupId" = sh."ShapeId"; 22 | 23 | -- Integrity check: A Property observation should have at most one row 24 | ALTER TABLE pgi_shape_geom_clusters ADD PRIMARY KEY ("PropertyGroupId","ShapeGroupId"); 25 | ANALYZE VERBOSE pgi_shape_geom_clusters; 26 | 27 | END 28 | $function$; 29 | 30 | WITH user_orders AS ( 31 | SELECT user_id, COUNT(*) as order_count 32 | FROM orders 33 | GROUP BY user_id 34 | ) 35 | SELECT u.name, uo.order_count 36 | FROM users u 37 | JOIN user_orders uo ON u.id = uo.user_id; -------------------------------------------------------------------------------- /tests/functional/test_sql.py: -------------------------------------------------------------------------------- 1 | """Functional tests for SQL dependency extraction. 2 | 3 | This module tests the end-to-end functionality of SQL dependency extraction 4 | against a set of predefined SQL files with expected outputs. 5 | 6 | The module provides two testing approaches: 7 | 1. A fast batch test (test_sql_dependency_extraction_batch) that extracts 8 | dependencies from all SQL files at once using parallel processing 9 | 2. A slower individual test (test_sql_dependency_extraction_individual) that 10 | processes each file separately, which is useful for debugging specific files 11 | 12 | The batch approach is more efficient as it leverages parallel processing 13 | and extracts dependencies from all files in a single operation. 
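
A batch run below boils down to roughly (sketch):

    results = extractor.extract_from_folder(SQL_DIR, recursive=False, n_workers=-1)
    profile = results[str(SQL_DIR / "example1.sql")]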
14 | """ 15 | 16 | import json 17 | from pathlib import Path 18 | 19 | import pytest 20 | 21 | from sqldeps.llm_parsers import BaseSQLExtractor 22 | 23 | TEST_DATA_DIR = Path(__file__).parent.parent / "data" 24 | SQL_DIR = TEST_DATA_DIR / "sql" 25 | EXPECTED_OUTPUT_DIR = TEST_DATA_DIR / "expected_outputs" 26 | 27 | 28 | # Create a list of test cases from data directory 29 | def get_test_cases() -> list: 30 | """Create a list of test cases from data directory. 31 | 32 | Each test case is a tuple of (sql_file, expected_output_file). 33 | 34 | Returns: 35 | list: List of test case tuples 36 | """ 37 | test_cases = [] 38 | for sql_file in SQL_DIR.glob("example*.sql"): 39 | expected_file = EXPECTED_OUTPUT_DIR / f"{sql_file.stem}_expected.json" 40 | if expected_file.exists(): 41 | test_cases.append((sql_file, expected_file)) 42 | return test_cases 43 | 44 | 45 | def load_expected_output(expected_output_file: Path) -> dict: 46 | """Load the expected output from a JSON file. 47 | 48 | Args: 49 | expected_output_file: Path to the expected output JSON file 50 | 51 | Returns: 52 | dict: The expected output as a dictionary 53 | """ 54 | with open(expected_output_file) as f: 55 | return json.load(f) 56 | 57 | 58 | @pytest.mark.llm 59 | def test_sql_dependency_extraction_batch(extractor: BaseSQLExtractor) -> None: 60 | """Test extraction of dependencies from all SQL files at once. 61 | 62 | This is more efficient than testing each file individually as it 63 | extracts dependencies from all files in a single batch. 64 | 65 | Args: 66 | extractor: SQLDeps extractor fixture 67 | """ 68 | # Get all the test cases 69 | test_cases = get_test_cases() 70 | 71 | # Extract dependencies from all SQL files at once 72 | results = extractor.extract_from_folder( 73 | SQL_DIR, recursive=False, n_workers=-1, use_cache=False, rpm=100 74 | ) 75 | 76 | # Verify each result against its expected output 77 | for sql_file, expected_output_file in test_cases: 78 | expected_output = load_expected_output(expected_output_file) 79 | extracted = results[str(sql_file)].to_dict() 80 | 81 | # Use a more descriptive assertion message 82 | assert extracted == expected_output, f"Mismatch for {sql_file.name}" 83 | 84 | 85 | # Keep the original test for backward compatibility but mark it as slow 86 | @pytest.mark.parametrize( 87 | "sql_file,expected_output_file", 88 | get_test_cases(), 89 | ids=lambda x: x.name if isinstance(x, Path) else str(x), 90 | ) 91 | @pytest.mark.llm 92 | @pytest.mark.slow 93 | def test_sql_dependency_extraction_individual( 94 | sql_file: Path, expected_output_file: Path, extractor: BaseSQLExtractor 95 | ) -> None: 96 | """Test extraction of dependencies from SQL files individually. 97 | 98 | This is slower than the batch test but useful for debugging specific files. 99 | This test will only run when both 'llm' and 'slow' markers are specified. 
100 | 101 | Args: 102 | sql_file: Path to SQL file 103 | expected_output_file: Path to expected output JSON file 104 | extractor: SQLDeps extractor fixture 105 | """ 106 | # Load SQL code 107 | with open(sql_file) as f: 108 | sql = f.read() 109 | 110 | # Load expected output 111 | with open(expected_output_file) as f: 112 | expected_output = json.load(f) 113 | 114 | # Run the extractor 115 | dependency = extractor.extract_from_query(sql) 116 | 117 | # Assert the output matches the expected 118 | assert dependency.to_dict() == expected_output, f"Mismatch for {sql_file.name}" 119 | -------------------------------------------------------------------------------- /tests/integration/test_database.py: -------------------------------------------------------------------------------- 1 | """Integration tests for database connectors. 2 | 3 | These tests connect to an actual PostgreSQL database to verify 4 | schema retrieval and validation functionality. 5 | """ 6 | 7 | import os 8 | 9 | import pandas as pd 10 | import pytest 11 | 12 | from sqldeps.database import PostgreSQLConnector 13 | 14 | # Skip all tests if no test database is configured 15 | pytestmark = [ 16 | pytest.mark.skipif( 17 | os.environ.get("TEST_DB_HOST") is None, reason="Test database not configured" 18 | ), 19 | pytest.mark.integration, 20 | ] 21 | 22 | 23 | class TestPostgreSQLIntegration: 24 | """Integration tests for PostgreSQL connector. 25 | 26 | To run these tests, set the following environment variables: 27 | - TEST_DB_HOST 28 | - TEST_DB_PORT (optional, defaults to 5432) 29 | - TEST_DB_NAME 30 | - TEST_DB_USER 31 | - TEST_DB_PASSWORD 32 | """ 33 | 34 | @pytest.fixture 35 | def db_connector(self) -> PostgreSQLConnector: 36 | """Create a database connector for testing. 37 | 38 | Returns: 39 | PostgreSQLConnector: Configured database connector 40 | """ 41 | return PostgreSQLConnector( 42 | host=os.environ.get("TEST_DB_HOST"), 43 | port=int(os.environ.get("TEST_DB_PORT", "5432")), 44 | database=os.environ.get("TEST_DB_NAME"), 45 | username=os.environ.get("TEST_DB_USER"), 46 | password=os.environ.get("TEST_DB_PASSWORD"), 47 | ) 48 | 49 | def test_connection(self, db_connector: PostgreSQLConnector) -> None: 50 | """Test that connection to database succeeds.""" 51 | # Just creating the connector should establish a connection 52 | # If it doesn't, an exception will be raised and the test will fail 53 | assert db_connector is not None 54 | assert hasattr(db_connector, "engine") 55 | assert hasattr(db_connector, "inspector") 56 | 57 | def test_get_schema(self, db_connector: PostgreSQLConnector) -> None: 58 | """Test retrieving schema information.""" 59 | # Get schema for the public schema 60 | schema = db_connector.get_schema("public") 61 | 62 | # Verify result structure 63 | assert isinstance(schema, pd.DataFrame) 64 | assert set(schema.columns) == {"schema", "table", "column", "data_type"} 65 | assert len(schema) > 0 # Should have at least some tables 66 | 67 | # Verify all rows have the correct schema 68 | assert all(schema["schema"] == "public") 69 | 70 | # Commented out because it takes too long to run 71 | # @pytest.mark.slow 72 | # def test_get_schema_multiple(self, db_connector): 73 | # """Test retrieving schema from multiple schemas.""" 74 | # # Get schema from all available schemas 75 | # schema = db_connector.get_schema() 76 | 77 | # # Verify result 78 | # assert isinstance(schema, pd.DataFrame) 79 | # assert len(schema) > 0 80 | 81 | # # Should include multiple schemas if available 82 | # schemas = 
schema["schema"].unique() 83 | # assert len(schemas) > 0 84 | 85 | def test_export_schema_csv( 86 | self, db_connector: PostgreSQLConnector, tmp_path: str 87 | ) -> None: 88 | """Test exporting schema to CSV. 89 | 90 | Args: 91 | db_connector: PostgreSQL connector fixture 92 | tmp_path: Pytest temporary path fixture 93 | """ 94 | # Export schema to a temporary file 95 | output_file = tmp_path / "schema.csv" 96 | db_connector.export_schema_csv(output_file, schemas="public") 97 | 98 | # Verify file was created 99 | assert output_file.exists() 100 | 101 | # Verify file content 102 | df = pd.read_csv(output_file) 103 | assert set(df.columns) == {"schema", "table", "column", "data_type"} 104 | assert len(df) > 0 105 | -------------------------------------------------------------------------------- /tests/unit/app/__init__.py: -------------------------------------------------------------------------------- 1 | """Unit tests for SQLDeps web application. 2 | 3 | This package contains unit tests for the Streamlit-based web application. 4 | """ 5 | -------------------------------------------------------------------------------- /tests/unit/database/test_postgresql.py: -------------------------------------------------------------------------------- 1 | """Unit tests for PostgreSQLConnector. 2 | 3 | This module contains unit tests for the PostgreSQL connector functionality. 4 | """ 5 | 6 | from pathlib import Path 7 | from unittest.mock import MagicMock, mock_open, patch 8 | 9 | import pandas as pd 10 | import pytest 11 | import yaml 12 | 13 | from sqldeps.database.postgresql import PostgreSQLConnector 14 | 15 | 16 | class TestPostgreSQLConnector: 17 | """Test suite for PostgreSQLConnector.""" 18 | 19 | def test_initialization_with_params(self) -> None: 20 | """Test initialization with direct parameters.""" 21 | # Mock both create_engine and inspect 22 | with ( 23 | patch("sqldeps.database.postgresql.create_engine") as mock_engine, 24 | patch("sqldeps.database.postgresql.inspect") as mock_inspect, 25 | ): 26 | # Set up the mock inspector that will be returned 27 | mock_inspector = MagicMock() 28 | mock_inspect.return_value = mock_inspector 29 | 30 | connector = PostgreSQLConnector( 31 | host="localhost", 32 | port=5432, 33 | database="testdb", 34 | username="user", 35 | password="pass", 36 | ) 37 | 38 | # Verify engine was created 39 | mock_engine.assert_called_once() 40 | # Verify inspector was created 41 | mock_inspect.assert_called_once() 42 | assert connector.inspector == mock_inspector 43 | 44 | def test_initialization_missing_params(self) -> None: 45 | """Test initialization fails with missing parameters.""" 46 | with ( 47 | pytest.raises(ValueError, match="Missing required database parameters"), 48 | patch("os.getenv", return_value=None), 49 | ): 50 | PostgreSQLConnector( 51 | host=None, database="testdb", username="user", password="pass" 52 | ) 53 | 54 | def test_initialization_with_config_file(self) -> None: 55 | """Test initialization with YAML config file.""" 56 | config_data = { 57 | "database": { 58 | "host": "dbhost", 59 | "port": 5432, 60 | "database": "configdb", 61 | "username": "configuser", 62 | "password": "configpass", 63 | } 64 | } 65 | 66 | with ( 67 | patch("builtins.open", mock_open(read_data=yaml.dump(config_data))), 68 | patch("sqldeps.database.postgresql.create_engine") as mock_engine, 69 | patch("sqldeps.database.postgresql.inspect") as mock_inspect, 70 | patch.object(Path, "exists", return_value=True), 71 | ): 72 | # Set up the mock inspector 73 | mock_inspector = 
MagicMock() 74 | mock_inspect.return_value = mock_inspector 75 | 76 | PostgreSQLConnector(config_path=Path("config.yml")) 77 | 78 | # Verify engine was created with correct parameters 79 | mock_engine.assert_called_once() 80 | # Verify connection string contains expected values 81 | conn_string = mock_engine.call_args[0][0] 82 | assert "dbhost" in conn_string 83 | assert "configdb" in conn_string 84 | assert "configuser" in conn_string 85 | 86 | def test_get_schema(self) -> None: 87 | """Test schema retrieval functionality.""" 88 | with ( 89 | patch("sqldeps.database.postgresql.create_engine"), 90 | patch("sqldeps.database.postgresql.inspect") as mock_inspect, 91 | ): 92 | # Create mock inspector with appropriate return values 93 | mock_inspector = MagicMock() 94 | mock_inspector.get_schema_names.return_value = ["public"] 95 | mock_inspector.get_table_names.return_value = ["users"] 96 | mock_inspector.get_columns.return_value = [ 97 | {"name": "id", "type": "INTEGER"}, 98 | {"name": "name", "type": "VARCHAR"}, 99 | ] 100 | mock_inspect.return_value = mock_inspector 101 | 102 | # Create connector with mocked components 103 | connector = PostgreSQLConnector( 104 | host="localhost", database="testdb", username="user", password="pass" 105 | ) 106 | 107 | # Test get_schema method 108 | result = connector.get_schema() 109 | 110 | # Verify the result 111 | assert isinstance(result, pd.DataFrame) 112 | assert len(result) == 2 # Two columns 113 | assert list(result.columns) == ["schema", "table", "column", "data_type"] 114 | assert list(result["column"]) == ["id", "name"] 115 | 116 | def test_get_schema_with_specific_schemas(self) -> None: 117 | """Test schema retrieval for specific schemas.""" 118 | with ( 119 | patch("sqldeps.database.postgresql.create_engine"), 120 | patch("sqldeps.database.postgresql.inspect") as mock_inspect, 121 | ): 122 | # Create mock inspector 123 | mock_inspector = MagicMock() 124 | mock_inspector.get_table_names.return_value = ["orders"] 125 | mock_inspector.get_columns.return_value = [ 126 | {"name": "order_id", "type": "INTEGER"} 127 | ] 128 | mock_inspect.return_value = mock_inspector 129 | 130 | # Create connector with mocked components 131 | connector = PostgreSQLConnector( 132 | host="localhost", database="testdb", username="user", password="pass" 133 | ) 134 | 135 | # Test get_schema method with specific schema 136 | result = connector.get_schema(schemas="sales") 137 | 138 | # Verify the result 139 | assert isinstance(result, pd.DataFrame) 140 | assert len(result) == 1 141 | assert result["schema"][0] == "sales" 142 | assert result["table"][0] == "orders" 143 | assert result["column"][0] == "order_id" 144 | 145 | def test_pgpass_lookup(self) -> None: 146 | """Test .pgpass file password lookup.""" 147 | pgpass_content = ( 148 | "localhost:5432:testdb:user:secretpass\n*:5432:*:admin:adminpass" 149 | ) 150 | 151 | with ( 152 | patch("builtins.open", mock_open(read_data=pgpass_content)), 153 | patch.object(Path, "home", return_value=Path("/home/user")), 154 | patch.object(Path, "exists", return_value=True), 155 | ): 156 | # Test exact match 157 | password = PostgreSQLConnector._get_password_from_pgpass( 158 | None, "user", "localhost", "testdb", 5432, None 159 | ) 160 | assert password == "secretpass" 161 | 162 | # Test wildcard match 163 | password = PostgreSQLConnector._get_password_from_pgpass( 164 | None, "admin", "somehost", "anydb", 5432, None 165 | ) 166 | assert password == "adminpass" 167 | 
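
The unit tests above exercise PostgreSQLConnector entirely through mocks. For orientation, a minimal usage sketch of the same connector follows, assembled only from the constructor arguments and methods these tests (and the integration tests earlier) cover; the host, credentials, and output filename are placeholder values, not part of the repository.

from pathlib import Path

from sqldeps.database import PostgreSQLConnector

# Placeholder credentials -- in practice they come from explicit arguments,
# a YAML config file, or ~/.pgpass, as the tests above demonstrate.
connector = PostgreSQLConnector(
    host="localhost",
    port=5432,
    database="mydb",
    username="analyst",
    password="secret",
)

# Retrieve the "public" schema as a DataFrame with columns
# schema / table / column / data_type.
schema_df = connector.get_schema("public")
print(schema_df.head())

# Export the same information to CSV.
connector.export_schema_csv(Path("schema.csv"), schemas="public")
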
-------------------------------------------------------------------------------- /tests/unit/llm_parsers/test_base.py: -------------------------------------------------------------------------------- 1 | """Unit tests for BaseSQLExtractor. 2 | 3 | This module contains tests for the common functionality provided by the 4 | BaseSQLExtractor abstract base class. 5 | """ 6 | 7 | import json 8 | from pathlib import Path 9 | from unittest.mock import MagicMock, mock_open, patch 10 | 11 | import pytest 12 | 13 | from sqldeps.llm_parsers import BaseSQLExtractor 14 | from sqldeps.models import SQLProfile 15 | 16 | 17 | class MockSQLExtractor(BaseSQLExtractor): 18 | """Test implementation of BaseSQLExtractor for unit testing. 19 | 20 | This class provides a concrete implementation of the abstract BaseSQLExtractor 21 | that can be used in tests with mocked LLM responses. 22 | """ 23 | 24 | def __init__( 25 | self, 26 | model: str = "test-model", 27 | params: dict | None = None, 28 | prompt_path: str | None = None, 29 | ) -> None: 30 | """Initialize the mock extractor. 31 | 32 | Args: 33 | model: Model name 34 | params: Additional parameters 35 | prompt_path: Path to custom prompt file 36 | """ 37 | super().__init__(model, params, prompt_path) 38 | 39 | def _query_llm(self, prompt: str) -> str: 40 | """Implement the abstract method from the parent class. 41 | 42 | In actual tests, this method will typically be mocked. 43 | 44 | Args: 45 | prompt: Prompt to send to the LLM 46 | 47 | Returns: 48 | Empty string (will be mocked in tests) 49 | """ 50 | return "" 51 | 52 | 53 | @pytest.fixture 54 | def mock_extractor() -> MockSQLExtractor: 55 | """Provide a mock SQL extractor for tests. 56 | 57 | Returns: 58 | MockSQLExtractor: A concrete implementation of BaseSQLExtractor 59 | """ 60 | return MockSQLExtractor() 61 | 62 | 63 | @pytest.fixture 64 | def mock_sql_response() -> callable: 65 | """Create a standard SQL dependency response. 
66 | 67 | Returns: 68 | function: A function that creates JSON responses with given dependencies/outputs 69 | """ 70 | 71 | def _create_response( 72 | dependencies: dict | None = None, outputs: dict | None = None 73 | ) -> str: 74 | return json.dumps( 75 | {"dependencies": dependencies or {}, "outputs": outputs or {}} 76 | ) 77 | 78 | return _create_response 79 | 80 | 81 | class TestBaseSQLExtractor: 82 | """Test suite for BaseSQLExtractor.""" 83 | 84 | def test_initialization(self, mock_extractor: MockSQLExtractor) -> None: 85 | """Test proper initialization of BaseSQLExtractor.""" 86 | assert mock_extractor.model == "test-model" 87 | assert mock_extractor.framework == "mocksql" 88 | assert mock_extractor.params == {"temperature": 0} 89 | 90 | def test_extract_from_query( 91 | self, mock_extractor: MockSQLExtractor, mock_sql_response: callable 92 | ) -> None: 93 | """Test extraction from a SQL query.""" 94 | response = mock_sql_response( 95 | dependencies={"table1": ["col1", "col2"]}, outputs={"table2": ["col3"]} 96 | ) 97 | 98 | mock_extractor._query_llm = MagicMock(return_value=response) 99 | result = mock_extractor.extract_from_query("SELECT col1, col2 FROM table1") 100 | 101 | assert isinstance(result, SQLProfile) 102 | assert result.dependencies == {"table1": ["col1", "col2"]} 103 | assert result.outputs == {"table2": ["col3"]} 104 | mock_extractor._query_llm.assert_called_once() 105 | 106 | def test_extract_from_file( 107 | self, mock_extractor: MockSQLExtractor, mock_sql_response: callable 108 | ) -> None: 109 | """Test extraction from a SQL file.""" 110 | mock_sql = "SELECT * FROM users" 111 | response = mock_sql_response(dependencies={"users": ["*"]}) 112 | 113 | mock_extractor._query_llm = MagicMock(return_value=response) 114 | 115 | with ( 116 | patch("builtins.open", mock_open(read_data=mock_sql)), 117 | patch.object(Path, "exists", return_value=True), 118 | ): 119 | result = mock_extractor.extract_from_file("fake_path.sql") 120 | 121 | assert result.dependencies == {"users": ["*"]} 122 | assert result.outputs == {} 123 | 124 | def test_file_not_found(self, mock_extractor: MockSQLExtractor) -> None: 125 | """Test handling of file not found.""" 126 | with ( 127 | patch.object(Path, "exists", return_value=False), 128 | pytest.raises(FileNotFoundError), 129 | ): 130 | mock_extractor.extract_from_file("nonexistent.sql") 131 | 132 | def test_extract_from_folder(self, mock_extractor: MockSQLExtractor) -> None: 133 | """Test extraction from a folder.""" 134 | with patch("sqldeps.llm_parsers.base.find_sql_files") as mock_find: 135 | # Setup mock files 136 | mock_files = [Path("file1.sql"), Path("file2.sql")] 137 | mock_find.return_value = mock_files 138 | 139 | # Mock file extraction 140 | mock_extractor.extract_from_file = MagicMock( 141 | return_value=SQLProfile(dependencies={"table1": ["col1"]}, outputs={}) 142 | ) 143 | 144 | # Explicitly disable cache usage 145 | result = mock_extractor.extract_from_folder( 146 | "test_folder", recursive=True, n_workers=1, use_cache=False 147 | ) 148 | 149 | # Verify results 150 | assert len(result) == len(mock_files) 151 | assert mock_extractor.extract_from_file.call_count == len(mock_files) 152 | 153 | @pytest.mark.parametrize( 154 | "response,error_pattern", 155 | [ 156 | ("Invalid JSON", "Failed to decode JSON"), 157 | ('{"only_dependencies": {}}', "Missing required keys"), 158 | ], 159 | ) 160 | def test_process_response_errors( 161 | self, mock_extractor: MockSQLExtractor, response: str, error_pattern: str 162 | ) -> None: 163 | """Test 
handling of different error conditions. 164 | 165 | Args: 166 | mock_extractor: Mock extractor fixture 167 | response: Response string to process 168 | error_pattern: Expected error message pattern 169 | """ 170 | with pytest.raises(ValueError, match=error_pattern): 171 | mock_extractor._process_response(response) 172 | 173 | def test_load_prompts_default(self, mock_extractor: MockSQLExtractor) -> None: 174 | """Test loading default prompts.""" 175 | # Define a dict that mimics parsed YAML 176 | mock_yaml_data = { 177 | "system_prompt": "test system prompt", 178 | "user_prompt": "test user prompt", 179 | } 180 | 181 | # Directly patch yaml.safe_load to return our mock data 182 | with patch("yaml.safe_load", return_value=mock_yaml_data): 183 | # Create a new extractor to trigger _load_prompts 184 | extractor = MockSQLExtractor() 185 | 186 | # Verify the prompts were loaded correctly 187 | assert extractor.prompts == mock_yaml_data 188 | assert extractor.prompts["system_prompt"] == "test system prompt" 189 | assert extractor.prompts["user_prompt"] == "test user prompt" 190 | 191 | def test_normalize_extensions(self) -> None: 192 | """Test normalization of file extensions.""" 193 | result = MockSQLExtractor._normalize_extensions({".SQL", "sql", ".Sql"}) 194 | assert result == {"sql"} 195 | 196 | result = MockSQLExtractor._normalize_extensions(None) 197 | assert result == {"sql"} # Default extension 198 | -------------------------------------------------------------------------------- /tests/unit/llm_parsers/test_deepseek.py: -------------------------------------------------------------------------------- 1 | """Unit tests for DeepseekExtractor. 2 | 3 | This module tests the DeepSeek-specific LLM implementation. 4 | """ 5 | 6 | from unittest.mock import MagicMock, patch 7 | 8 | import pytest 9 | 10 | from sqldeps.llm_parsers.deepseek import DeepseekExtractor 11 | 12 | 13 | class TestDeepseekExtractor: 14 | """Test suite for DeepseekExtractor.""" 15 | 16 | def test_initialization(self) -> None: 17 | """Test proper initialization with API key.""" 18 | with patch.dict("os.environ", {"DEEPSEEK_API_KEY": "fake-key"}): 19 | extractor = DeepseekExtractor(model="deepseek-chat") 20 | 21 | assert extractor.model == "deepseek-chat" 22 | assert extractor.framework == "deepseek" 23 | assert hasattr(extractor, "client") 24 | 25 | def test_initialization_without_api_key(self) -> None: 26 | """Test initialization fails without API key.""" 27 | with ( 28 | patch.dict("os.environ", clear=True), 29 | pytest.raises(ValueError, match="No API key provided"), 30 | ): 31 | DeepseekExtractor(model="deepseek-chat") 32 | 33 | def test_query_llm(self) -> None: 34 | """Test LLM query functionality.""" 35 | with patch.dict("os.environ", {"DEEPSEEK_API_KEY": "fake-key"}): 36 | extractor = DeepseekExtractor(model="deepseek-chat") 37 | 38 | # Mock the OpenAI client (which DeepseekExtractor uses) 39 | mock_response = MagicMock() 40 | mock_response.choices = [MagicMock()] 41 | mock_response.choices[ 42 | 0 43 | ].message.content = '{"dependencies": {}, "outputs": {}}' 44 | 45 | extractor.client = MagicMock() 46 | extractor.client.chat.completions.create.return_value = mock_response 47 | 48 | # Test the query 49 | result = extractor._query_llm("SELECT * FROM test") 50 | 51 | # Verify the response and method calls 52 | assert result == '{"dependencies": {}, "outputs": {}}' 53 | extractor.client.chat.completions.create.assert_called_once() 54 | 55 | # Verify correct parameters were passed 56 | call_args = 
extractor.client.chat.completions.create.call_args[1] 57 | assert call_args["model"] == "deepseek-chat" 58 | assert call_args["response_format"] == {"type": "json_object"} 59 | assert len(call_args["messages"]) == 2 60 | assert call_args["messages"][0]["role"] == "system" 61 | assert call_args["messages"][1]["role"] == "user" 62 | -------------------------------------------------------------------------------- /tests/unit/llm_parsers/test_groq.py: -------------------------------------------------------------------------------- 1 | """Unit tests for GroqExtractor. 2 | 3 | This module tests the Groq-specific LLM implementation. 4 | """ 5 | 6 | from unittest.mock import MagicMock, patch 7 | 8 | import pytest 9 | 10 | from sqldeps.llm_parsers.groq import GroqExtractor 11 | 12 | 13 | class TestGroqExtractor: 14 | """Test suite for GroqExtractor.""" 15 | 16 | def test_initialization(self) -> None: 17 | """Test proper initialization with API key.""" 18 | with patch.dict("os.environ", {"GROQ_API_KEY": "fake-key"}): 19 | extractor = GroqExtractor(model="llama-3.3-70b-versatile") 20 | 21 | assert extractor.model == "llama-3.3-70b-versatile" 22 | assert extractor.framework == "groq" 23 | assert hasattr(extractor, "client") 24 | 25 | def test_initialization_without_api_key(self) -> None: 26 | """Test initialization fails without API key.""" 27 | with ( 28 | patch.dict("os.environ", clear=True), 29 | pytest.raises(ValueError, match="No API key provided"), 30 | ): 31 | GroqExtractor(model="llama-3.3-70b-versatile") 32 | 33 | def test_query_llm(self) -> None: 34 | """Test LLM query functionality.""" 35 | with patch.dict("os.environ", {"GROQ_API_KEY": "fake-key"}): 36 | extractor = GroqExtractor(model="llama-3.3-70b-versatile") 37 | 38 | # Mock the Groq client 39 | mock_response = MagicMock() 40 | mock_response.choices = [MagicMock()] 41 | mock_response.choices[ 42 | 0 43 | ].message.content = '{"dependencies": {}, "outputs": {}}' 44 | 45 | extractor.client = MagicMock() 46 | extractor.client.chat.completions.create.return_value = mock_response 47 | 48 | # Test the query 49 | result = extractor._query_llm("SELECT * FROM test") 50 | 51 | # Verify the response and method calls 52 | assert result == '{"dependencies": {}, "outputs": {}}' 53 | extractor.client.chat.completions.create.assert_called_once() 54 | 55 | # Verify correct parameters were passed 56 | call_args = extractor.client.chat.completions.create.call_args[1] 57 | assert call_args["model"] == "llama-3.3-70b-versatile" 58 | assert call_args["response_format"] == {"type": "json_object"} 59 | assert len(call_args["messages"]) == 2 60 | assert call_args["messages"][0]["role"] == "system" 61 | assert call_args["messages"][1]["role"] == "user" 62 | -------------------------------------------------------------------------------- /tests/unit/llm_parsers/test_init.py: -------------------------------------------------------------------------------- 1 | """Unit tests for sqldeps.llm_parsers.__init__. 2 | 3 | This module tests the factory function and other initialization 4 | logic of the llm_parsers package. 
5 | """ 6 | 7 | import pytest 8 | 9 | from sqldeps.llm_parsers import create_extractor 10 | 11 | 12 | class TestLLMParsersInit: 13 | """Test suite for sqldeps.llm_parsers.__init__.""" 14 | 15 | def test_create_extractor_invalid_framework(self) -> None: 16 | """Test creating an extractor with invalid framework.""" 17 | # No need to patch anything for this test 18 | with pytest.raises(ValueError, match="Unsupported framework"): 19 | create_extractor(framework="invalid_framework") 20 | -------------------------------------------------------------------------------- /tests/unit/llm_parsers/test_openai.py: -------------------------------------------------------------------------------- 1 | """Unit tests for OpenaiExtractor. 2 | 3 | This module tests the OpenAI-specific LLM implementation. 4 | """ 5 | 6 | from unittest.mock import MagicMock, patch 7 | 8 | import pytest 9 | 10 | from sqldeps.llm_parsers.openai import OpenaiExtractor 11 | 12 | 13 | class TestOpenaiExtractor: 14 | """Test suite for OpenaiExtractor.""" 15 | 16 | def test_initialization(self) -> None: 17 | """Test proper initialization with API key.""" 18 | with patch.dict("os.environ", {"OPENAI_API_KEY": "fake-key"}): 19 | extractor = OpenaiExtractor(model="gpt-4o") 20 | 21 | assert extractor.model == "gpt-4o" 22 | assert extractor.framework == "openai" 23 | assert hasattr(extractor, "client") 24 | 25 | def test_initialization_without_api_key(self) -> None: 26 | """Test initialization fails without API key.""" 27 | with ( 28 | patch.dict("os.environ", clear=True), 29 | pytest.raises(ValueError, match="No API key provided"), 30 | ): 31 | OpenaiExtractor(model="gpt-4o") 32 | 33 | def test_query_llm(self) -> None: 34 | """Test LLM query functionality.""" 35 | with patch.dict("os.environ", {"OPENAI_API_KEY": "fake-key"}): 36 | extractor = OpenaiExtractor(model="gpt-4o") 37 | 38 | # Mock the OpenAI client 39 | mock_response = MagicMock() 40 | mock_response.choices = [MagicMock()] 41 | mock_response.choices[ 42 | 0 43 | ].message.content = '{"dependencies": {}, "outputs": {}}' 44 | 45 | extractor.client = MagicMock() 46 | extractor.client.chat.completions.create.return_value = mock_response 47 | 48 | # Test the query 49 | result = extractor._query_llm("SELECT * FROM test") 50 | 51 | # Verify the response and method calls 52 | assert result == '{"dependencies": {}, "outputs": {}}' 53 | extractor.client.chat.completions.create.assert_called_once() 54 | 55 | # Verify correct parameters were passed 56 | call_args = extractor.client.chat.completions.create.call_args[1] 57 | assert call_args["model"] == "gpt-4o" 58 | assert call_args["response_format"] == {"type": "json_object"} 59 | assert len(call_args["messages"]) == 2 60 | assert call_args["messages"][0]["role"] == "system" 61 | assert call_args["messages"][1]["role"] == "user" 62 | -------------------------------------------------------------------------------- /tests/unit/test_cache.py: -------------------------------------------------------------------------------- 1 | """Unit tests for cache.py. 2 | 3 | This module tests the caching functionality for storing and retrieving 4 | SQL dependency extraction results. 
5 | """ 6 | 7 | import json 8 | from pathlib import Path 9 | from unittest.mock import MagicMock, mock_open, patch 10 | 11 | from sqldeps.cache import cleanup_cache, get_cache_path, load_from_cache, save_to_cache 12 | from sqldeps.models import SQLProfile 13 | 14 | 15 | def test_get_cache_path() -> None: 16 | """Test generation of cache file paths based on file content.""" 17 | # Set up mock file content 18 | mock_content = b"SELECT * FROM table" 19 | mock_content_hash = "0123456789abcdef" # Simplified hash output 20 | 21 | with ( 22 | patch("pathlib.Path.resolve") as mock_resolve, 23 | patch("builtins.open", mock_open(read_data=mock_content)), 24 | patch("hashlib.md5") as mock_md5, 25 | ): 26 | # Setup mocks 27 | mock_resolve.return_value = Path("/absolute/path/to/file.sql") 28 | mock_hash_instance = MagicMock() 29 | mock_hash_instance.hexdigest.return_value = mock_content_hash 30 | mock_md5.return_value = mock_hash_instance 31 | 32 | # Test content-based hashing 33 | cache_path = get_cache_path("file.sql") 34 | 35 | # Verify results 36 | expected_path = Path(".sqldeps_cache") / f"file_{mock_content_hash[:16]}.json" 37 | assert cache_path == expected_path 38 | 39 | # Verify file was opened for reading 40 | open.assert_called_once_with(Path("/absolute/path/to/file.sql"), "rb") 41 | 42 | # Verify hash was computed with file content 43 | mock_md5.assert_called_once() 44 | mock_hash_instance.hexdigest.assert_called_once() 45 | 46 | 47 | def test_save_load_cache() -> None: 48 | """Test saving and loading from cache.""" 49 | # Create a test SQLProfile 50 | profile = SQLProfile( 51 | dependencies={"table1": ["col1"]}, outputs={"table2": ["col2"]} 52 | ) 53 | 54 | # Mock file operations 55 | mock_file_content = json.dumps(profile.to_dict()) 56 | 57 | with ( 58 | patch("sqldeps.cache.get_cache_path") as mock_get_path, 59 | patch("builtins.open", mock_open(read_data=mock_file_content)), 60 | ): 61 | # Setup mock path 62 | mock_cache_path = Path(".sqldeps_cache/test.json") 63 | mock_get_path.return_value = mock_cache_path 64 | 65 | # Test saving to cache 66 | result = save_to_cache(profile, "test.sql") 67 | assert result is True 68 | 69 | # Test loading from cache 70 | with patch("pathlib.Path.exists") as mock_exists: 71 | mock_exists.return_value = True 72 | loaded = load_from_cache("test.sql") 73 | assert loaded.dependencies == profile.dependencies 74 | assert loaded.outputs == profile.outputs 75 | 76 | 77 | def test_cleanup_cache_success() -> None: 78 | """Test successful cleanup of cache directory.""" 79 | # Setup - create a mock cache directory with some files 80 | mock_cache_dir = Path("mock_cache_dir") 81 | mock_json_files = [ 82 | Path("mock_cache_dir/file1.json"), 83 | Path("mock_cache_dir/file2.json"), 84 | ] 85 | 86 | with ( 87 | patch("pathlib.Path.exists", return_value=True), 88 | patch("pathlib.Path.glob") as mock_glob, 89 | patch("pathlib.Path.unlink") as mock_unlink, 90 | patch("pathlib.Path.iterdir", return_value=[]), 91 | patch("pathlib.Path.rmdir") as mock_rmdir, 92 | patch("sqldeps.cache.logger") as mock_logger, 93 | ): 94 | # Setup mock glob to return our json files 95 | mock_glob.return_value = mock_json_files 96 | 97 | # Call the function 98 | result = cleanup_cache(mock_cache_dir) 99 | 100 | # Verify the result 101 | assert result is True 102 | assert mock_unlink.call_count == 2 # Should unlink both JSON files 103 | mock_rmdir.assert_called_once() # Should remove the directory 104 | mock_logger.info.assert_called() # Should log success 105 | 106 | 107 | def 
test_cleanup_cache_non_empty_dir() -> None: 108 | """Test cleanup when directory still has other files.""" 109 | mock_cache_dir = Path("mock_cache_dir") 110 | 111 | with ( 112 | patch("pathlib.Path.exists", return_value=True), 113 | patch("pathlib.Path.glob") as mock_glob, 114 | patch("pathlib.Path.unlink") as mock_unlink, 115 | patch("pathlib.Path.iterdir", return_value=["other_file"]), 116 | patch("pathlib.Path.rmdir") as mock_rmdir, 117 | patch("sqldeps.cache.logger") as mock_logger, 118 | ): 119 | # Setup mock glob to return JSON files 120 | mock_glob.return_value = [Path("mock_cache_dir/file1.json")] 121 | 122 | # Call the function 123 | result = cleanup_cache(mock_cache_dir) 124 | 125 | # Verify the result 126 | assert result is True 127 | assert mock_unlink.call_count == 1 # Should unlink the JSON file 128 | mock_rmdir.assert_not_called() # Should not remove non-empty directory 129 | mock_logger.info.assert_called_with( 130 | "Cache directory cleaned but not removed (contains other files)" 131 | ) 132 | 133 | 134 | def test_cleanup_cache_error() -> None: 135 | """Test cleanup when an error occurs.""" 136 | mock_cache_dir = Path("mock_cache_dir") 137 | 138 | with ( 139 | patch("pathlib.Path.exists", return_value=True), 140 | patch("pathlib.Path.glob") as mock_glob, 141 | patch("sqldeps.cache.logger") as mock_logger, 142 | ): 143 | # Make glob raise an exception 144 | mock_glob.side_effect = Exception("Test error") 145 | 146 | # Call the function 147 | result = cleanup_cache(mock_cache_dir) 148 | 149 | # Verify the result 150 | assert result is False 151 | mock_logger.warning.assert_called_once() # Should log warning 152 | 153 | 154 | def test_cleanup_cache_nonexistent() -> None: 155 | """Test cleanup when cache directory doesn't exist.""" 156 | mock_cache_dir = Path("nonexistent_dir") 157 | 158 | with patch("pathlib.Path.exists", return_value=False): 159 | # Call the function 160 | result = cleanup_cache(mock_cache_dir) 161 | 162 | # Verify the result 163 | assert result is True # Should return True when directory doesn't exist 164 | -------------------------------------------------------------------------------- /tests/unit/test_cli.py: -------------------------------------------------------------------------------- 1 | """Unit tests for command-line interface. 2 | 3 | This module tests the functionality of the CLI commands and related functions. 4 | """ 5 | 6 | from pathlib import Path 7 | from unittest.mock import MagicMock, patch 8 | 9 | import pytest 10 | from typer.testing import CliRunner 11 | 12 | from sqldeps.cli import app, extract, extract_dependencies, save_output 13 | from sqldeps.models import SQLProfile 14 | 15 | 16 | @pytest.fixture 17 | def runner() -> CliRunner: 18 | """Create a CLI runner for testing. 19 | 20 | Returns: 21 | CliRunner: A Typer CLI test runner 22 | """ 23 | return CliRunner() 24 | 25 | 26 | @pytest.fixture 27 | def mock_sql_profile() -> SQLProfile: 28 | """Create a mock SQLProfile for testing. 
29 | 30 | Returns: 31 | SQLProfile: A sample SQLProfile for testing 32 | """ 33 | return SQLProfile( 34 | dependencies={"users": ["id", "name"]}, outputs={"reports": ["date", "total"]} 35 | ) 36 | 37 | 38 | class TestCLI: 39 | """Test suite for command-line interface.""" 40 | 41 | def test_extract_dependencies(self, mock_sql_profile: SQLProfile) -> None: 42 | """Test extraction of dependencies.""" 43 | # Mock the extractor 44 | mock_extractor = MagicMock() 45 | mock_extractor.extract_from_file.return_value = mock_sql_profile 46 | mock_extractor.extract_from_folder.return_value = {"file.sql": mock_sql_profile} 47 | 48 | # Test file extraction 49 | with patch("pathlib.Path.is_file", return_value=True): 50 | result = extract_dependencies(mock_extractor, Path("file.sql"), False) 51 | assert result == mock_sql_profile 52 | mock_extractor.extract_from_file.assert_called_once() 53 | 54 | # Test folder extraction 55 | with patch("pathlib.Path.is_file", return_value=False): 56 | result = extract_dependencies(mock_extractor, Path("folder"), True) 57 | assert result == {"file.sql": mock_sql_profile} 58 | mock_extractor.extract_from_folder.assert_called_once() 59 | 60 | def test_save_output(self, mock_sql_profile: SQLProfile, tmp_path: Path) -> None: 61 | """Test saving output to different formats.""" 62 | # Test JSON output 63 | json_path = tmp_path / "output.json" 64 | save_output(mock_sql_profile, json_path) 65 | assert json_path.exists() 66 | 67 | # Test CSV output 68 | csv_path = tmp_path / "output.csv" 69 | save_output(mock_sql_profile, csv_path) 70 | assert csv_path.exists() 71 | 72 | # Test CSV output with schema match 73 | df_mock = MagicMock() 74 | df_mock.to_csv = MagicMock() 75 | save_output(df_mock, csv_path, is_schema_match=True) 76 | df_mock.to_csv.assert_called_once() 77 | 78 | def test_cli_command(self, runner: CliRunner, mock_sql_profile: SQLProfile) -> None: 79 | """Test the CLI command execution using isolated components.""" 80 | with ( 81 | patch("sqldeps.cli.create_extractor") as mock_create_extractor, 82 | patch("sqldeps.cli.extract_dependencies") as mock_extract, 83 | patch("sqldeps.cli.save_output") as mock_save, 84 | ): 85 | # Setup mocks 86 | mock_extractor = MagicMock() 87 | mock_create_extractor.return_value = mock_extractor 88 | mock_extract.return_value = mock_sql_profile 89 | 90 | # Use the extract function directly instead of 'main' 91 | extract( 92 | fpath=Path("file.sql"), 93 | framework="groq", 94 | model=None, 95 | prompt=None, 96 | recursive=False, 97 | db_match_schema=False, 98 | db_target_schemas="public", 99 | db_credentials=None, 100 | output=Path("dependencies.json"), 101 | ) 102 | 103 | # Verify function calls 104 | mock_create_extractor.assert_called_once() 105 | mock_extract.assert_called_once() 106 | mock_save.assert_called_once() 107 | 108 | def test_cli_error_handling(self) -> None: 109 | """Test error handling in CLI using mock directly.""" 110 | with patch("sqldeps.cli.create_extractor") as mock_create_extractor: 111 | # Make the extractor creation raise an exception 112 | mock_create_extractor.side_effect = ValueError("Test error") 113 | 114 | # Call the extract function directly and catch the exception 115 | from typer import Exit 116 | 117 | with pytest.raises(Exit) as excinfo: 118 | extract( 119 | fpath=Path("file.sql"), 120 | framework="groq", 121 | model=None, 122 | prompt=None, 123 | recursive=False, 124 | db_match_schema=False, 125 | db_target_schemas="public", 126 | db_credentials=None, 127 | output=Path("dependencies.json"), 128 | ) 129 | 
130 | # Verify the exit code is 1 131 | assert excinfo.value.exit_code == 1 132 | 133 | def test_cli_database_validation(self, mock_sql_profile: SQLProfile) -> None: 134 | """Test database validation logic directly.""" 135 | with ( 136 | patch("sqldeps.cli.create_extractor") as mock_create_extractor, 137 | patch("sqldeps.cli.extract_dependencies") as mock_extract, 138 | patch("sqldeps.cli.match_dependencies_against_schema") as mock_match, 139 | patch("sqldeps.cli.save_output"), 140 | patch("builtins.open", MagicMock()), 141 | patch("yaml.safe_load", return_value={"database": {}}), 142 | ): 143 | # Setup mocks 144 | mock_extractor = MagicMock() 145 | mock_create_extractor.return_value = mock_extractor 146 | mock_extract.return_value = mock_sql_profile 147 | mock_match.return_value = MagicMock() # Mock DataFrame result 148 | 149 | # Call the extract function directly 150 | extract( 151 | fpath=Path("file.sql"), 152 | framework="groq", 153 | model=None, 154 | prompt=None, 155 | recursive=False, 156 | db_match_schema=True, 157 | db_target_schemas="public", 158 | db_credentials=Path("config.yml"), 159 | output=Path("dependencies.json"), 160 | ) 161 | 162 | # Verify function calls 163 | mock_match.assert_called_once() 164 | 165 | def test_app_version(self, runner: CliRunner) -> None: 166 | """Test CLI app version command.""" 167 | # The version command is a safer option to test CLI integration 168 | result = runner.invoke(app, ["--version"]) 169 | 170 | # Version command should not produce an error 171 | assert result.exit_code == 0 172 | assert "SQLDeps version:" in result.output 173 | 174 | def test_app_command(self) -> None: 175 | """Test the app command functionality.""" 176 | with ( 177 | patch("sqldeps.cli.subprocess.run") as mock_run, 178 | patch("sqldeps.cli.Path.exists", return_value=True), 179 | ): 180 | from sqldeps.cli import app_main 181 | 182 | app_main() 183 | mock_run.assert_called_once() 184 | 185 | def test_cache_clear_command(self) -> None: 186 | """Test the cache clear command.""" 187 | with patch("sqldeps.cli.cleanup_cache", return_value=True) as mock_cleanup: 188 | from sqldeps.cli import cache_clear 189 | 190 | cache_clear() 191 | mock_cleanup.assert_called_once() 192 | -------------------------------------------------------------------------------- /tests/unit/test_config.py: -------------------------------------------------------------------------------- 1 | """Unit tests for config.py. 2 | 3 | This module tests configuration loading functionality. 4 | """ 5 | 6 | from unittest.mock import mock_open, patch 7 | 8 | from sqldeps.config import load_config 9 | 10 | 11 | def test_load_config() -> None: 12 | """Test loading configuration from a YAML file.""" 13 | # Simple test YAML with nested keys 14 | config_yaml = """ 15 | database: 16 | host: localhost 17 | port: 5432 18 | """ 19 | 20 | # Mock file open 21 | with patch("builtins.open", mock_open(read_data=config_yaml)): 22 | config = load_config("fake_config.yml") 23 | 24 | # Verify basic parsing including nested values 25 | assert config["database"]["host"] == "localhost" 26 | assert config["database"]["port"] == 5432 27 | -------------------------------------------------------------------------------- /tests/unit/test_models.py: -------------------------------------------------------------------------------- 1 | """Unit tests for data models. 2 | 3 | This module tests the SQLProfile class and its methods. 
4 | """ 5 | 6 | import pandas as pd 7 | 8 | from sqldeps.models import SQLProfile 9 | 10 | 11 | def test_sql_profile_initialization() -> None: 12 | """Test SQLProfile initialization and sorting.""" 13 | # Create a profile with unsorted data 14 | profile = SQLProfile( 15 | dependencies={ 16 | "table_b": ["col_c", "col_a", "col_b"], 17 | "table_a": ["col_z", "col_y"], 18 | }, 19 | outputs={ 20 | "schema.out_table_b": ["out_col_b", "out_col_a"], 21 | "schema.out_table_a": ["out_col_x"], 22 | }, 23 | ) 24 | 25 | # Check that tables and columns are sorted 26 | assert list(profile.dependencies.keys()) == ["table_a", "table_b"] 27 | assert profile.dependencies["table_a"] == ["col_y", "col_z"] 28 | assert profile.dependencies["table_b"] == ["col_a", "col_b", "col_c"] 29 | 30 | assert list(profile.outputs.keys()) == ["schema.out_table_a", "schema.out_table_b"] 31 | assert profile.outputs["schema.out_table_a"] == ["out_col_x"] 32 | assert profile.outputs["schema.out_table_b"] == ["out_col_a", "out_col_b"] 33 | 34 | 35 | def test_to_dataframe_conversion() -> None: 36 | """Test conversion to DataFrame with proper structure.""" 37 | profile = SQLProfile( 38 | dependencies={"schema.users": ["id", "name"]}, 39 | outputs={"public.user_report": ["user_id", "report_date"]}, 40 | ) 41 | 42 | df = profile.to_dataframe() 43 | 44 | # Check DataFrame structure 45 | assert isinstance(df, pd.DataFrame) 46 | assert set(df.columns) == {"type", "schema", "table", "column"} 47 | 48 | # Check dependencies were properly converted 49 | deps = df[df["type"] == "dependency"] 50 | assert len(deps) == 2 # Two columns 51 | assert set(deps["schema"]) == {"schema"} 52 | assert set(deps["table"]) == {"users"} 53 | assert set(deps["column"]) == {"id", "name"} 54 | 55 | # Check outputs were properly converted 56 | outs = df[df["type"] == "outcome"] 57 | assert len(outs) == 2 # Two columns 58 | assert set(outs["schema"]) == {"public"} 59 | assert set(outs["table"]) == {"user_report"} 60 | assert set(outs["column"]) == {"user_id", "report_date"} 61 | 62 | 63 | def test_empty_columns_handling() -> None: 64 | """Test handling of tables with no specific columns.""" 65 | profile = SQLProfile( 66 | dependencies={"table_with_no_columns": []}, 67 | outputs={"output_table_no_columns": []}, 68 | ) 69 | 70 | df = profile.to_dataframe() 71 | 72 | # Check tables with no columns are properly represented 73 | deps = df[df["type"] == "dependency"] 74 | assert len(deps) == 1 75 | assert deps.iloc[0]["table"] == "table_with_no_columns" 76 | assert deps.iloc[0]["column"] is None 77 | 78 | outs = df[df["type"] == "outcome"] 79 | assert len(outs) == 1 80 | assert outs.iloc[0]["table"] == "output_table_no_columns" 81 | assert outs.iloc[0]["column"] is None 82 | 83 | 84 | def test_to_dict() -> None: 85 | """Test conversion to dictionary format.""" 86 | profile = SQLProfile( 87 | dependencies={"users": ["id", "name"]}, outputs={"reports": ["user_id"]} 88 | ) 89 | 90 | result = profile.to_dict() 91 | 92 | assert isinstance(result, dict) 93 | assert "dependencies" in result 94 | assert "outputs" in result 95 | assert result["dependencies"] == {"users": ["id", "name"]} 96 | assert result["outputs"] == {"reports": ["user_id"]} 97 | 98 | 99 | def test_property_accessors() -> None: 100 | """Test property accessor methods.""" 101 | profile = SQLProfile( 102 | dependencies={"schema1.table1": ["col1"], "schema2.table2": ["col2"]}, 103 | outputs={"schema3.table3": ["col3"], "schema4.table4": ["col4"]}, 104 | ) 105 | 106 | # Test dependency_tables property 107 | 
assert profile.dependency_tables == ["schema1.table1", "schema2.table2"] 108 | 109 | # Test outcome_tables property 110 | assert profile.outcome_tables == ["schema3.table3", "schema4.table4"] 111 | -------------------------------------------------------------------------------- /tests/unit/test_parallel.py: -------------------------------------------------------------------------------- 1 | """Unit tests for parallel processing functionality. 2 | 3 | This module tests the parallel execution of SQL dependency extraction 4 | across multiple processes. 5 | """ 6 | 7 | from concurrent.futures import Future 8 | from pathlib import Path 9 | from unittest.mock import MagicMock, patch 10 | 11 | import pytest 12 | 13 | from sqldeps.parallel import ( 14 | _extract_from_file, 15 | _process_batch_files, 16 | process_files_in_parallel, 17 | resolve_workers, 18 | ) 19 | 20 | 21 | class TestParallelProcessing: 22 | """Test suite for parallel processing functionality.""" 23 | 24 | def test_resolve_workers(self) -> None: 25 | """Test resolution of worker count.""" 26 | with patch("sqldeps.parallel.cpu_count", return_value=8): 27 | # Default (-1) should use all CPUs 28 | assert resolve_workers(-1) == 8 29 | 30 | # Specific number within range 31 | assert resolve_workers(4) == 4 32 | 33 | # Minimum 1 34 | assert resolve_workers(1) == 1 35 | 36 | # Too large should raise ValueError 37 | with pytest.raises(ValueError): 38 | resolve_workers(9) 39 | 40 | # Too small should raise ValueError 41 | with pytest.raises(ValueError): 42 | resolve_workers(0) 43 | 44 | def test_extract_from_file_with_cache(self) -> None: 45 | """Test single file extraction with caching.""" 46 | # Mock dependencies 47 | mock_limiter = MagicMock() 48 | mock_result = MagicMock() 49 | mock_path = Path("test.sql") 50 | 51 | # Mock cache hit 52 | with patch("sqldeps.parallel.load_from_cache", return_value=mock_result): 53 | # Should return cached result without extracting 54 | path, result = _extract_from_file( 55 | mock_path, mock_limiter, "groq", "model", None, True 56 | ) 57 | 58 | assert path == mock_path 59 | assert result == mock_result 60 | mock_limiter.wait_if_needed.assert_not_called() 61 | 62 | def test_extract_from_file_without_cache(self) -> None: 63 | """Test single file extraction without cache hit.""" 64 | # Mock dependencies 65 | mock_limiter = MagicMock() 66 | mock_extractor = MagicMock() 67 | mock_extractor.extract_from_file.return_value = "result" 68 | mock_path = Path("test.sql") 69 | 70 | # Setup no cache hit, extract successful 71 | with ( 72 | patch("sqldeps.parallel.load_from_cache", return_value=None), 73 | patch("sqldeps.llm_parsers.create_extractor", return_value=mock_extractor), 74 | patch("sqldeps.parallel.save_to_cache") as mock_save, 75 | ): 76 | # Should perform extraction 77 | path, result = _extract_from_file( 78 | mock_path, mock_limiter, "groq", "model", None, True 79 | ) 80 | 81 | assert path == mock_path 82 | assert result == "result" 83 | mock_limiter.wait_if_needed.assert_called_once() 84 | mock_extractor.extract_from_file.assert_called_once_with(mock_path) 85 | mock_save.assert_called_once() 86 | 87 | def test_process_batch_files(self) -> None: 88 | """Test batch processing of files.""" 89 | # Mock dependencies 90 | mock_limiter = MagicMock() 91 | mock_files = [Path("test1.sql"), Path("test2.sql")] 92 | 93 | # Setup extraction results 94 | path1_result = MagicMock() 95 | path2_result = MagicMock() 96 | 97 | # Mock the extract_from_file function to return predetermined results 98 | with patch( 99 | 
"sqldeps.parallel._extract_from_file", 100 | side_effect=[(mock_files[0], path1_result), (mock_files[1], path2_result)], 101 | ): 102 | # Process batch 103 | results = _process_batch_files( 104 | mock_files, mock_limiter, "groq", "model", None, True 105 | ) 106 | 107 | # Verify results 108 | assert len(results) == 2 109 | assert results[str(mock_files[0])] == path1_result 110 | assert results[str(mock_files[1])] == path2_result 111 | 112 | def test_process_files_in_parallel(self) -> None: 113 | """Test parallel file processing.""" 114 | with ( 115 | patch("sqldeps.parallel.ProcessPoolExecutor") as mock_executor_class, 116 | patch("sqldeps.parallel.Manager") as mock_manager, 117 | patch("sqldeps.parallel.MultiprocessingRateLimiter") as mock_limiter_class, 118 | patch("sqldeps.parallel.resolve_workers") as mock_resolve, 119 | patch("sqldeps.parallel.np.array_split") as mock_array_split, 120 | ): 121 | # Setup mocks 122 | mock_resolve.return_value = 2 123 | mock_sql_files = [ 124 | Path("test1.sql"), 125 | Path("test2.sql"), 126 | Path("test3.sql"), 127 | Path("test4.sql"), 128 | ] 129 | mock_array_split.return_value = [ 130 | [mock_sql_files[0], mock_sql_files[1]], 131 | [mock_sql_files[2], mock_sql_files[3]], 132 | ] 133 | 134 | # Mock the manager 135 | manager_instance = MagicMock() 136 | mock_manager.return_value.__enter__.return_value = manager_instance 137 | 138 | # Mock the limiter 139 | mock_limiter = MagicMock() 140 | mock_limiter_class.return_value = mock_limiter 141 | 142 | # Mock the ProcessPoolExecutor 143 | executor_instance = MagicMock() 144 | mock_executor_class.return_value.__enter__.return_value = executor_instance 145 | 146 | # Setup futures and their results 147 | future1 = MagicMock(spec=Future) 148 | future2 = MagicMock(spec=Future) 149 | future1.result.return_value = { 150 | str(mock_sql_files[0]): "result1", 151 | str(mock_sql_files[1]): "result2", 152 | } 153 | future2.result.return_value = { 154 | str(mock_sql_files[2]): "result3", 155 | str(mock_sql_files[3]): "result4", 156 | } 157 | 158 | # Mock the futures dictionary 159 | executor_instance.submit.side_effect = [future1, future2] 160 | 161 | # Mock as_completed to return futures in order 162 | with patch( 163 | "sqldeps.parallel.as_completed", return_value=[future1, future2] 164 | ): 165 | # Call the function 166 | results = process_files_in_parallel( 167 | mock_sql_files, 168 | framework="groq", 169 | model="test-model", 170 | n_workers=2, 171 | rpm=60, 172 | use_cache=True, 173 | ) 174 | 175 | # Verify results 176 | assert len(results) == 4 177 | assert results[str(mock_sql_files[0])] == "result1" 178 | assert results[str(mock_sql_files[1])] == "result2" 179 | assert results[str(mock_sql_files[2])] == "result3" 180 | assert results[str(mock_sql_files[3])] == "result4" 181 | 182 | # Verify worker resolution 183 | mock_resolve.assert_called_once_with(2) 184 | 185 | # Verify batch splitting 186 | mock_array_split.assert_called_once() 187 | 188 | # Verify executor was created with correct workers 189 | mock_executor_class.assert_called_once_with(max_workers=2) 190 | 191 | # Verify submit was called for each batch 192 | assert executor_instance.submit.call_count == 2 193 | -------------------------------------------------------------------------------- /tests/unit/test_rate_limiter.py: -------------------------------------------------------------------------------- 1 | """Unit tests for rate limiter. 
2 | 3 | This module tests the rate limiting functionality which is used to control 4 | the frequency of API calls to LLM providers. 5 | """ 6 | 7 | from unittest.mock import MagicMock, patch 8 | 9 | from sqldeps.rate_limiter import MultiprocessingRateLimiter, RateLimiter 10 | 11 | 12 | def test_rate_limiter_no_wait_under_limit() -> None: 13 | """Test rate limiter when under the RPM limit (no waiting needed).""" 14 | # Create a rate limiter with 60 RPM (1 request per second) 15 | limiter = RateLimiter(rpm=60) 16 | 17 | # Mock time.time to return controlled values 18 | with patch("time.time", return_value=100), patch("time.sleep") as mock_sleep: 19 | # Call wait_if_needed multiple times (less than rpm) 20 | for _ in range(30): 21 | limiter.wait_if_needed() 22 | 23 | # Verify sleep was not called since we're under the rate limit 24 | mock_sleep.assert_not_called() 25 | 26 | 27 | def test_rate_limiter_wait_when_limit_reached() -> None: 28 | """Test rate limiter when RPM limit is reached (should wait).""" 29 | # Create a rate limiter with 10 RPM 30 | limiter = RateLimiter(rpm=10) 31 | 32 | # Set up the call_times list with timestamps that would trigger rate limiting 33 | current_time = 100 34 | limiter.call_times = [ 35 | current_time - 50 + i for i in range(10) 36 | ] # 10 calls in last 50 seconds 37 | 38 | # Mock time functions 39 | with ( 40 | patch("time.time", return_value=current_time), 41 | patch("time.sleep") as mock_sleep, 42 | ): 43 | # This call should trigger waiting since we've reached 10 calls in the window 44 | limiter.wait_if_needed() 45 | 46 | # Verify sleep was called with the correct wait time (should wait ~10 seconds) 47 | # First timestamp is (current_time - 50), so it expires at (current_time + 10) 48 | expected_wait_time = 10 # (current_time - 50) + 60 - current_time 49 | mock_sleep.assert_called_once() 50 | actual_wait_time = mock_sleep.call_args[0][0] 51 | assert ( 52 | abs(actual_wait_time - expected_wait_time) < 0.01 53 | ) # Allow small float differences 54 | 55 | 56 | def test_rate_limiter_zero_rpm() -> None: 57 | """Test rate limiter when RPM is set to zero (disabled).""" 58 | limiter = RateLimiter(rpm=0) 59 | 60 | with patch("time.time") as mock_time, patch("time.sleep") as mock_sleep: 61 | # Call wait_if_needed multiple times 62 | for _ in range(100): 63 | limiter.wait_if_needed() 64 | 65 | # Verify that time and sleep were not called 66 | mock_time.assert_not_called() 67 | mock_sleep.assert_not_called() 68 | 69 | 70 | def test_multiprocessing_rate_limiter() -> None: 71 | """Test multiprocessing rate limiter.""" 72 | # Create a mock manager 73 | mock_manager = MagicMock() 74 | mock_manager.list.return_value = [] 75 | mock_manager.RLock.return_value = MagicMock() 76 | 77 | # Create limiter with mock manager 78 | limiter = MultiprocessingRateLimiter(mock_manager, rpm=10) 79 | 80 | # Test wait_if_needed when under the limit 81 | with patch("time.time", return_value=100), patch("time.sleep") as mock_sleep: 82 | limiter.wait_if_needed() 83 | 84 | # Since there are no previous calls, sleep should not be called 85 | mock_sleep.assert_not_called() 86 | 87 | # call_times should have been updated 88 | assert len(limiter.call_times) == 1 89 | assert limiter.call_times[0] == 100 90 | -------------------------------------------------------------------------------- /tests/unit/test_visualization.py: -------------------------------------------------------------------------------- 1 | """Unit tests for visualization.py. 
2 | 3 | This module tests the visualization functions for creating interactive 4 | network graphs of SQL dependencies. 5 | """ 6 | 7 | from sqldeps.models import SQLProfile 8 | from sqldeps.visualization import visualize_sql_dependencies 9 | 10 | 11 | def test_visualize_sql_dependencies_basic() -> None: 12 | """Test basic visualization of SQL dependencies. 13 | 14 | Verifies that the visualization function creates a valid Plotly figure 15 | with the correct title and traces. 16 | """ 17 | # Simple mock dependencies data 18 | sql_profiles = { 19 | "file1.sql": SQLProfile( 20 | dependencies={"table1": ["col1", "col2"]}, outputs={"table2": ["col3"]} 21 | ), 22 | } 23 | 24 | # Call the visualization function 25 | figure = visualize_sql_dependencies(sql_profiles) 26 | 27 | # Basic assertions to verify the figure was created properly 28 | assert figure is not None 29 | assert len(figure.data) > 0 # Should have at least some traces 30 | 31 | # Verify title contains expected information 32 | assert "SQL Dependency Graph" in figure.layout.title.text 33 | assert "1 files" in figure.layout.title.text 34 | 35 | 36 | def test_visualize_sql_dependencies_empty() -> None: 37 | """Test visualization with empty dependencies. 38 | 39 | Verifies that the function handles empty input gracefully. 40 | """ 41 | # Call with empty dependencies 42 | figure = visualize_sql_dependencies({}) 43 | 44 | # Should still create a figure 45 | assert figure is not None 46 | assert "0 files" in figure.layout.title.text 47 | --------------------------------------------------------------------------------
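
Taken together, the suite above documents the package's public surface: extractors return SQLProfile objects, whole folders can be processed (optionally in parallel and with caching), and the resulting profiles can be matched against a live schema or rendered as a dependency graph. A minimal end-to-end sketch built only from interfaces the tests exercise is shown below; the Groq model name mirrors the one used in test_groq.py, the folder path is a placeholder, and a GROQ_API_KEY environment variable is assumed to be set.

from sqldeps.llm_parsers.groq import GroqExtractor
from sqldeps.visualization import visualize_sql_dependencies

# Requires GROQ_API_KEY in the environment (initialization fails otherwise,
# as test_initialization_without_api_key verifies).
extractor = GroqExtractor(model="llama-3.3-70b-versatile")

# Analyze every .sql file under a folder (placeholder path), reusing cached
# results where available; returns a mapping of file path -> SQLProfile.
profiles = extractor.extract_from_folder(
    "data/examples", recursive=True, n_workers=1, use_cache=True
)

# Inspect each profile as a DataFrame of type / schema / table / column rows.
for path, profile in profiles.items():
    print(path)
    print(profile.to_dataframe())

# Render the dependency graph as an interactive Plotly figure.
figure = visualize_sql_dependencies(profiles)
figure.show()
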