├── .coveragerc ├── .env.example ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.yml │ ├── config.yml │ ├── documentation.yml │ └── enhancement.yml └── workflows │ └── ci.yml ├── .gitignore ├── .pre-commit-config.yaml ├── .readthedocs.yml ├── CHANGELOG.md ├── CONTRIBUTING.md ├── LICENSE ├── Makefile ├── README.md ├── configs ├── database.yml └── prompts │ └── default.yml ├── data └── examples │ ├── example.sql │ └── folders_with_sql_files │ ├── example1.sql │ ├── example2.sql │ ├── random_file.txt │ ├── subfolder1 │ ├── example3.sql │ └── example3_expected.json │ └── subfolder2 │ ├── random_file.txt │ └── subfolder2_1 │ ├── example4.sql │ ├── example5.sql │ └── random_file.yml ├── docs ├── .gitkeep ├── api-reference │ ├── app.md │ ├── cache.md │ ├── cli.md │ ├── config.md │ ├── database.md │ ├── llm-parsers.md │ ├── models.md │ ├── parallel.md │ ├── rate-limiter.md │ ├── utils.md │ └── visualization.md ├── assets │ └── images │ │ └── sqldeps_logo.png ├── authors.md ├── changelog.md ├── contributing.md ├── docs-requirements.txt ├── examples.md ├── getting-started │ ├── installation.md │ └── quick-start.md ├── index.md ├── stylesheets │ └── custom.css └── user-guide │ ├── api-usage.md │ ├── cli-usage.md │ ├── database-integration.md │ ├── visualization.md │ └── web-app.md ├── mkdocs.yml ├── notebooks ├── .gitkeep └── sqldeps_showcase.ipynb ├── pyproject.toml ├── scripts └── .gitkeep ├── sqldeps ├── __init__.py ├── app │ ├── __init__.py │ ├── assets │ │ └── images │ │ │ ├── sqldeps_gray.png │ │ │ └── sqldeps_white.png │ └── main.py ├── cache.py ├── cli.py ├── config.py ├── configs │ └── prompts │ │ ├── default.yml │ │ ├── default_v0.1.0.yml │ │ └── simplified.yml ├── database │ ├── __init__.py │ ├── base.py │ └── postgresql.py ├── llm_parsers │ ├── __init__.py │ ├── base.py │ ├── deepseek.py │ ├── groq.py │ ├── litellm.py │ └── openai.py ├── models.py ├── parallel.py ├── rate_limiter.py ├── utils.py └── visualization.py └── tests ├── conftest.py ├── data ├── expected_outputs │ ├── example10_expected.json │ ├── example1_expected.json │ ├── example2_expected.json │ ├── example3_expected.json │ ├── example4_expected.json │ ├── example5_expected.json │ ├── example6_expected.json │ ├── example7_expected.json │ ├── example8_expected.json │ └── example9_expected.json ├── oneshot.json ├── oneshot.sql └── sql │ ├── example1.sql │ ├── example10.sql │ ├── example2.sql │ ├── example3.sql │ ├── example4.sql │ ├── example5.sql │ ├── example6.sql │ ├── example7.sql │ ├── example8.sql │ └── example9.sql ├── functional └── test_sql.py ├── integration └── test_database.py └── unit ├── app ├── __init__.py └── test_main.py ├── database └── test_postgresql.py ├── llm_parsers ├── test_base.py ├── test_deepseek.py ├── test_groq.py ├── test_init.py └── test_openai.py ├── test_cache.py ├── test_cli.py ├── test_config.py ├── test_models.py ├── test_parallel.py ├── test_rate_limiter.py ├── test_utils.py └── test_visualization.py /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | omit = 3 | sqldeps/app/* 4 | -------------------------------------------------------------------------------- /.env.example: -------------------------------------------------------------------------------- 1 | # Database credentials 2 | DB_HOST = "host" 3 | DB_PORT = "5432" 4 | DB_NAME = "database" 5 | DB_USER = "username" 6 | DB_PASSWORD = "password" 7 | 8 | # Test database credentials 9 | TEST_DB_HOST = "host" 10 | TEST_DB_PORT = "5432" 11 | TEST_DB_NAME = "database" 12 | TEST_DB_USER 
= "username" 13 | TEST_DB_PASSWORD = "password" 14 | 15 | # API Keys 16 | GROQ_API_KEY = "groq_token" 17 | OPENAI_API_KEY = "openai_token" 18 | DEEPSEEK_API_KEY = "deepseek_token" 19 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.yml: -------------------------------------------------------------------------------- 1 | name: "🐛 Bug Report" 2 | description: Report a bug in SQLDeps. 3 | title: "[BUG]: " 4 | labels: ["bug"] 5 | 6 | body: 7 | - type: markdown 8 | attributes: 9 | value: | 10 | Thanks for taking the time to fill out this bug report! 11 | - type: textarea 12 | id: what-happened 13 | attributes: 14 | label: What happened? 15 | description: A clear and concise description of what the bug is. 16 | placeholder: Tell us what you see! 17 | validations: 18 | required: true 19 | - type: textarea 20 | id: reproduce 21 | attributes: 22 | label: Steps to reproduce 23 | description: How can we reproduce this issue? 24 | placeholder: | 25 | 1. Run `...` 26 | 2. See error 27 | validations: 28 | required: true 29 | - type: textarea 30 | id: expected 31 | attributes: 32 | label: Expected behavior 33 | description: What did you expect to happen? 34 | - type: textarea 35 | id: environment 36 | attributes: 37 | label: Environment 38 | description: Include relevant details about your environment 39 | placeholder: | 40 | - Python version: [e.g. 3.12] 41 | - SQLDeps version: [e.g. 0.0.10] 42 | - OS: [e.g. macOS, Windows, Linux] 43 | - type: textarea 44 | id: additional 45 | attributes: 46 | label: Additional context 47 | description: Add any other context or screenshots about the bug here. 48 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: true 2 | contact_links: 3 | - name: Questions 4 | about: Ask a question or ask about a problem in GitHub Discussions. 5 | url: https://github.com/glue-lab/sqldeps/discussions/categories/questions 6 | - name: Feature Request 7 | about: To suggest an idea or ask about a feature, please start with a question saying what you would like to achieve. 8 | url: https://github.com/glue-lab/sqldeps/discussions/categories/ideas 9 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/documentation.yml: -------------------------------------------------------------------------------- 1 | name: "🖹 Documentation" 2 | description: Report an issue (e.g., typo) related to the documentation. 3 | title: "[DOC]: " 4 | labels: [documentation] 5 | 6 | body: 7 | - type: markdown 8 | attributes: 9 | value: | 10 | Thanks for helping us improve the SQLDeps documentation! 11 | - type: dropdown 12 | id: type 13 | attributes: 14 | label: Type of documentation issue 15 | options: 16 | - Error/typo in existing documentation 17 | - Missing documentation 18 | - Confusing explanation 19 | - Other 20 | validations: 21 | required: true 22 | - type: textarea 23 | id: description 24 | attributes: 25 | label: Description 26 | description: What needs to be improved or fixed? 27 | placeholder: A clear description of what's wrong or missing in the documentation 28 | validations: 29 | required: true 30 | - type: textarea 31 | id: location 32 | attributes: 33 | label: Location 34 | description: Where can we find this documentation issue? 35 | placeholder: URLs, file paths, etc. 
36 | - type: textarea 37 | id: suggestion 38 | attributes: 39 | label: Suggested improvement 40 | description: Have suggestions for how to improve the documentation? 41 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/enhancement.yml: -------------------------------------------------------------------------------- 1 | name: "📈 Enhancement Request" 2 | description: Suggest an enhancement for SQLDeps 3 | title: "[Enhancement]: " 4 | labels: ["enhancement"] 5 | 6 | body: 7 | - type: markdown 8 | attributes: 9 | value: | 10 | Thanks for suggesting an enhancement to SQLDeps! 11 | - type: textarea 12 | id: problem 13 | attributes: 14 | label: Problem to solve 15 | description: What problem would this enhancement solve? 16 | placeholder: A clear description of what limitation or issue you're trying to address 17 | validations: 18 | required: true 19 | - type: textarea 20 | id: solution 21 | attributes: 22 | label: Proposed solution 23 | description: What solution would you like to see? 24 | placeholder: Describe how you'd like to see this implemented 25 | validations: 26 | required: true 27 | - type: textarea 28 | id: additional 29 | attributes: 30 | label: Additional details 31 | description: Add any other context, code examples, or references here 32 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: Test 2 | 3 | on: 4 | push: 5 | branches: [main] 6 | pull_request: 7 | 8 | jobs: 9 | lint: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v4 13 | 14 | - name: Install uv 15 | run: | 16 | curl -LsSf https://astral.sh/uv/install.sh | sh 17 | echo "$HOME/.local/bin" >> $GITHUB_PATH 18 | 19 | - name: Install core dependencies 20 | run: uv sync 21 | 22 | - name: Lint 23 | run: | 24 | uv run ruff check . 25 | uv run ruff format --check . 
26 | 27 | test: 28 | strategy: 29 | matrix: 30 | python-version: ["3.10", "3.11", "3.12", "3.13"] 31 | os: ["ubuntu-latest", "macos-latest", "windows-latest"] 32 | 33 | runs-on: ${{ matrix.os }} 34 | 35 | steps: 36 | - uses: actions/checkout@v4 37 | 38 | - name: Set up Python ${{ matrix.python-version }} 39 | uses: actions/setup-python@v5 40 | with: 41 | python-version: ${{ matrix.python-version }} 42 | 43 | - name: Install uv (Unix) 44 | if: runner.os != 'Windows' 45 | run: | 46 | curl -LsSf https://astral.sh/uv/install.sh | sh 47 | echo "$HOME/.local/bin" >> $GITHUB_PATH 48 | 49 | - name: Install uv (Windows) 50 | if: runner.os == 'Windows' 51 | run: | 52 | powershell -ExecutionPolicy ByPass -c "irm https://astral.sh/uv/install.ps1 | iex" 53 | 54 | - name: Install PostgreSQL (Ubuntu) 55 | if: matrix.os == 'ubuntu-latest' 56 | run: sudo apt-get update && sudo apt-get install -y libpq-dev 57 | 58 | - name: Install PostgreSQL (macOS) 59 | if: matrix.os == 'macos-latest' 60 | run: brew install postgresql 61 | 62 | - name: check python version 63 | run: uv run python --version 64 | 65 | - name: Install all dependencies (including optional) 66 | run: uv sync --all-extras 67 | 68 | - name: Run tests 69 | run: uv run pytest 70 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | # Pipfile.lock 96 | 97 | # UV 98 | # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. 
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | uv.lock 102 | 103 | # poetry 104 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 105 | # This is especially recommended for binary packages to ensure reproducibility, and is more 106 | # commonly ignored for libraries. 107 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 108 | # poetry.lock 109 | 110 | # pdm 111 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 112 | #pdm.lock 113 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 114 | # in version control. 115 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 116 | .pdm.toml 117 | .pdm-python 118 | .pdm-build/ 119 | 120 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 121 | __pypackages__/ 122 | 123 | # Celery stuff 124 | celerybeat-schedule 125 | celerybeat.pid 126 | 127 | # SageMath parsed files 128 | *.sage.py 129 | 130 | # Environments 131 | .env 132 | .venv 133 | env/ 134 | venv/ 135 | ENV/ 136 | env.bak/ 137 | venv.bak/ 138 | 139 | # Spyder project settings 140 | .spyderproject 141 | .spyproject 142 | 143 | # Rope project settings 144 | .ropeproject 145 | 146 | # mkdocs documentation 147 | /site 148 | 149 | # mypy 150 | .mypy_cache/ 151 | .dmypy.json 152 | dmypy.json 153 | 154 | # Pyre type checker 155 | .pyre/ 156 | 157 | # pytype static type analyzer 158 | .pytype/ 159 | 160 | # Cython debug symbols 161 | cython_debug/ 162 | 163 | # PyCharm 164 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 165 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 166 | # and can be added to the global gitignore or merged into this file. For a more nuclear 167 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 168 | .idea/ 169 | 170 | # Ruff stuff: 171 | .ruff_cache/ 172 | 173 | # SQLDeps-specific cache: 174 | .sqldeps_cache 175 | 176 | # PyPI configuration file 177 | .pypirc 178 | 179 | # Data - ignore all data execpt examples 180 | data/* 181 | !/data/examples 182 | 183 | # Ignore output artifacts 184 | outputs/ 185 | artifacts/ 186 | 187 | # Ignore notebooks except target ones 188 | notebooks/* 189 | !notebooks/.gitkeep 190 | !notebooks/sqldeps_showcase.ipynb 191 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/astral-sh/ruff-pre-commit 3 | # Ruff version. 4 | rev: v0.11.8 5 | hooks: 6 | # Run check for the linter. 7 | - id: ruff 8 | types_or: [ python, pyi ] 9 | # Run check for the formatter. 
10 | - id: ruff-format 11 | args: [ --check ] 12 | types_or: [ python, pyi ] 13 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | build: 4 | os: ubuntu-22.04 5 | tools: 6 | python: "3.12" 7 | 8 | mkdocs: 9 | configuration: mkdocs.yml 10 | fail_on_warning: false 11 | 12 | python: 13 | install: 14 | - requirements: docs/docs-requirements.txt 15 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | All notable changes to SQLDeps will be documented in this file. 4 | 5 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 6 | 7 | ## [0.1.1] - 2025-05-05 8 | 9 | ### Added 10 | - Added LiteLLM as the default proxy to LLM providers 11 | - Added pre-commit hook configuration 12 | - Added example for SQL database connection config file 13 | - Added optional PostgreSQL dependencies 14 | 15 | ### Changed 16 | - Updated and optimized prompts for better SQL dependency analysis 17 | - Improved documentation (README, installation, quick-start, user guides) 18 | - Updated and optimized tests with new test data 19 | - Updated SQLDeps version in package metadata 20 | - Updated web application 21 | 22 | ### Fixed 23 | - Fixed temperature parameter usage in OpenAI calls 24 | 25 | ## [0.1.0] - 2025-04-04 26 | 27 | ### Added 28 | 29 | 30 | - Added caching system (#3, #11) 31 | - Added rate limiter per minute (RPM) (#2, #11) 32 | - Added support to multiprocessing (#2, #11) 33 | - Added Streamlit-based app as part of optional package dependencies (#7) 34 | - Added GiHub Issue templates (#1). 35 | 36 | ### Changed 37 | 38 | - Improved CLI with new features and subcommands (#8, #11) 39 | - Improved unit test coverage and test structure (#5, #11) 40 | - Improved interactive visualization function input (#12, #14) 41 | - Added/improved comprehensive docstring for Python files (#9, #12) 42 | 43 | ### Fixed 44 | 45 | - Bug fixes (#6) 46 | 47 | ### Removed 48 | 49 | - Unused/outdated validation scripts 50 | 51 | ## [0.0.1] - 2025-03-20 52 | 53 | - Pre-release of SQLDeps! 54 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to SQLDeps 2 | 3 | Thank you for considering contributing to SQLDeps! This guide explains how to set up your development environment and contribute to the project. 4 | 5 | ## Questions and Discussion 6 | 7 | For questions or discussions, please check [SQLDeps Discussions](https://github.com/glue-lab/sqldeps/discussions) or reach out to the maintainers directly. 8 | 9 | ## Development Setup 10 | 11 | ### Prerequisites 12 | 13 | - [Git](https://git-scm.com/) 14 | - [UV](https://docs.astral.sh/uv/) 15 | 16 | ### Clone the Repository 17 | 18 | ```bash 19 | git clone https://github.com/glue-lab/sqldeps.git 20 | cd sqldeps 21 | ``` 22 | 23 | ### Install Development Dependencies 24 | 25 | SQLDeps uses [`uv`](https://github.com/astral-sh/uv) as the package manager for development. 
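If `uv` is not installed yet, it can be installed with the same commands used in the project's CI workflow (see the uv documentation for other installation methods):

```bash
# Install uv on Linux/macOS (same installer the CI workflow uses)
curl -LsSf https://astral.sh/uv/install.sh | sh

# Install uv on Windows (PowerShell)
powershell -ExecutionPolicy ByPass -c "irm https://astral.sh/uv/install.ps1 | iex"
```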
26 | 27 | After installing UV, run: 28 | 29 | ```bash 30 | uv sync 31 | ``` 32 | 33 | This will create a virtual environment with the correct Python version and all the required dependencies, including: 34 | 35 | - Core dependencies 36 | - Development tools (`pytest`, `ruff`, etc.) 37 | - Documentation tools (`mkdocs`, etc.) 38 | 39 | ### Environment Variables 40 | 41 | Create a `.env` file in the project root with your API keys: 42 | 43 | ``` 44 | # LLM API Keys 45 | GROQ_API_KEY=your_groq_api_key 46 | OPENAI_API_KEY=your_openai_api_key 47 | DEEPSEEK_API_KEY=your_deepseek_api_key 48 | ANTHROPIC_API_KEY=your_anthropic_api_key 49 | ``` 50 | 51 | For instance, [Groq](https://console.groq.com/keys) offers free tokens without requiring payment details, making it ideal for contributions. 52 | 53 | ## Development Workflow 54 | 55 | ### Code Style 56 | 57 | SQLDeps uses Ruff for code formatting and linting: 58 | 59 | ```bash 60 | # Format code 61 | uv run ruff format . 62 | 63 | # Fix linting issues 64 | uv run ruff check . --fix 65 | ``` 66 | 67 | Alternatively, you can easily check and apply formatting and linting with `make`: 68 | 69 | ```bash 70 | # Check code style without fixing 71 | make check 72 | 73 | # Apply fixes 74 | make fix 75 | ``` 76 | 77 | ### Running Tests 78 | 79 | The test suite is set up with markers to allow selective testing: 80 | 81 | ```bash 82 | # Run all tests except those marked as 'llm' or 'integration' 83 | # This is the default when running pytest without arguments 84 | uv run pytest 85 | 86 | # Run tests with a specific marker 87 | uv run pytest -m llm # Run LLM-dependent tests (requires API keys) 88 | uv run pytest -m integration # Run integration tests (requires database) 89 | 90 | # Run tests with a specific framework 91 | uv run pytest --framework=groq 92 | 93 | # Run specific test files 94 | uv run pytest tests/unit/test_models.py 95 | 96 | # Run with coverage report 97 | uv run pytest --cov=sqldeps 98 | ``` 99 | 100 | Note that by default tests marked with `llm` and `integration` are skipped to avoid requiring external dependencies during CI/CD. These tests require valid API keys and/or database connections. 101 | 102 | ### Building Documentation 103 | 104 | ```bash 105 | # Build and serve documentation locally 106 | uv run mkdocs serve 107 | ``` 108 | 109 | This will start a local server at `http://127.0.0.1:8000` where you can preview the documentation. 110 | 111 | ## Project Structure 112 | 113 | Here's the simplified project structure: 114 | 115 | ``` 116 | sqldeps/ 117 | ├── .github/ # GitHub configuration files 118 | ├── configs/ # External configuration files for experiments 119 | ├── docs/ # Documentation files 120 | ├── sqldeps/ # Main package source code 121 | │ ├── app/ # Streamlit web application 122 | │ ├── database/ # Database connector implementations 123 | │ ├── llm_parsers/ # LLM integration for SQL parsing 124 | │ └── ... # Other core modules 125 | └── tests/ # Test suite 126 | ``` 127 | 128 | ## Adding Features 129 | 130 | ### Adding a New LLM Provider 131 | 132 | 1. Create a new file in `sqldeps/llm_parsers/` following the pattern of existing providers 133 | 2. Implement the required methods from `BaseSQLExtractor` 134 | 3. Add the new provider to `__init__.py` and the `DEFAULTS` dictionary 135 | 4. Add tests in `tests/` (both unit and functional tests) 136 | 137 | ### Adding Database Support 138 | 139 | 1. Create a new file in `sqldeps/database/` following the pattern of existing connectors 140 | 2. 
Implement the required methods from `SQLBaseConnector` 141 | 3. Add the new connector to `__init__.py` 142 | 4. Add tests in `tests/` (both unit and integration tests) 143 | 144 | ## Pull Request Process 145 | 146 | 1. Fork the repository 147 | 2. Create a new branch for your feature or bug fix 148 | 3. Make your changes and add tests 149 | 4. Run the tests and linting checks 150 | 5. Update documentation if necessary 151 | 6. Submit a pull request to the `main` branch 152 | 153 | ## Package Versioning 154 | 155 | SQLDeps follows [Semantic Versioning](https://semver.org/): 156 | 157 | - **MAJOR** version when making incompatible API changes 158 | - **MINOR** version when adding functionality in a backward-compatible manner 159 | - **PATCH** version when making backward-compatible bug fixes 160 | 161 | ## Code of Conduct 162 | 163 | Please be respectful and inclusive when contributing to SQLDeps. We strive to maintain a welcoming environment for all contributors. 164 | 165 | ## License 166 | 167 | By contributing to SQLDeps, you agree that your contributions will be licensed under the project's MIT License. 168 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Global Land Use and Environment Lab 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: fix 2 | fix: 3 | uv run ruff format . 4 | uv run ruff check . --fix 5 | 6 | .PHONY: check 7 | check: 8 | -uv run ruff format . --check 9 | -uv run ruff check . 10 | 11 | .PHONY: clean 12 | clean: 13 | # Remove Python cache directories 14 | find . -type d \( \ 15 | -name "__pycache__" -o \ 16 | -name "*.egg-info" -o \ 17 | -name ".eggs" -o \ 18 | -name ".ipynb_checkpoints" \ 19 | \) -exec rm -rf {} + 20 | 21 | # Remove compiled Python files 22 | find . -name "*.pyc" -delete 23 | 24 | # Remove build, test, and cache directories 25 | rm -rf dist build htmlcov .pytest_cache .ruff_cache .sqldeps_cache .mypy_cache .tox 2>/dev/null || true 26 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SQLDeps: SQL Dependency Extractor 2 | 3 |

[SQLDeps logo]

[Badges: Test · Documentation · Supported Python versions · Package version · License]

24 | 25 | A tool that automatically extracts and maps SQL dependencies and outputs using Large Language Models (LLMs). 26 | 27 | --- 28 | 29 | - **Documentation**: [https://sqldeps.readthedocs.io/](https://sqldeps.readthedocs.io/) 30 | - **Code repository**: [https://github.com/glue-lab/sqldeps](https://github.com/glue-lab/sqldeps) 31 | 32 | 33 | 34 | --- 35 | 36 | ## Overview 37 | 38 | SQLDeps analyzes SQL scripts to identify: 39 | 40 | 1. **Dependencies**: Tables and columns that must exist BEFORE query execution 41 | 2. **Outputs**: Tables and columns permanently CREATED or MODIFIED by the query 42 | 43 | It intelligently filters out temporary constructs like CTEs and derived tables, focusing only on the real database objects that matter. 44 | 45 | ### Benefits 46 | 47 | - 🛠️ **Change Management:** Safely modify schemas by identifying true dependencies 48 | - 💾 **Storage Optimization:** Focus resources on essential tables and columns 49 | - 🚢 **Migration Planning:** Precisely determine what needs to be migrated 50 | - 📝 **Project Documentation:** Create comprehensive maps of database dependencies 51 | 52 | ## Installation 53 | 54 | ```bash 55 | pip install sqldeps 56 | ``` 57 | 58 | For additional functionality: 59 | 60 | ```bash 61 | # Install with web app dependencies 62 | pip install "sqldeps[app]" 63 | 64 | # Install with data visualization dependencies 65 | pip install "sqldeps[dataviz]" 66 | 67 | # Install all optional dependencies 68 | pip install "sqldeps[app,postgres,dataviz]" 69 | ``` 70 | 71 | ## Quick Start 72 | 73 | SQLDeps provides both API and CLI interfaces: 74 | - **API**: Flexible for Python developers to integrate into scripts, notebooks, or applications. 75 | - **CLI**: Fast and user-friendly for analyzing files or folders directly from the command line.
76 | 77 | ### API Usage 78 | 79 | ```python 80 | from sqldeps.llm_parsers import create_extractor 81 | 82 | # Create extractor with default settings (framework="litellm", model="openai/gpt-4.1") 83 | extractor = create_extractor() 84 | 85 | # Extract dependencies and outputs from a SQL query 86 | sql_query = """ 87 | WITH user_orders AS ( 88 | SELECT o.user_id, COUNT(*) AS order_count 89 | FROM orders o 90 | JOIN users u ON o.user_id = u.id 91 | WHERE u.status = 'active' 92 | GROUP BY o.user_id 93 | ) 94 | 95 | CREATE TABLE transactions.user_order_summary AS 96 | SELECT * FROM user_orders; 97 | """ 98 | result = extractor.extract_from_query(sql_query) 99 | 100 | # Print the results 101 | print("Dependencies:") 102 | print(result.dependencies) 103 | print("\nOutputs:") 104 | print(result.outputs) 105 | 106 | # Or extract from a file 107 | result = extractor.extract_from_file('path/to/query.sql') 108 | 109 | # Convert to dictionary or DataFrame 110 | dict_format = result.to_dict() 111 | df_format = result.to_dataframe() 112 | ``` 113 | 114 | ### CLI Usage 115 | 116 | ```bash 117 | # Basic example with default settings 118 | sqldeps extract path/to/query.sql 119 | 120 | # Specify framework and output format 121 | sqldeps extract path/to/query.sql --framework=openai --model=gpt-4.1-mini -o results.json 122 | 123 | # Scan a folder recursively with intelligent parallelization 124 | sqldeps extract \ 125 | data/sql_folder \ # Automatically detect if path is file or folder 126 | --recursive \ # Scan folder recursively 127 | --framework=deepseek \ # Specify framework/provider 128 | --rpm 50 # Maximum 50 requests per minute 129 | --n-workers -1 \ # Use all available processors 130 | -o results.csv # Output a dataframe as CSV instead of JSON 131 | ``` 132 | 133 | ```bash 134 | # Get help on available commands 135 | sqldeps --help 136 | 137 | # Get help on extract - the main command 138 | sqldeps extract --help 139 | ``` 140 | 141 | ### Web Application 142 | 143 | SQLDeps also includes a Streamlit-based web interface: 144 | 145 | ```bash 146 | # Run the web app 147 | sqldeps app 148 | ``` 149 | 150 | **Note**: The web application is designed for single-file extraction and demonstration purposes. For processing multiple files or entire folders, use the API or CLI instead. 151 | 152 | ## Example 153 | 154 | Given this SQL query: 155 | 156 | ```sql 157 | -- Common Table Expression (CTE) to count user orders for active users 158 | WITH user_orders AS ( 159 | SELECT o.user_id, COUNT(*) AS order_count 160 | FROM orders o 161 | JOIN users u ON o.user_id = u.id 162 | WHERE u.status = 'active' 163 | GROUP BY o.user_id 164 | ) 165 | 166 | -- Create a new table from the CTE 167 | CREATE TABLE transactions.user_order_summary AS 168 | SELECT * FROM user_orders; 169 | ``` 170 | 171 | SQLDeps will extract: 172 | 173 | ```json 174 | { 175 | "dependencies": { 176 | "orders": ["user_id"], 177 | "users": ["id", "status"] 178 | }, 179 | "outputs": { 180 | "transactions.user_order_summary": ["*"] 181 | } 182 | } 183 | ``` 184 | 185 | Notice how: 186 | 187 | - CTE (`user_orders`) is correctly excluded 188 | - Real source tables (`orders`, `users`) are included as dependencies 189 | - Target table (`transactions.user_order_summary`) is correctly identified as output 190 | 191 | ## Supported Models 192 | 193 | All models available on [Groq](https://console.groq.com/docs/models), [OpenAI](https://platform.openai.com/docs/models), and [DeepSeek](https://api-docs.deepseek.com/). 
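To target a specific provider, pass a `provider/model` identifier to `create_extractor`. A brief sketch using model names taken from other examples in this README (model names change over time, so check each provider's model list):

```python
from sqldeps.llm_parsers import create_extractor

# Illustrative model identifiers borrowed from other examples in this README;
# substitute any model supported by your provider.
groq_extractor = create_extractor(model="groq/llama-3.3-70b-versatile")
openai_extractor = create_extractor(model="openai/gpt-4.1-mini")

result = groq_extractor.extract_from_query("SELECT id, name FROM users")
print(result.dependencies)
```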
194 | For up-to-date pricing details, please check [Groq](https://groq.com/pricing/), [OpenAI](https://platform.openai.com/docs/pricing), [DeepSeek](https://api-docs.deepseek.com/quick_start/pricing). 195 | 196 | ## API Keys / Configuration 197 | 198 | You'll need to set up API keys for your chosen LLM provider. Create a `.env` file in your project root: 199 | 200 | ``` 201 | # LLM API Keys 202 | GROQ_API_KEY=your_groq_api_key 203 | OPENAI_API_KEY=your_openai_api_key 204 | DEEPSEEK_API_KEY=your_deepseek_api_key 205 | ANTHROPIC_API_KEY=your_anthropic_api_key 206 | 207 | # Database credentials (for schema validation) 208 | DB_HOST=localhost 209 | DB_PORT=5432 210 | DB_NAME=mydatabase 211 | DB_USER=username 212 | DB_PASSWORD=password 213 | ``` 214 | 215 | > **Tip:** [Groq](https://console.groq.com/keys) offers free tokens without requiring payment details, making it ideal for getting started quickly. 216 | 217 | ## Advanced Usage 218 | 219 | ### Database Schema Matching 220 | 221 | SQLDeps allows the user to match SQLDeps results (table/column dependencies and outputs) with database schemas to retrieve column data types. 222 | 223 | ```python 224 | from sqldeps.database import PostgreSQLConnector 225 | from sqldeps.llm_parsers import create_extractor 226 | 227 | # Extract dependencies 228 | extractor = create_extractor(model="openai/gpt-4.1-mini") 229 | result = extractor.extract_from_file('query.sql') 230 | 231 | # Connect to database and validate 232 | conn = PostgreSQLConnector( 233 | host="localhost", 234 | port=5432, 235 | database="mydatabase", 236 | username="username" 237 | ) 238 | 239 | # Match extracted dependencies against database schema 240 | matching_schema = extractor.match_database_schema( 241 | result, 242 | db_connection=conn, 243 | target_schemas=["public", "sales"] 244 | ) 245 | 246 | # View validation results as a pandas DataFrame 247 | print(matching_schema) 248 | ``` 249 | 250 | For custom database YAML configuration file (optional): 251 | 252 | ```yml 253 | # database.yml 254 | database: 255 | host: localhost 256 | port: 5432 257 | database: mydatabase 258 | username: username 259 | password: password 260 | ``` 261 | 262 | ### Using Custom Prompts 263 | 264 | You can customize the prompts used to instruct the LLM: 265 | 266 | ```python 267 | # Create extractor with custom prompt 268 | extractor = create_extractor( 269 | model="groq/llama-3.3-70b-versatile", 270 | prompt_path="path/to/custom_prompt.yml" 271 | ) 272 | ``` 273 | 274 | The custom prompt YAML should include: 275 | 276 | ```yaml 277 | system_prompt: | 278 | You are a SQL analyzer that extracts two key elements from SQL queries: 279 | 280 | 1. DEPENDENCIES: Tables and columns that must exist BEFORE query execution. 281 | 2. OUTPUTS: Tables and columns permanently CREATED or MODIFIED by the query. 282 | 283 | # Add detailed instructions for the LLM here... 
284 | 285 | user_prompt: | 286 | Extract SQL dependencies and outputs from this query: 287 | {sql} 288 | ``` 289 | 290 | ### Interactive Visualization of SQL Dependency Graphs 291 | 292 | SQLDeps provides built-in visualization capabilities to help you understand complex SQL dependencies: 293 | 294 | ```python 295 | from sqldeps.llm_parsers import create_extractor 296 | from sqldeps.visualization import visualize_sql_dependencies 297 | 298 | # Create an interactive network graph from multiple SQL files 299 | extractor = create_extractor() 300 | sql_profiles = extractor.extract_from_folder("path/to/folder", recursive=False) 301 | 302 | # Generate an interactive visualization (saving output to an HTML file) 303 | figure = visualize_sql_dependencies(sql_profiles, output_path="dependencies.html") 304 | 305 | # Show figure 306 | figure.show() 307 | ``` 308 | 309 | ## Documentation 310 | 311 | For comprehensive documentation, including API reference and examples, visit [https://sqldeps.readthedocs.io](https://sqldeps.readthedocs.io/). 312 | 313 | ## Contributing 314 | 315 | Contributions are welcome! 316 | 317 | - Found a bug? Please [open an issue](https://github.com/glue-lab/sqldeps/issues) with detailed information. 318 | - Missing a feature? Feel free to [suggest enhancements](https://github.com/glue-lab/sqldeps/discussions/categories/ideas) or submit a pull request. 319 | 320 | Please check out the [Contributing Guide](https://sqldeps.readthedocs.io/en/latest/contributing/) for details. 321 | 322 | 323 | ## License 324 | 325 | MIT 326 | -------------------------------------------------------------------------------- /configs/database.yml: -------------------------------------------------------------------------------- 1 | database: 2 | host: xx.xx.xx.xx 3 | port: 5432 4 | database: database_name 5 | username: username 6 | password: password 7 | -------------------------------------------------------------------------------- /configs/prompts/default.yml: -------------------------------------------------------------------------------- 1 | system_prompt: | 2 | You are a SQL analyzer that extracts two key elements from SQL queries: 3 | 4 | 1. DEPENDENCIES: Tables and columns that must exist BEFORE query execution. 5 | - Source tables in `FROM`, `JOIN`, CTEs, subqueries, etc. 6 | - ALL target tables in operations like `INSERT`, `UPDATE`, `DELETE`, `TRUNCATE` must be included in dependencies. 7 | - Referenced columns in `SELECT`, `WHERE`, `CASE`, `JOIN`, `GROUP BY`, `HAVING`, `ORDER BY`, etc. 8 | - Columns used in expressions, `CASE` statements, and aggregate functions. 9 | 10 | 2. OUTPUTS: Tables and columns permanently CREATED or MODIFIED by the query. 11 | - Tables modified with `INSERT`, `UPDATE`, `DELETE`, `TRUNCATE`. 12 | - Target columns in these operations. 13 | - Tables created with `CREATE TABLE`. 14 | 15 | KEY RULES (CRITICAL): 16 | - ALL target tables (INSERT, UPDATE, DELETE, TRUNCATE) MUST appear in BOTH dependencies AND outputs. 17 | - Example: For `INSERT INTO table_x (col1, col2) VALUES (1, 2)` 18 | → Dependencies: `{"table_x": ["col1", "col2"]}` 19 | → Outputs: `{"table_x": ["col1", "col2"]}` 20 | - Example: For `TRUNCATE TABLE table_x` 21 | → Dependencies: `{"table_x": []}` 22 | → Outputs: `{"table_x": []}` 23 | - This applies even inside CTEs, functions or stored procedures. 24 | - EXCEPTIONS: 25 | - If a table is CREATED in the same query (CREATE TABLE), it appears ONLY in outputs. 
26 | - If a table is used in INSERT statement after TRUNCATE, include the specified columns for both dependencies and outputs instead of an empty list. 27 | - ALWAYS include schema-qualified tables (e.g., `schema.table`) in both dependencies and outputs, preserving the schema name. 28 | 29 | COLUMN HANDLING: 30 | - Explicit `SELECT *` should return ["*"] in dependencies 31 | - Functions like COUNT(*) with specific names, do NOT use ["*"], only include explicitly named columns 32 | - Example: For `SELECT COUNT(*), name FROM users`, dependencies would include `{"users": ["name"]}` 33 | - INSERT without column list creates dependencies on all columns in the target table: ["*"] 34 | 35 | CTE HANDLING: 36 | - CTEs (WITH queries) are temporary structures and should NOT be included as dependencies or outputs themselves. 37 | - However, include tables and columns used within CTEs that originated outside, since they are dependencies. 38 | - Example: 39 | ```sql 40 | WITH cte AS (SELECT * FROM table_x) 41 | INSERT INTO table_y SELECT * FROM cte 42 | ``` 43 | → Dependencies: `{"table_x": ["*"], "table_y": ["*"]}` 44 | → Outputs: `{"table_y": ["*"]}` 45 | 46 | SCHEMA-QUALIFIED TABLES: 47 | - Always preserve schema names exactly as they appear in the query. 48 | - Example: For `INSERT INTO schema_a.table_x SELECT col1,col2 FROM schema_b.table_y` 49 | → Dependencies: `{"schema_a.table_x": ["*"], "schema_b.table_y": ["col1","col2"]}` 50 | → Outputs: `{"schema_a.table_x": ["*"]}` 51 | 52 | FUNCTION & PROCEDURE HANDLING: 53 | - Even inside functions or stored procedures, any `INSERT`, `UPDATE`, `DELETE`, or `TRUNCATE` statements affect real tables and must be included as dependencies and outputs. 54 | 55 | ADDITIONAL CONSIDERATIONS: 56 | - Resolve table aliases to real table names. 57 | - `CASE` expressions → dependencies on all examined columns. 58 | - `MERGE`/`UPSERT` → both dependencies and outputs. 59 | - Ignore variables and parameters as dependencies. 60 | 61 | OUTPUT JSON FORMAT: 62 | { 63 | "dependencies": {"table_name": ["column1", "column2"]}, 64 | "outputs": {"table_name": ["column1", "column2"]} 65 | } 66 | 67 | user_prompt: | 68 | Extract SQL dependencies (tables/columns needed BEFORE execution) and outputs (tables/columns CREATED or MODIFIED) from this query. 
69 | 70 | Respond ONLY with JSON in this exact format: 71 | {{ 72 | "dependencies": {{"table_name": ["column1", "column2"]}}, 73 | "outputs": {{"table_name": ["column1", "column2"]}} 74 | }} 75 | 76 | SQL query to analyze: 77 | {sql} 78 | -------------------------------------------------------------------------------- /data/examples/example.sql: -------------------------------------------------------------------------------- 1 | -- Common Table Expression (CTE) to count user orders for active users 2 | WITH user_orders AS ( 3 | SELECT o.user_id, COUNT(*) AS order_count 4 | FROM orders o 5 | JOIN users u ON o.user_id = u.id 6 | WHERE u.status = 'active' 7 | GROUP BY o.user_id 8 | ) 9 | 10 | -- Create a new table from the CTE 11 | CREATE TABLE transactions.user_order_summary AS 12 | SELECT * FROM user_orders; 13 | 14 | -- Truncate an existing table before repopulating 15 | TRUNCATE TABLE order_summary; 16 | -------------------------------------------------------------------------------- /data/examples/folders_with_sql_files/example1.sql: -------------------------------------------------------------------------------- 1 | -- Simple query selecting a subset of columns 2 | SELECT id, name FROM users -------------------------------------------------------------------------------- /data/examples/folders_with_sql_files/example2.sql: -------------------------------------------------------------------------------- 1 | -- Simple query selecting all columns 2 | SELECT * FROM users LIMIT 100 -------------------------------------------------------------------------------- /data/examples/folders_with_sql_files/random_file.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/glue-lab/sqldeps/66ae7e1313f767417f2dc83e7d766ff478168082/data/examples/folders_with_sql_files/random_file.txt -------------------------------------------------------------------------------- /data/examples/folders_with_sql_files/subfolder1/example3.sql: -------------------------------------------------------------------------------- 1 | -- Query with table alias, with and without database specification, and join 2 | SELECT u.id, u.name, o.order_id 3 | FROM my_db.users u 4 | JOIN orders AS o ON u.id = o.user_id -------------------------------------------------------------------------------- /data/examples/folders_with_sql_files/subfolder1/example3_expected.json: -------------------------------------------------------------------------------- 1 | { 2 | "tables": ["my_db.users", "orders"], 3 | "columns": { 4 | "my_db.users": ["id", "name"], 5 | "orders": ["order_id", "user_id"] 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /data/examples/folders_with_sql_files/subfolder2/random_file.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/glue-lab/sqldeps/66ae7e1313f767417f2dc83e7d766ff478168082/data/examples/folders_with_sql_files/subfolder2/random_file.txt -------------------------------------------------------------------------------- /data/examples/folders_with_sql_files/subfolder2/subfolder2_1/example4.sql: -------------------------------------------------------------------------------- 1 | -- Query with table alias, with and without database specification, and join, and where clauses 2 | SELECT u.id, u.name, o.order_id 3 | FROM my_db.users u 4 | JOIN orders AS o ON u.id = o.user_id 5 | WHERE u.status = 'active' 6 | AND o.order_date >= '2024-01-01' 7 | 
AND o.total_amount > 100.00 8 | AND u.email LIKE '%@company.com' 9 | AND o.order_type IN ('retail', 'wholesale') 10 | AND ( 11 | o.shipping_status = 'pending' 12 | OR (o.shipping_status = 'processed' AND o.priority_level = 'high') 13 | ); -------------------------------------------------------------------------------- /data/examples/folders_with_sql_files/subfolder2/subfolder2_1/example5.sql: -------------------------------------------------------------------------------- 1 | -- Simple CTE 2 | WITH user_orders AS ( 3 | SELECT user_id, COUNT(*) as order_count 4 | FROM orders 5 | GROUP BY user_id 6 | ) 7 | SELECT u.name, uo.order_count 8 | FROM users u 9 | JOIN user_orders uo ON u.id = uo.user_id; -------------------------------------------------------------------------------- /data/examples/folders_with_sql_files/subfolder2/subfolder2_1/random_file.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/glue-lab/sqldeps/66ae7e1313f767417f2dc83e7d766ff478168082/data/examples/folders_with_sql_files/subfolder2/subfolder2_1/random_file.yml -------------------------------------------------------------------------------- /docs/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/glue-lab/sqldeps/66ae7e1313f767417f2dc83e7d766ff478168082/docs/.gitkeep -------------------------------------------------------------------------------- /docs/api-reference/app.md: -------------------------------------------------------------------------------- 1 | # Web App Reference 2 | 3 | ::: sqldeps.app 4 | -------------------------------------------------------------------------------- /docs/api-reference/cache.md: -------------------------------------------------------------------------------- 1 | # Cache Reference 2 | 3 | ::: sqldeps.cache 4 | -------------------------------------------------------------------------------- /docs/api-reference/cli.md: -------------------------------------------------------------------------------- 1 | # Command Line Interface 2 | 3 | ::: sqldeps.cli 4 | -------------------------------------------------------------------------------- /docs/api-reference/config.md: -------------------------------------------------------------------------------- 1 | # Config Reference 2 | 3 | ::: sqldeps.config 4 | -------------------------------------------------------------------------------- /docs/api-reference/database.md: -------------------------------------------------------------------------------- 1 | # Database Reference 2 | 3 | ::: sqldeps.database 4 | -------------------------------------------------------------------------------- /docs/api-reference/llm-parsers.md: -------------------------------------------------------------------------------- 1 | # LLM Parsers Reference 2 | 3 | ::: sqldeps.llm_parsers 4 | -------------------------------------------------------------------------------- /docs/api-reference/models.md: -------------------------------------------------------------------------------- 1 | # Models Reference 2 | 3 | ::: sqldeps.models 4 | -------------------------------------------------------------------------------- /docs/api-reference/parallel.md: -------------------------------------------------------------------------------- 1 | # Parallelization Reference 2 | 3 | ::: sqldeps.parallel 4 | -------------------------------------------------------------------------------- /docs/api-reference/rate-limiter.md: 
-------------------------------------------------------------------------------- 1 | # Rate Limiter Reference 2 | 3 | ::: sqldeps.rate_limiter 4 | -------------------------------------------------------------------------------- /docs/api-reference/utils.md: -------------------------------------------------------------------------------- 1 | # Utils Reference 2 | 3 | ::: sqldeps.utils 4 | -------------------------------------------------------------------------------- /docs/api-reference/visualization.md: -------------------------------------------------------------------------------- 1 | # Interactive Graphs of SQL Dependency 2 | 3 | ::: sqldeps.visualization 4 | -------------------------------------------------------------------------------- /docs/assets/images/sqldeps_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/glue-lab/sqldeps/66ae7e1313f767417f2dc83e7d766ff478168082/docs/assets/images/sqldeps_logo.png -------------------------------------------------------------------------------- /docs/authors.md: -------------------------------------------------------------------------------- 1 | # Authors and Maintainers 2 | 3 | ## Core Team 4 | 5 | ### Primary Author 6 | - **Cainã Max Couto da Silva** - *Initial work and primary developer* 7 | - [:fontawesome-brands-github:](https://github.com/cmcouto-silva) [GitHub](https://github.com/cmcouto-silva) | [:fontawesome-brands-linkedin:](https://www.linkedin.com/in/cmcouto-silva/) [LinkedIn](https://www.linkedin.com/in/cmcouto-silva/) | [:material-email:](mailto:coutodasilva@wisc.edu) coutodasilva@wisc.edu 8 | - [:material-school:](https://gibbs-lab.wisc.edu/) Global Land Use and Environment Lab, UW-Madison 9 | 10 | ### Project Lead 11 | - **Matt Christie** - *Project supervision and code review* 12 | - [:fontawesome-brands-github:](https://github.com/mjchristie) [GitHub](https://github.com/mjchristie) | [:material-email:](mailto:mjchristie@wisc.edu) mjchristie@wisc.edu 13 | - [:material-school:](https://gibbs-lab.wisc.edu/) Global Land Use and Environment Lab, UW-Madison 14 | 15 | ## Institutional Support 16 | 17 | SQLDeps was developed as part of research and development work at the University of Wisconsin-Madison's Nelson Institute for Environmental Studies, specifically within the [Global Land Use and Environment (GLUE) Lab](https://gibbs-lab.wisc.edu/). 18 | 19 | The project aims to support data analysis workflows and improve database management practices. 20 | 21 | ## Contributors 22 | 23 | We appreciate all contributions to SQLDeps! 24 | If you contribute to this project, your name will be added here. 25 | 26 | ## Becoming a Contributor 27 | 28 | Interested in contributing to SQLDeps? 29 | Please check the [Contributing Guide](contributing.md) for details on how to get started. 30 | 31 | ## Contact 32 | 33 | For questions about the project, please open an issue on GitHub or contact the maintainers directly via email. 
34 | -------------------------------------------------------------------------------- /docs/changelog.md: -------------------------------------------------------------------------------- 1 | --8<-- "CHANGELOG.md" 2 | -------------------------------------------------------------------------------- /docs/contributing.md: -------------------------------------------------------------------------------- 1 | --8<-- "CONTRIBUTING.md" 2 | -------------------------------------------------------------------------------- /docs/docs-requirements.txt: -------------------------------------------------------------------------------- 1 | mkdocs==1.6.1 2 | mkdocs-material==9.6.9 3 | pymdown-extensions==10.14.3 4 | mkdocstrings[python]==0.29.1 5 | -------------------------------------------------------------------------------- /docs/examples.md: -------------------------------------------------------------------------------- 1 | # Examples 2 | 3 | Here are some practical examples of using SQLDeps for different use cases. 4 | 5 | ## Basic Dependency Extraction 6 | 7 | ### Example 1: Simple SELECT Query 8 | 9 | ```sql 10 | -- example1.sql 11 | SELECT u.id, u.name, o.order_id, o.amount 12 | FROM users u 13 | JOIN orders o ON u.id = o.user_id 14 | WHERE u.status = 'active' 15 | ``` 16 | 17 | Extracted dependencies: 18 | 19 | ```json 20 | { 21 | "dependencies": { 22 | "users": ["id", "name", "status"], 23 | "orders": ["order_id", "amount", "user_id"] 24 | }, 25 | "outputs": {} 26 | } 27 | ``` 28 | 29 | ### Example 2: CTEs and Table Creation 30 | 31 | ```sql 32 | -- example2.sql 33 | WITH user_orders AS ( 34 | SELECT o.user_id, COUNT(*) AS order_count 35 | FROM orders o 36 | JOIN users u ON o.user_id = u.id 37 | WHERE u.status = 'active' 38 | GROUP BY o.user_id 39 | ) 40 | 41 | CREATE TABLE transactions.user_order_summary AS 42 | SELECT * FROM user_orders; 43 | ``` 44 | 45 | Extracted dependencies: 46 | 47 | ```json 48 | { 49 | "dependencies": { 50 | "orders": ["user_id"], 51 | "users": ["id", "status"] 52 | }, 53 | "outputs": { 54 | "transactions.user_order_summary": ["*"] 55 | } 56 | } 57 | ``` 58 | 59 | ### Example 3: UPDATE Operation 60 | 61 | ```sql 62 | -- example3.sql 63 | UPDATE users 64 | SET status = 'inactive' 65 | WHERE last_login < CURRENT_DATE - INTERVAL '90 days' 66 | AND status = 'active'; 67 | ``` 68 | 69 | Extracted dependencies: 70 | 71 | ```json 72 | { 73 | "dependencies": { 74 | "users": ["status", "last_login"] 75 | }, 76 | "outputs": { 77 | "users": ["status"] 78 | } 79 | } 80 | ``` 81 | 82 | ### Example 4: INSERT Operation 83 | 84 | ```sql 85 | -- example4.sql 86 | INSERT INTO sales.order_summary (date, total_orders, total_amount) 87 | SELECT 88 | DATE_TRUNC('day', order_date) as date, 89 | COUNT(*) as total_orders, 90 | SUM(amount) as total_amount 91 | FROM orders 92 | GROUP BY DATE_TRUNC('day', order_date); 93 | ``` 94 | 95 | Extracted dependencies: 96 | 97 | ```json 98 | { 99 | "dependencies": { 100 | "orders": ["order_date", "amount"], 101 | "sales.order_summary": ["date", "total_orders", "total_amount"] 102 | }, 103 | "outputs": { 104 | "sales.order_summary": ["date", "total_orders", "total_amount"] 105 | } 106 | } 107 | ``` -------------------------------------------------------------------------------- /docs/getting-started/installation.md: -------------------------------------------------------------------------------- 1 | # Installation 2 | 3 | ## Prerequisites 4 | 5 | - Python 3.10 or higher 6 | - API keys for your preferred LLM provider (Groq, OpenAI, or DeepSeek) 7 | 8 | ## 
Install from PyPI 9 | 10 | The simplest way to install SQLDeps is via pip: 11 | 12 | ```bash 13 | pip install sqldeps 14 | ``` 15 | 16 | For additional functionality, you can install optional dependencies: 17 | 18 | ```bash 19 | # Install with web app dependencies 20 | pip install "sqldeps[app]" 21 | 22 | # Install with data visualization dependencies 23 | pip install "sqldeps[dataviz]" 24 | 25 | # Install all optional dependencies 26 | pip install "sqldeps[app,postgres,dataviz]" 27 | ``` 28 | 29 | ## Setup API Keys 30 | 31 | SQLDeps requires API keys for the LLM providers you want to use. These keys are set through environment variables. 32 | 33 | ### Environment Variables 34 | 35 | Create a `.env` file in your project root with your API keys: 36 | 37 | ``` 38 | # LLM API Keys 39 | GROQ_API_KEY=your_groq_api_key 40 | OPENAI_API_KEY=your_openai_api_key 41 | DEEPSEEK_API_KEY=your_deepseek_api_key 42 | ANTHROPIC_API_KEY=your_anthropic_api_key 43 | 44 | # Optional: Database credentials (for schema validation) 45 | DB_HOST=localhost 46 | DB_PORT=5432 47 | DB_NAME=mydatabase 48 | DB_USER=username 49 | DB_PASSWORD=password 50 | ``` 51 | 52 | SQLDeps will automatically load variables from the .env file when you import the package. 53 | 54 | > **Tip:** [Groq](https://console.groq.com/keys) offers free tokens without requiring payment details, making it ideal for getting started quickly. 55 | 56 | ## Database Configuration (Optional) 57 | 58 | If you plan to use the database features, you can set up your database credentials in several ways: 59 | 60 | ### YAML Configuration File 61 | 62 | ```yaml 63 | # database.yml 64 | database: 65 | host: localhost 66 | port: 5432 67 | database: mydatabase 68 | username: username 69 | password: password 70 | ``` 71 | 72 | ### Environment Variables 73 | 74 | ``` 75 | DB_HOST=localhost 76 | DB_PORT=5432 77 | DB_NAME=mydatabase 78 | DB_USER=username 79 | DB_PASSWORD=password 80 | ``` 81 | 82 | ### PostgreSQL Password File 83 | 84 | SQLDeps also supports reading credentials from the standard PostgreSQL password file (`~/.pgpass`). 85 | 86 | ## Verify Installation 87 | 88 | You can verify your installation by running: 89 | 90 | ```bash 91 | sqldeps --help 92 | ``` 93 | 94 | This should display the command-line help information for SQLDeps. 95 | -------------------------------------------------------------------------------- /docs/getting-started/quick-start.md: -------------------------------------------------------------------------------- 1 | # Quick Start 2 | 3 | SQLDeps provides both API and CLI interfaces for extracting dependencies from SQL queries. 
4 | 5 | ## API Usage 6 | 7 | ```python 8 | from sqldeps.llm_parsers import create_extractor 9 | 10 | # Create extractor with default settings (framework="litellm", model="openai/gpt-4.1") 11 | extractor = create_extractor() 12 | 13 | # Extract dependencies and outputs from a SQL query 14 | sql_query = """ 15 | WITH user_orders AS ( 16 | SELECT o.user_id, COUNT(*) AS order_count 17 | FROM orders o 18 | JOIN users u ON o.user_id = u.id 19 | WHERE u.status = 'active' 20 | GROUP BY o.user_id 21 | ) 22 | 23 | CREATE TABLE transactions.user_order_summary AS 24 | SELECT * FROM user_orders; 25 | """ 26 | result = extractor.extract_from_query(sql_query) 27 | 28 | # Print the results 29 | print("Dependencies:") 30 | print(result.dependencies) 31 | print("\nOutputs:") 32 | print(result.outputs) 33 | 34 | # Or extract from a file 35 | result = extractor.extract_from_file('path/to/query.sql') 36 | 37 | # Convert to dictionary or DataFrame 38 | dict_format = result.to_dict() 39 | df_format = result.to_dataframe() 40 | ``` 41 | 42 | ## CLI Usage 43 | 44 | ```bash 45 | # Basic example with default settings 46 | sqldeps extract path/to/query.sql 47 | 48 | # Specify framework and output format 49 | sqldeps extract path/to/query.sql --framework=litellm --model=gpt-4.1-mini -o results.json 50 | 51 | # Scan a folder recursively with intelligent parallelization 52 | sqldeps extract \ 53 | data/sql_folder \ # Automatically detect if path is file or folder 54 | --recursive \ # Scan folder recursively 55 | --framework=deepseek \ # Specify framework/provider 56 | --rpm 50 # Maximum 50 requests per minute 57 | --n-workers -1 \ # Use all available processors 58 | -o results.csv # Output a dataframe as CSV instead of JSON 59 | ``` 60 | 61 | ```bash 62 | # Get help on available commands 63 | sqldeps --help 64 | 65 | # Get help on extract - the main command 66 | sqldeps extract --help 67 | ``` 68 | 69 | ## Web Application 70 | 71 | SQLDeps includes a Streamlit-based web interface: 72 | 73 | ```bash 74 | # Run the web app 75 | sqldeps app 76 | ``` 77 | 78 | **Note**: The web application is designed for single-file extraction and demonstration purposes. For processing multiple files or entire folders, use the API or CLI instead. 
79 | 80 | ## Example 81 | 82 | Given this SQL query: 83 | 84 | ```sql 85 | -- Common Table Expression (CTE) to count user orders for active users 86 | WITH user_orders AS ( 87 | SELECT o.user_id, COUNT(*) AS order_count 88 | FROM orders o 89 | JOIN users u ON o.user_id = u.id 90 | WHERE u.status = 'active' 91 | GROUP BY o.user_id 92 | ) 93 | 94 | -- Create a new table from the CTE 95 | CREATE TABLE transactions.user_order_summary AS 96 | SELECT * FROM user_orders; 97 | ``` 98 | 99 | SQLDeps will extract: 100 | 101 | ```json 102 | { 103 | "dependencies": { 104 | "orders": ["user_id"], 105 | "users": ["id", "status"] 106 | }, 107 | "outputs": { 108 | "transactions.user_order_summary": ["*"] 109 | } 110 | } 111 | ``` 112 | 113 | Notice how: 114 | 115 | - CTE (`user_orders`) is correctly excluded 116 | - Real source tables (`orders`, `users`) are included as dependencies 117 | - Target table (`transactions.user_order_summary`) is correctly identified as output 118 | 119 | ## Next Steps 120 | 121 | - Read the [API Usage](../user-guide/api-usage.md) guide for detailed API options 122 | - Read the [CLI Usage](../user-guide/cli-usage.md) for easy-to-use command-line features 123 | - Explore [Database Integration](../user-guide/database-integration.md) for schema validation and data type retrieval 124 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | # SQLDeps: SQL Dependency Extractor 2 | 3 |

4 | SQLDeps Logo 5 |

6 | 7 | 12 | 13 | A powerful tool that automatically extracts and maps SQL dependencies and outputs using Large Language Models (LLMs). 14 | 15 | ## Overview 16 | 17 | SQLDeps analyzes SQL scripts to identify: 18 | 19 | 1. **Dependencies**: Tables and columns that must exist BEFORE query execution 20 | 2. **Outputs**: Tables and columns permanently CREATED or MODIFIED by the query 21 | 22 | It intelligently filters out temporary constructs like CTEs and derived tables, focusing only on the real database objects that matter. 23 | 24 | ## Key Benefits 25 | 26 | - 🛠️ **Change Management:** Safely modify schemas by identifying true dependencies 27 | - 💾 **Storage Optimization:** Focus resources on essential tables and columns 28 | - 🚢 **Migration Planning:** Precisely determine what needs to be migrated 29 | - 📝 **Project Documentation:** Create comprehensive maps of database dependencies 30 | 31 | ## Why SQLDeps? 32 | 33 | Traditional approaches to SQL dependency tracking: 34 | 35 | - **Manual Inspection**: Time-consuming and error-prone 36 | - **Traditional Parsers**: Lack context and intelligence for complex SQL 37 | 38 | SQLDeps leverages the power of Large Language Models to provide intelligent, accurate dependency extraction that works across SQL dialects and complexity levels. 39 | 40 | ## Supported LLM Providers 41 | 42 | All models available on [LiteLLM](https://docs.litellm.ai/docs/providers), including [Groq](https://console.groq.com/docs/models), [OpenAI](https://platform.openai.com/docs/models), and [DeepSeek](https://api-docs.deepseek.com/). 43 | 44 | ## Getting Started 45 | 46 | ```bash 47 | # Install the package 48 | pip install sqldeps 49 | 50 | # Basic usage 51 | sqldeps extract path/to/query.sql 52 | ``` 53 | 54 | Check out the [Quick Start](getting-started/quick-start.md) guide for more details. 55 | 56 | -------------------------------------------------------------------------------- /docs/stylesheets/custom.css: -------------------------------------------------------------------------------- 1 | /* Primary theme color customization */ 2 | :root { 3 | --md-primary-fg-color: #bf050b; 4 | --md-primary-fg-color--light: #e01b22; 5 | --md-primary-fg-color--dark: #8f0408; 6 | 7 | /* Ensure text has sufficient contrast */ 8 | --md-primary-bg-color: #ffffff; 9 | --md-primary-bg-color--light: #ffffff; 10 | } 11 | 12 | /* Make sure text on primary color background is visible */ 13 | .md-header { 14 | color: var(--md-primary-bg-color); 15 | } 16 | 17 | /* Direct link color overrides */ 18 | .md-typeset a { 19 | color: #0066cc !important; 20 | } 21 | .md-typeset a:hover { 22 | color: #111111 !important; 23 | } 24 | 25 | .md-typeset a:hover { 26 | text-decoration: underline; 27 | } 28 | -------------------------------------------------------------------------------- /docs/user-guide/api-usage.md: -------------------------------------------------------------------------------- 1 | # API Usage 2 | 3 | SQLDeps provides a comprehensive Python API for extracting SQL dependencies and validating them against database schemas.
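At a glance, a typical workflow looks like the sketch below (connection details are placeholders); each step is covered in detail in the sections that follow:

```python
from sqldeps.database import PostgreSQLConnector
from sqldeps.llm_parsers import create_extractor

# 1. Create an extractor (default framework and model)
extractor = create_extractor()

# 2. Extract dependencies and outputs from a SQL file (illustrative path)
result = extractor.extract_from_file("path/to/query.sql")

# 3. Optionally validate the extraction against a live database schema
db_conn = PostgreSQLConnector(
    host="localhost",
    port=5432,
    database="mydatabase",
    username="username",  # password resolved from .pgpass or environment variables
)
validated_schema = extractor.match_database_schema(result, db_connection=db_conn)
```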
4 | 5 | ## Creating an Extractor 6 | 7 | The main entry point for using SQLDeps is the `create_extractor()` function: 8 | 9 | ```python 10 | from sqldeps.llm_parsers import create_extractor 11 | 12 | # Create extractor with default settings (framework="litellm", model="openai/gpt-4.1") 13 | extractor = create_extractor() 14 | 15 | # Specify a different framework and model 16 | extractor = create_extractor( 17 | framework="openai", 18 | model="gpt-4o" 19 | ) 20 | 21 | # Specify additional parameters for the LLM 22 | extractor = create_extractor( 23 | framework="litellm", 24 | model="openai/gpt-4.1-mini", 25 | params={"temperature": 0.1} 26 | ) 27 | 28 | # Use a custom prompt template 29 | extractor = create_extractor( 30 | framework="deepseek", 31 | model="deepseek-chat", 32 | prompt_path="path/to/custom_prompt.yml" 33 | ) 34 | ``` 35 | 36 | Note that the API keys should be set through environment variables as explained in the [Installation](../getting-started/installation.md) guide. 37 | 38 | ## Extracting Dependencies 39 | 40 | Once you have an extractor, you can use it to extract dependencies from SQL queries, files, or folders: 41 | 42 | ### From a Query String 43 | 44 | ```python 45 | # Extract from a SQL query string 46 | sql_query = """ 47 | SELECT u.id, u.name, o.order_id, o.amount 48 | FROM users u 49 | JOIN orders o ON u.id = o.user_id 50 | WHERE u.status = 'active' 51 | """ 52 | 53 | result = extractor.extract_from_query(sql_query) 54 | ``` 55 | 56 | ### From a File 57 | 58 | ```python 59 | # Extract from a SQL file 60 | result = extractor.extract_from_file("path/to/query.sql") 61 | ``` 62 | 63 | ### From a Folder 64 | 65 | ```python 66 | # Extract from all SQL files in a folder 67 | result = extractor.extract_from_folder("path/to/sql_folder") 68 | 69 | # Extract recursively from all SQL files in a folder and subfolders 70 | result = extractor.extract_from_folder("path/to/sql_folder", recursive=True) 71 | 72 | # Extract from files with specific extensions 73 | result = extractor.extract_from_folder( 74 | "path/to/sql_folder", 75 | recursive=True, 76 | valid_extensions={"sql", "pgsql", "tsql"} 77 | ) 78 | 79 | # Process with parallel workers (uses all available CPUs) 80 | result = extractor.extract_from_folder( 81 | "path/to/sql_folder", 82 | recursive=True, 83 | n_workers=-1, # -1 means all available workers 84 | rpm=100 # Rate limit to 100 requests per minute 85 | ) 86 | 87 | # Merge results into a single SQLProfile 88 | result = extractor.extract_from_folder( 89 | "path/to/sql_folder", 90 | recursive=True, 91 | merge_sql_profiles=True 92 | ) 93 | ``` 94 | 95 | ## Working with Results 96 | 97 | The `extract_*` methods return a `SQLProfile` object that contains the extracted dependencies and outputs: 98 | 99 | ```python 100 | # Access dependencies and outputs as dictionaries 101 | dependencies = result.dependencies # Dict of tables and their columns 102 | outputs = result.outputs # Dict of tables and columns created or modified 103 | 104 | # Get a list of all referenced tables 105 | tables = result.dependency_tables 106 | 107 | # Get a list of all output tables 108 | output_tables = result.outcome_tables 109 | 110 | # Convert to a dictionary 111 | result_dict = result.to_dict() 112 | 113 | # Convert to a DataFrame for easier analysis 114 | result_df = result.to_dataframe() 115 | ``` 116 | 117 | ## Database Schema Validation 118 | 119 | You can validate the extracted dependencies against a real database schema: 120 | 121 | ```python 122 | from sqldeps.database import 
PostgreSQLConnector 123 | 124 | # Connect to the database 125 | db_conn = PostgreSQLConnector( 126 | host="localhost", 127 | port=5432, 128 | database="mydatabase", 129 | username="user" 130 | # Password from .pgpass or environment variables 131 | ) 132 | 133 | # Match extracted dependencies against database schema 134 | validated_schema = extractor.match_database_schema( 135 | result, # The SQLProfile from extraction 136 | db_connection=db_conn, 137 | target_schemas=["public", "sales"] # Optional: schemas to validate against 138 | ) 139 | 140 | # The result is a DataFrame with database schema information 141 | print(validated_schema) 142 | 143 | # Filter for exact matches 144 | exact_matches = validated_schema[validated_schema["exact_match"]] 145 | 146 | # Filter for schema-agnostic matches or cross-schema matches 147 | missing_deps = validated_schema[~validated_schema["exact_match"]] 148 | ``` 149 | 150 | ## Custom Prompts 151 | 152 | You can create custom prompts to guide the LLM extraction process: 153 | 154 | ```yaml 155 | # custom_prompt.yml 156 | system_prompt: | 157 | You are a SQL analyzer that extracts two key elements from SQL queries: 158 | 159 | 1. DEPENDENCIES: Tables and columns that must exist BEFORE query execution. 160 | 2. OUTPUTS: Tables and columns permanently CREATED or MODIFIED by the query. 161 | 162 | # Add detailed instructions for the LLM here... 163 | 164 | user_prompt: | 165 | Extract SQL dependencies (tables/columns needed BEFORE execution) and outputs 166 | (tables/columns CREATED or MODIFIED) from this query. 167 | 168 | Respond ONLY with JSON in this exact format: 169 | {{ 170 | "dependencies": {{"table_name": ["column1", "column2"]}}, 171 | "outputs": {{"table_name": ["column1", "column2"]}} 172 | }} 173 | 174 | SQL query to analyze: 175 | {sql} 176 | ``` 177 | 178 | Use the custom prompt with: 179 | 180 | ```python 181 | extractor = create_extractor(prompt_path="path/to/custom_prompt.yml") 182 | ``` 183 | 184 | ## Using Cache 185 | 186 | SQLDeps can cache extraction results to avoid reprocessing the same files: 187 | 188 | ```python 189 | # Enable cache (default: True) 190 | result = extractor.extract_from_folder( 191 | "path/to/sql_folder", 192 | recursive=True, 193 | use_cache=True 194 | ) 195 | 196 | # Clear cache after processing 197 | result = extractor.extract_from_folder( 198 | "path/to/sql_folder", 199 | recursive=True, 200 | use_cache=True, 201 | clear_cache=True 202 | ) 203 | ``` 204 | 205 | The cache is stored in the `.sqldeps_cache` directory. 206 | -------------------------------------------------------------------------------- /docs/user-guide/cli-usage.md: -------------------------------------------------------------------------------- 1 | # CLI Usage 2 | 3 | SQLDeps includes a powerful command-line interface for extracting SQL dependencies. 4 | 5 | ## Basic Usage 6 | 7 | The basic command syntax is: 8 | 9 | ```bash 10 | sqldeps extract PATH [OPTIONS] 11 | ``` 12 | 13 | Where `PATH` is the path to a SQL file or directory containing SQL files. 
14 | 15 | ## Common Options 16 | 17 | | Option | Description | 18 | |--------|-------------| 19 | | `--framework` | LLM framework to use (litellm, groq, openai, deepseek) | 20 | | `--model` | Model name within the selected framework | 21 | | `--prompt` | Path to custom prompt YAML file | 22 | | `-r, --recursive` | Recursively scan folder for SQL files | 23 | | `-o, --output` | Output file path (.json or .csv) | 24 | | `--n-workers` | Number of workers for parallel processing (-1 for all CPUs) | 25 | | `--rpm` | Maximum requests per minute for API rate limiting | 26 | | `--use-cache` | Use local cache for SQL extraction results | 27 | | `--clear-cache` | Clear local cache after processing | 28 | 29 | ## Basic Examples 30 | 31 | ```bash 32 | # Basic usage with default settings (litellm openai/gpt-4.1) 33 | sqldeps extract path/to/query.sql 34 | 35 | # Specify a different framework and model 36 | sqldeps extract path/to/query.sql --framework=openai --model=gpt-4.1-mini 37 | 38 | # Process all SQL files in a directory 39 | sqldeps extract path/to/sql_folder 40 | 41 | # Process recursively with a specific output file 42 | sqldeps extract path/to/sql_folder --recursive -o results.csv 43 | 44 | # Use a custom prompt 45 | sqldeps extract path/to/query.sql --prompt=path/to/custom_prompt.yml 46 | 47 | # Use parallel processing with rate limiting 48 | sqldeps extract path/to/sql_folder --recursive --n-workers=-1 --rpm=50 49 | ``` 50 | 51 | ## Database Validation 52 | 53 | SQLDeps can validate extracted dependencies against a real database schema: 54 | 55 | ```bash 56 | # Validate against a database 57 | sqldeps extract path/to/query.sql \ 58 | --db-match-schema \ 59 | --db-target-schemas public,sales \ 60 | --db-credentials path/to/database.yml 61 | ``` 62 | 63 | Database validation options: 64 | 65 | | Option | Description | 66 | |--------|-------------| 67 | | `--db-match-schema` | Enable database schema validation | 68 | | `--db-target-schemas` | Comma-separated list of target schemas | 69 | | `--db-credentials` | Path to database credentials YAML file | 70 | | `--db-dialect` | Database dialect (default: postgresql) | 71 | 72 | ## Output Formats 73 | 74 | SQLDeps supports both JSON and CSV output formats: 75 | 76 | ```bash 77 | # Output as JSON (default) 78 | sqldeps extract path/to/query.sql -o results.json 79 | 80 | # Output as CSV 81 | sqldeps extract path/to/query.sql -o results.csv 82 | ``` 83 | 84 | ## Managing Cache 85 | 86 | SQLDeps provides commands to manage the extraction cache: 87 | 88 | ```bash 89 | # Clear the cache 90 | sqldeps cache clear 91 | ``` 92 | 93 | ## Running the Web App 94 | 95 | SQLDeps includes a Streamlit-based web application: 96 | 97 | ```bash 98 | # Start the web app 99 | sqldeps app 100 | ``` 101 | 102 | ## Advanced Examples 103 | 104 | ```bash 105 | # Complete example with all options 106 | sqldeps extract data/sql_folder \ 107 | --recursive \ 108 | --framework=deepseek \ 109 | --model=deepseek-chat \ 110 | --prompt=configs/prompts/custom.yml \ 111 | --db-match-schema \ 112 | --db-target-schemas public,sales,reporting \ 113 | --db-credentials configs/database.yml \ 114 | --n-workers=10 \ 115 | --rpm=100 \ 116 | --use-cache \ 117 | -o folder_deps.csv 118 | ``` 119 | 120 | ## Help Command 121 | 122 | For a complete list of options, use the help command: 123 | 124 | ```bash 125 | sqldeps --help 126 | 127 | # View help for a specific command 128 | sqldeps extract --help 129 | ``` 130 | 131 | ## Exit Codes 132 | 133 | The CLI will return the following exit codes: 134 | 
135 | - `0`: Success 136 | - `1`: Error (file not found, connection error, extraction failed, etc.) 137 | 138 | ## Integration with Shell Scripts 139 | 140 | SQLDeps can be easily integrated into shell scripts: 141 | 142 | ```bash 143 | #!/bin/bash 144 | 145 | # Process all SQL files in a directory 146 | sqldeps extract sql_files/ --recursive -o results.json 147 | 148 | # Check exit code 149 | if [ $? -eq 0 ]; then 150 | echo "Dependencies extracted successfully." 151 | else 152 | echo "Failed to extract dependencies." 153 | exit 1 154 | fi 155 | 156 | # Process results 157 | cat results.json | jq '.dependencies' 158 | ``` -------------------------------------------------------------------------------- /docs/user-guide/database-integration.md: -------------------------------------------------------------------------------- 1 | # Database Integration 2 | 3 | SQLDeps provides robust database integration for matching extracted dependencies against actual database schemas. 4 | 5 | ## Supported Databases 6 | 7 | Currently, SQLDeps supports: 8 | 9 | - PostgreSQL (primary support) 10 | 11 | ## Database Connection 12 | 13 | ### Using the PostgreSQLConnector 14 | 15 | The `PostgreSQLConnector` class provides a secure way to connect to PostgreSQL databases: 16 | 17 | ```python 18 | from sqldeps.database import PostgreSQLConnector 19 | 20 | # Create a connection using direct parameters 21 | conn = PostgreSQLConnector( 22 | host="localhost", 23 | port=5432, 24 | database="mydatabase", 25 | username="username", 26 | password="password" # Optional, can use .pgpass 27 | ) 28 | 29 | # Alternative: load from YAML config file 30 | conn = PostgreSQLConnector( 31 | config_path="path/to/database.yml" 32 | ) 33 | 34 | # Alternative: use environment variables 35 | # DB_HOST, DB_PORT, DB_NAME, DB_USER, DB_PASSWORD 36 | conn = PostgreSQLConnector() 37 | ``` 38 | 39 | ### Connection Priority 40 | 41 | The connector uses the following priority for connection parameters: 42 | 43 | 1. Direct parameters in constructor 44 | 2. YAML config file 45 | 3. Environment variables 46 | 4. 
.pgpass file (for password only) 47 | 48 | ### Database Configuration YAML 49 | 50 | ```yaml 51 | # database.yml 52 | database: 53 | host: localhost 54 | port: 5432 55 | database: mydatabase 56 | username: username 57 | password: password # Optional 58 | ``` 59 | 60 | ### Environment Variables 61 | 62 | ``` 63 | DB_HOST=localhost 64 | DB_PORT=5432 65 | DB_NAME=mydatabase 66 | DB_USER=username 67 | DB_PASSWORD=password 68 | ``` 69 | 70 | ### PostgreSQL Password File 71 | 72 | SQLDeps supports standard PostgreSQL password file (`~/.pgpass`) format: 73 | 74 | ``` 75 | hostname:port:database:username:password 76 | ``` 77 | 78 | ## Schema Retrieval 79 | 80 | You can directly access database schema information: 81 | 82 | ```python 83 | # Get all schemas 84 | db_schema = conn.get_schema() 85 | 86 | # Get specific schemas 87 | db_schema = conn.get_schema(schemas=["public", "sales"]) 88 | 89 | # Export schema to CSV 90 | conn.export_schema_csv("schema.csv") 91 | ``` 92 | 93 | ## Schema Matching 94 | 95 | ### Using the API 96 | 97 | ```python 98 | from sqldeps.llm_parsers import create_extractor 99 | from sqldeps.database import PostgreSQLConnector 100 | 101 | # Create extractor and extract dependencies 102 | extractor = create_extractor() 103 | dependencies = extractor.extract_from_file("query.sql") 104 | 105 | # Connect to database 106 | conn = PostgreSQLConnector( 107 | host="localhost", 108 | port=5432, 109 | database="mydatabase", 110 | username="username" 111 | ) 112 | 113 | # Match extracted dependencies against database schema 114 | matching_results = extractor.match_database_schema( 115 | dependencies, 116 | db_connection=conn, 117 | target_schemas=["public", "sales"] 118 | ) 119 | 120 | # Analyze database-matching results 121 | exact_matches = matching_results[matching_results["exact_match"]] 122 | agnostic_matches = matching_results[~matching_results["exact_match"]] 123 | 124 | print(f"Found {len(exact_matches)} exact matches.") 125 | print(f"Found {len(agnostic_matches)} schema-agnostic matches.") 126 | ``` 127 | 128 | ### Using the CLI 129 | 130 | ```bash 131 | sqldeps extract path/to/query.sql \ 132 | --db-match-schema \ 133 | --db-target-schemas public,sales \ 134 | --db-credentials configs/database.yml \ 135 | -o db_matching_results.csv 136 | ``` 137 | 138 | ## Matching Results 139 | 140 | The matching results are returned as a pandas DataFrame with these columns: 141 | 142 | | Column | Description | 143 | |--------|-------------| 144 | | `schema` | Database schema name | 145 | | `table` | Table name | 146 | | `column` | Column name | 147 | | `data_type` | Database data type | 148 | | `exact_match` | Boolean indicating if schema name matched exactly | 149 | 150 | ### Interpreting Results 151 | 152 | - `exact_match=True`: The table/column was found in the specified schema 153 | - `exact_match=False`: The table/column does not have a specified schema 154 | - Missing entries: Dependencies that weren't found in the database 155 | 156 | ## Using Schema Information in Applications 157 | 158 | The schema matching results can be used to: 159 | 160 | 1. Identify missing dependencies before executing SQL 161 | 2. Generate data type-aware documentation 162 | 3. Create migration scripts 163 | 4. Highlight potential issues in SQL queries 164 | 5. 
Ensure referential integrity across schemas 165 | 166 | ## Security Considerations 167 | 168 | SQLDeps follows security best practices for database connections: 169 | 170 | - No hardcoded credentials in code 171 | - Support for PostgreSQL password file 172 | - Environment variable support 173 | - Secure parameter handling (parameters are cleared after use) 174 | - Connection timeouts to prevent hanging 175 | -------------------------------------------------------------------------------- /docs/user-guide/visualization.md: -------------------------------------------------------------------------------- 1 | # Interactive Visualization of SQL Dependencies 2 | 3 | SQLDeps provides powerful visualization capabilities to help you understand and explore SQL dependencies across your projects. 4 | 5 | ## Interactive Dependency Graphs 6 | 7 | The `visualize_sql_dependencies()` function creates an interactive network graph that shows the relationships between SQL files, tables, and their dependencies. 8 | 9 | ### Basic Usage 10 | 11 | ```python 12 | from sqldeps.llm_parsers import create_extractor 13 | from sqldeps.visualization import visualize_sql_dependencies 14 | 15 | # Create an interactive network graph from multiple SQL files 16 | extractor = create_extractor() 17 | sql_profiles = extractor.extract_from_folder("path/to/folder", recursive=False) 18 | 19 | # Generate an interactive visualization (saving output to an HTML file) 20 | figure = visualize_sql_dependencies(sql_profiles, output_path="dependencies.html") 21 | 22 | # Show figure on IDE 23 | figure.show() 24 | ``` 25 | 26 | ### Visualization Options 27 | 28 | The `visualize_sql_dependencies()` function offers extensive customization: 29 | 30 | ```python 31 | # See API documentation for more options 32 | # https://sqldeps.readthedocs.io/en/latest/api-reference/visualization 33 | 34 | figure = visualize_sql_dependencies( 35 | dependencies, 36 | output_path="dependencies.html", # Optional: Save to HTML file 37 | show_columns=True, # Show column details in hover text 38 | layout_algorithm="spring", # Layout options: 'spring', 'circular', 'kamada_kawai' 39 | highlight_common_tables=True, # Highlight tables used by multiple files 40 | show_file_text=True, # Show file names 41 | show_table_text=False, # Show table names 42 | color_gradient=True, # Color intensity based on usage frequency 43 | min_file_size=20, # Minimum node size for files 44 | max_file_size=40, # Maximum node size for files 45 | show_text_buttons=True, # Add buttons to toggle text visibility 46 | show_layout_buttons=True # Add buttons to change graph layout 47 | ) 48 | ``` 49 | 50 | ### Visualization Features 51 | 52 | - **Interactive Exploration**: Hover over nodes to see detailed information 53 | - **Dynamic Layout**: Change graph layout with built-in buttons 54 | - **Text Toggle**: Show/hide labels for files and tables 55 | - **Usage Visualization**: 56 | - Node sizes indicate usage frequency 57 | - Color intensity represents how many files use a particular table 58 | - **Dependency Insights**: 59 | - Visualize connections between SQL files and tables 60 | - Identify common tables across multiple files 61 | 62 | ### Example Use Cases 63 | 64 | 1. **Project Dependency Mapping** 65 | ```python 66 | # Map dependencies across an entire project 67 | project_deps = extractor.extract_from_folder( 68 | "path/to/project/sql", 69 | recursive=True 70 | ) 71 | # Plot dependencies 72 | visualize_sql_dependencies(project_deps, output_path="project_deps.html") 73 | ``` 74 | 75 | 2. 
**Focused Analysis** 76 | ```python 77 | # Analyze dependencies for a specific subset of files 78 | subset_deps = extractor.extract_from_folder( 79 | "path/to/specific/sql/folder", 80 | recursive=False 81 | ) 82 | # Plot dependencies 83 | visualize_sql_dependencies(subset_deps, output_path="subset_deps.html") 84 | ``` 85 | 86 | ## Use cases 87 | 88 | You can use the visualization to identify: 89 | 90 | - Shared tables across different files 91 | - Potential refactoring opportunities 92 | - Complex dependency relationships 93 | 94 | > Note: The visualization is best suited for projects with a moderate number of SQL files 95 | -------------------------------------------------------------------------------- /docs/user-guide/web-app.md: -------------------------------------------------------------------------------- 1 | # Web Application 2 | 3 | SQLDeps includes a Streamlit-based web interface for interactive SQL dependency exploration. 4 | 5 | ## Installation 6 | 7 | To use the web application, install SQLDeps with the app dependencies: 8 | 9 | ```bash 10 | pip install "sqldeps[app]" 11 | ``` 12 | 13 | ## Starting the App 14 | 15 | Start the web application with: 16 | 17 | ```bash 18 | # Using the CLI command 19 | sqldeps app 20 | 21 | # Or directly with streamlit 22 | streamlit run -m sqldeps.app.main 23 | ``` 24 | 25 | This will launch the Streamlit app in your default web browser, typically at `http://localhost:8501`. 26 | 27 | ## Using the Web Interface 28 | 29 | The web interface provides an intuitive way to analyze SQL dependencies: 30 | 31 | ### Configuration Panel 32 | 33 | On the left sidebar, you'll find configuration options: 34 | 35 | 1. **Framework Selection**: Choose between Groq, OpenAI, or DeepSeek 36 | 2. **Model Selection**: Select the specific model to use 37 | 3. **Custom Prompt**: Optionally upload a custom prompt YAML file 38 | 4. **Database Connection**: Configure database connection for schema validation 39 | 5. **SQL Input**: Upload a SQL file or enter SQL directly 40 | 41 | ### Analysis Results 42 | 43 | After clicking "Extract Dependencies", the main panel displays: 44 | 45 | 1. **SQL Query**: The formatted SQL query that was analyzed 46 | 2. **Extracted Dependencies**: 47 | - Tables listed in a clear format 48 | - Columns organized by table 49 | - Database schema validation results (if enabled) 50 | - DataFrame representation 51 | - Raw JSON output 52 | 53 | ### Download Options 54 | 55 | The app provides options to download the results as: 56 | 57 | - CSV file 58 | - JSON file 59 | - Data types for dependencies matching database (when enabled) 60 | 61 | ## Database Matching 62 | 63 | To enable database schema matching: 64 | 65 | 1. Check the "Enable Database Schema Validation" option 66 | 2. Enter database connection details: 67 | - Host 68 | - Port 69 | - Database name 70 | - Username 71 | - Target schemas (comma-separated) 72 | 73 | When database matching is enabled, the app will: 74 | 75 | 1. Connect to the specified database 76 | 2. Retrieve schema information for the target schemas 77 | 3. Match extracted dependencies against the actual schema 78 | 4. Display dependency data types showing exact matches and schema-agnostic dependencies 79 | 80 | ## Example Workflow 81 | 82 | 1. Select your preferred framework and model 83 | 2. Either upload a SQL file or enter a SQL query 84 | 3. Optionally configure database schema validation 85 | 4. Click "Extract Dependencies" to analyze 86 | 5. Explore the results in the main panel 87 | 6. 
Download the results in your preferred format 88 | 89 | ## Notes 90 | 91 | The web application is designed for demonstration and exploration of single SQL files. For processing multiple files or entire folders, use the CLI or API interfaces. 92 | -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: SQLDeps 2 | site_description: SQL Dependency Extractor using Large Language Models 3 | site_author: Cainã Max Couto da Silva 4 | repo_url: https://github.com/glue-lab/sqldeps 5 | repo_name: glue-lab/sqldeps 6 | 7 | extra_css: 8 | - stylesheets/custom.css 9 | 10 | theme: 11 | name: material 12 | favicon: assets/images/sqldeps_logo.png 13 | palette: 14 | primary: custom 15 | accent: custom 16 | features: 17 | - navigation.tabs 18 | - navigation.sections 19 | - navigation.top 20 | - search.suggest 21 | - search.highlight 22 | 23 | icon: 24 | repo: fontawesome/brands/github 25 | icons: 26 | - material/ 27 | - fontawesome/brands/ 28 | 29 | plugins: 30 | - search 31 | - mkdocstrings: 32 | handlers: 33 | python: 34 | options: 35 | show_root_heading: true 36 | show_root_full_name: true 37 | show_source: true 38 | show_if_no_docstring: false 39 | docstring_style: google 40 | 41 | markdown_extensions: 42 | - pymdownx.highlight: 43 | anchor_linenums: true 44 | - pymdownx.superfences 45 | - pymdownx.inlinehilite 46 | - admonition 47 | - pymdownx.details 48 | - pymdownx.snippets 49 | - pymdownx.tabbed: 50 | alternate_style: true 51 | - tables 52 | - footnotes 53 | - pymdownx.emoji: 54 | emoji_index: !!python/name:material.extensions.emoji.twemoji 55 | emoji_generator: !!python/name:material.extensions.emoji.to_svg 56 | 57 | nav: 58 | - Home: index.md 59 | - Getting Started: 60 | - Installation: getting-started/installation.md 61 | - Quick Start: getting-started/quick-start.md 62 | - User Guide: 63 | - API Usage: user-guide/api-usage.md 64 | - CLI Usage: user-guide/cli-usage.md 65 | - Web Application: user-guide/web-app.md 66 | - Database Integration: user-guide/database-integration.md 67 | - Interactive Graph Visualization: user-guide/visualization.md 68 | - API Reference: 69 | - Core features: 70 | - Models: api-reference/models.md 71 | - LLM Parsers: api-reference/llm-parsers.md 72 | - Database: api-reference/database.md 73 | - Visualization: api-reference/visualization.md 74 | - Advanced features: 75 | - Config: api-reference/config.md 76 | - Utils: api-reference/utils.md 77 | - Cache: api-reference/cache.md 78 | - Rate Limiter: api-reference/rate-limiter.md 79 | - Parallelization: api-reference/parallel.md 80 | # - Interfaces: # No need to document these interfaces 81 | # - CLI: api-reference/cli.md 82 | # - Web Application: api-reference/app.md 83 | 84 | # - Examples: examples.md 85 | - Development: 86 | - Contributing: contributing.md 87 | - ChangeLog: changelog.md 88 | - Team: 89 | - Authors: authors.md 90 | # - Contributors: team/contributors.md # ToDo: add script to show contributors 91 | -------------------------------------------------------------------------------- /notebooks/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/glue-lab/sqldeps/66ae7e1313f767417f2dc83e7d766ff478168082/notebooks/.gitkeep -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | 
[build-system] 2 | requires = ["hatchling"] 3 | build-backend = "hatchling.build" 4 | 5 | [project] 6 | name = "sqldeps" 7 | version = "0.1.1" 8 | description = "SQL Dependency Extractor" 9 | requires-python = ">=3.10" 10 | classifiers = [ 11 | "Programming Language :: Python :: 3", 12 | "Programming Language :: Python :: 3.10", 13 | "Programming Language :: Python :: 3.11", 14 | "Programming Language :: Python :: 3.12", 15 | "Programming Language :: Python :: 3.13", 16 | "Operating System :: OS Independent", 17 | ] 18 | authors = [ 19 | {name = "Cainã Silva", email = "coutodasilva@wisc.edu"}, 20 | {name = "Matt Christie", email = "mjchristie@wisc.edu"} 21 | ] 22 | readme = "README.md" 23 | keywords = ["sql", "dependency", "parser", "extractor", "llm"] 24 | dependencies = [ 25 | "groq>=0.17.0", 26 | "litellm>=1.67.6", 27 | "loguru>=0.7.3", 28 | "openai>=1.59.5", 29 | "pandas>=2.2.3", 30 | "python-dotenv>=1.0.1", 31 | "pyyaml>=6.0.2", 32 | "sqlalchemy>=2.0.37", 33 | "sqlparse>=0.5.3", 34 | "tenacity>=9.0.0", 35 | "typer>=0.15.1", 36 | ] 37 | 38 | [project.optional-dependencies] 39 | app = [ 40 | "streamlit>=1.42.1", 41 | ] 42 | postgres = [ 43 | "psycopg2>=2.9.10", 44 | ] 45 | dataviz = [ 46 | "nbformat>=5.10.4", 47 | "networkx>=3.4.2", 48 | "plotly>=6.0.0", 49 | "scipy>=1.15.2", 50 | ] 51 | 52 | [project.urls] 53 | Repository = "https://github.com/glue-lab/sqldeps" 54 | Documentation = "https://sqldeps.readthedocs.io" 55 | Questions = "https://github.com/glue-lab/sqldeps/discussions/categories/questions" 56 | Issues = "https://github.com/glue-lab/sqldeps/issues" 57 | 58 | [project.scripts] 59 | sqldeps = "sqldeps.cli:app" 60 | 61 | [tool.pytest.ini_options] 62 | pythonpath = "." 63 | testpaths = ["tests"] 64 | markers = [ 65 | "llm: marks tests that require LLM API calls (skipped by default)", 66 | "slow: marks tests that are slow to execute", 67 | ] 68 | addopts = "-m 'not llm and not integration and not slow' --ignore=sqldeps/app" 69 | 70 | [tool.hatch.metadata] 71 | allow-direct-references = false 72 | 73 | [tool.hatch.build] 74 | packages = ["sqldeps"] 75 | 76 | [dependency-groups] 77 | dev = [ 78 | "pytest-cov>=6.0.0", 79 | "pytest>=8.3.4", 80 | "ruff>=0.9.7", 81 | "pre-commit>=4.2.0", 82 | ] 83 | docs = [ 84 | "mkdocs>=1.6.1", 85 | "mkdocs-material>=9.6.9", 86 | "mkdocstrings[python]>=0.29.1", 87 | "pymdown-extensions>=10.14.3", 88 | ] 89 | analysis = [ 90 | "ipykernel>=6.29.5", 91 | "seaborn>=0.13.2", 92 | "tabulate>=0.9.0", 93 | ] 94 | 95 | [tool.uv] 96 | default-groups = ["dev", "docs", "analysis"] 97 | 98 | [tool.ruff] 99 | line-length = 88 # Like Black, use 88 characters per line. 100 | indent-width = 4 # Like Black, use 4 spaces per indentation level. 101 | exclude = ["*.ipynb"] # Exclude Jupyter notebooks from linting. 102 | 103 | [tool.ruff.lint] 104 | select = [ 105 | "F", # Pyflakes 106 | "E", "W", # pycodestyle 107 | "I", # isort 108 | "N", # pep8-naming 109 | "Q", # flake8-quotes 110 | "UP", # pyupgrade 111 | "D", # pydocstyle 112 | "RUF", # Ruff-specific rules 113 | "B", # flake8-bugbear 114 | "T20", # flake8-print 115 | "C90", # mccabe (complex structures) 116 | "SIM", # flake8-simplify 117 | "ANN", # flake8-annotations 118 | "TID", # flake8-tidy-imports 119 | ] 120 | ignore = [] # ignore specific rules here 121 | 122 | # Allow fix for all enabled rules (when `--fix`) is provided. 
123 | fixable = ["ALL"] 124 | unfixable = [] 125 | 126 | [tool.ruff.lint.pydocstyle] 127 | convention = "google" 128 | 129 | [tool.ruff.lint.isort] 130 | combine-as-imports = true 131 | force-single-line = false 132 | 133 | [tool.ruff.format] 134 | # Like Black, use double quotes for strings. 135 | quote-style = "double" 136 | 137 | # Like Black, indent with spaces, rather than tabs. 138 | indent-style = "space" 139 | 140 | # Like Black, respect magic trailing commas. 141 | skip-magic-trailing-comma = false 142 | 143 | # Like Black, automatically detect the appropriate line ending. 144 | line-ending = "auto" 145 | -------------------------------------------------------------------------------- /scripts/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/glue-lab/sqldeps/66ae7e1313f767417f2dc83e7d766ff478168082/scripts/.gitkeep -------------------------------------------------------------------------------- /sqldeps/__init__.py: -------------------------------------------------------------------------------- 1 | """SQLDeps: SQL Dependency Extractor using Large Language Models. 2 | 3 | SQLDeps provides tools to automatically extract and map table and column dependencies 4 | from SQL scripts using LLMs. It identifies both dependencies (tables/columns needed 5 | before execution) and outputs (tables/columns created or modified by the query). 6 | """ 7 | 8 | from importlib.metadata import version 9 | 10 | __version__ = version("sqldeps") 11 | -------------------------------------------------------------------------------- /sqldeps/app/__init__.py: -------------------------------------------------------------------------------- 1 | """SQLDeps web application. 2 | 3 | This package provides a Streamlit-based web interface for the SQLDeps tool, 4 | allowing users to interactively extract and visualize SQL dependencies. 5 | """ 6 | -------------------------------------------------------------------------------- /sqldeps/app/assets/images/sqldeps_gray.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/glue-lab/sqldeps/66ae7e1313f767417f2dc83e7d766ff478168082/sqldeps/app/assets/images/sqldeps_gray.png -------------------------------------------------------------------------------- /sqldeps/app/assets/images/sqldeps_white.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/glue-lab/sqldeps/66ae7e1313f767417f2dc83e7d766ff478168082/sqldeps/app/assets/images/sqldeps_white.png -------------------------------------------------------------------------------- /sqldeps/cache.py: -------------------------------------------------------------------------------- 1 | """Caching utilities for SQL dependency extraction. 2 | 3 | This module provides functions for caching extraction results to avoid 4 | repeatedly processing the same SQL files, which can save API calls, cost, and time. 5 | """ 6 | 7 | import hashlib 8 | import json 9 | from pathlib import Path 10 | 11 | from loguru import logger 12 | 13 | from sqldeps.models import SQLProfile 14 | 15 | CACHE_DIR = ".sqldeps_cache" 16 | 17 | 18 | def get_cache_path(file_path: str | Path, cache_dir: str | Path = CACHE_DIR) -> Path: 19 | """Generate a consistent cache file path based on SQL file content. 20 | 21 | Creates a unique cache filename by hashing the SQL file's content. 22 | Includes the original filename in the cache name for easier debugging.
23 | 24 | Args: 25 | file_path: Path to the SQL file to be processed 26 | cache_dir: Directory where cache files will be stored. 27 | Defaults to ".sqldeps_cache" 28 | 29 | Returns: 30 | Path object pointing to the cache file location 31 | 32 | Raises: 33 | FileNotFoundError: If the SQL file doesn't exist 34 | PermissionError: If the SQL file can't be read 35 | """ 36 | file_path = Path(file_path).resolve() 37 | 38 | # Read file content and create hash 39 | with open(file_path, "rb") as f: 40 | content = f.read() 41 | 42 | # Hash the content 43 | content_hash = hashlib.md5(content).hexdigest()[:16] 44 | 45 | # Use a combination of filename and content hash for better readability/debugging 46 | cache_name = f"{file_path.stem}_{content_hash}" 47 | 48 | # Ensure a valid filename 49 | cache_name = "".join(c if c.isalnum() or c in "_-." else "_" for c in cache_name) 50 | 51 | return Path(cache_dir) / f"{cache_name}.json" 52 | 53 | 54 | def save_to_cache( 55 | result: SQLProfile, file_path: Path, cache_dir: Path = Path(CACHE_DIR) 56 | ) -> bool: 57 | """Save extraction result to cache. 58 | 59 | Args: 60 | result: The SQLProfile to save 61 | file_path: The original SQL file path 62 | cache_dir: The cache directory 63 | 64 | Returns: 65 | True if saved successfully, False otherwise 66 | """ 67 | cache_dir.mkdir(parents=True, exist_ok=True) 68 | cache_file = get_cache_path(file_path, cache_dir) 69 | 70 | try: 71 | with open(cache_file, "w") as f: 72 | json.dump(result.to_dict(), f) 73 | return True 74 | except Exception as e: 75 | logger.warning(f"Failed to save cache for {file_path}: {e}") 76 | return False 77 | 78 | 79 | def load_from_cache( 80 | file_path: Path, cache_dir: Path = Path(CACHE_DIR) 81 | ) -> SQLProfile | None: 82 | """Load extraction result from cache. 83 | 84 | Args: 85 | file_path: The original SQL file path 86 | cache_dir: The cache directory 87 | 88 | Returns: 89 | SQLProfile if loaded successfully, None otherwise 90 | """ 91 | cache_file = get_cache_path(file_path, cache_dir) 92 | 93 | if not cache_file.exists(): 94 | return None 95 | 96 | try: 97 | with open(cache_file) as f: 98 | cached_data = json.load(f) 99 | logger.info(f"Loading from cache: {file_path}") 100 | return SQLProfile(**cached_data) 101 | except Exception as e: 102 | logger.warning(f"Failed to load cache for {file_path}: {e}") 103 | return None 104 | 105 | 106 | def cleanup_cache(cache_dir: Path = Path(CACHE_DIR)) -> bool: 107 | """Clean up cache directory. 108 | 109 | Args: 110 | cache_dir: The cache directory to clean up 111 | 112 | Returns: 113 | True if cleaned up successfully, False otherwise 114 | """ 115 | if not cache_dir.exists(): 116 | return True 117 | 118 | try: 119 | # Remove all JSON files 120 | for cache_file in cache_dir.glob("*.json"): 121 | cache_file.unlink() 122 | 123 | # Try to remove directory if empty 124 | if not any(cache_dir.iterdir()): 125 | cache_dir.rmdir() 126 | logger.info(f"Removed cache directory: {cache_dir}") 127 | else: 128 | logger.info( 129 | "Cache directory cleaned but not removed (contains other files)" 130 | ) 131 | return True 132 | except Exception as e: 133 | logger.warning(f"Failed to clean up cache: {e}") 134 | return False 135 | -------------------------------------------------------------------------------- /sqldeps/config.py: -------------------------------------------------------------------------------- 1 | """Configuration utilities for SQLDeps. 2 | 3 | This module provides functions for loading configuration from YAML files. 
4 | """ 5 | 6 | import yaml 7 | 8 | 9 | def load_config(config_path: str) -> dict: 10 | """Load configuration from a YAML file. 11 | 12 | Args: 13 | config_path: Path to the YAML configuration file 14 | 15 | Returns: 16 | dict: Parsed configuration dictionary 17 | """ 18 | with open(config_path) as config_file: 19 | config = yaml.safe_load(config_file) 20 | return config 21 | -------------------------------------------------------------------------------- /sqldeps/configs/prompts/default_v0.1.0.yml: -------------------------------------------------------------------------------- 1 | system_prompt: | 2 | You are a SQL analyzer that extracts two key elements from SQL queries: 3 | 4 | 1. DEPENDENCIES: Tables and columns that must exist BEFORE query execution. 5 | - Source tables in `FROM`, `JOIN`, CTEs, subqueries, etc. 6 | - ALL target tables in operations like `INSERT`, `UPDATE`, `DELETE`, `TRUNCATE` must be included in dependencies. 7 | - Referenced columns in `SELECT`, `WHERE`, `CASE`, `JOIN`, `GROUP BY`, `HAVING`, `ORDER BY`, etc. 8 | - Columns used in expressions, `CASE` statements, and aggregate functions. 9 | 10 | 2. OUTPUTS: Tables and columns permanently CREATED or MODIFIED by the query. 11 | - Tables modified with `INSERT`, `UPDATE`, `DELETE`, `TRUNCATE`. 12 | - Target columns in these operations. 13 | - Tables created with `CREATE TABLE`. 14 | 15 | KEY RULES (CRITICAL): 16 | - ALL target tables (INSERT, UPDATE, DELETE, TRUNCATE) MUST appear in BOTH dependencies AND outputs. 17 | - Example: For `INSERT INTO table_x (col1, col2) VALUES (1, 2)` 18 | → Dependencies: `{"table_x": ["col1", "col2"]}` 19 | → Outputs: `{"table_x": ["col1", "col2"]}` 20 | - Example: For `TRUNCATE TABLE table_x` 21 | → Dependencies: `{"table_x": []}` 22 | → Outputs: `{"table_x": []}` 23 | - This applies even inside CTEs, functions or stored procedures. 24 | - EXCEPTIONS: 25 | - If a table is CREATED in the same query (CREATE TABLE), it appears ONLY in outputs. 26 | - If a table is used in INSERT statement after TRUNCATE, include the specified columns for both dependencies and outputs instead of an empty list. 27 | - ALWAYS include schema-qualified tables (e.g., `schema.table`) in both dependencies and outputs, preserving the schema name. 28 | 29 | COLUMN HANDLING: 30 | - Explicit `SELECT *` should return ["*"] in dependencies 31 | - Functions like COUNT(*) with specific names, do NOT use ["*"], only include explicitly named columns 32 | - Example: For `SELECT COUNT(*), name FROM users`, dependencies would include `{"users": ["name"]}` 33 | - INSERT without column list creates dependencies on all columns in the target table: ["*"] 34 | 35 | CTE HANDLING: 36 | - CTEs (WITH queries) are temporary structures and should NOT be included as dependencies or outputs themselves. 37 | - However, include tables and columns used within CTEs that originated outside, since they are dependencies. 38 | - Example: 39 | ```sql 40 | WITH cte AS (SELECT * FROM table_x) 41 | INSERT INTO table_y SELECT * FROM cte 42 | ``` 43 | → Dependencies: `{"table_x": ["*"], "table_y": ["*"]}` 44 | → Outputs: `{"table_y": ["*"]}` 45 | 46 | SCHEMA-QUALIFIED TABLES: 47 | - Always preserve schema names exactly as they appear in the query. 
48 | - Example: For `INSERT INTO schema_a.table_x SELECT col1,col2 FROM schema_b.table_y` 49 | → Dependencies: `{"schema_a.table_x": ["*"], "schema_b.table_y": ["col1","col2"]}` 50 | → Outputs: `{"schema_a.table_x": ["*"]}` 51 | 52 | FUNCTION & PROCEDURE HANDLING: 53 | - Even inside functions or stored procedures, any `INSERT`, `UPDATE`, `DELETE`, or `TRUNCATE` statements affect real tables and must be included as dependencies and outputs. 54 | 55 | ADDITIONAL CONSIDERATIONS: 56 | - Resolve table aliases to real table names. 57 | - `CASE` expressions → dependencies on all examined columns. 58 | - `MERGE`/`UPSERT` → both dependencies and outputs. 59 | - Ignore variables and parameters as dependencies. 60 | 61 | OUTPUT JSON FORMAT: 62 | { 63 | "dependencies": {"table_name": ["column1", "column2"]}, 64 | "outputs": {"table_name": ["column1", "column2"]} 65 | } 66 | 67 | user_prompt: | 68 | Extract SQL dependencies (tables/columns needed BEFORE execution) and outputs (tables/columns CREATED or MODIFIED) from this query. 69 | 70 | Respond ONLY with JSON in this exact format: 71 | {{ 72 | "dependencies": {{"table_name": ["column1", "column2"]}}, 73 | "outputs": {{"table_name": ["column1", "column2"]}} 74 | }} 75 | 76 | SQL query to analyze: 77 | {sql} 78 | -------------------------------------------------------------------------------- /sqldeps/configs/prompts/simplified.yml: -------------------------------------------------------------------------------- 1 | system_prompt: | 2 | You are a SQL analyzer that extracts two key elements from SQL queries: 3 | 4 | 1. DEPENDENCIES: All tables and columns that must exist BEFORE query execution so that the query can run without error. 5 | 2. OUTPUTS: All tables and columns permanently CREATED or MODIFIED by the query. 6 | 7 | OUTPUT JSON FORMAT: 8 | { 9 | "dependencies": {"table_name": ["column1", "column2"]}, 10 | "outputs": {"table_name": ["column1", "column2"]} 11 | } 12 | 13 | user_prompt: | 14 | Extract SQL dependencies and outputs for this query: 15 | {sql} 16 | -------------------------------------------------------------------------------- /sqldeps/database/__init__.py: -------------------------------------------------------------------------------- 1 | """Database connectors for SQLDeps. 2 | 3 | This package provides database connectors for schema retrieval and validation. 4 | """ 5 | 6 | from .postgresql import PostgreSQLConnector 7 | 8 | __all__ = ["PostgreSQLConnector"] 9 | -------------------------------------------------------------------------------- /sqldeps/database/base.py: -------------------------------------------------------------------------------- 1 | """Base class for database connections. 2 | 3 | This module defines the abstract base class for SQL database connections 4 | and schema inspection. 5 | """ 6 | 7 | from abc import ABC, abstractmethod 8 | from pathlib import Path 9 | from typing import Any 10 | 11 | import pandas as pd 12 | from dotenv import load_dotenv 13 | from sqlalchemy.engine.base import Engine 14 | 15 | load_dotenv() 16 | 17 | 18 | class SQLBaseConnector(ABC): 19 | """Abstract base class for SQL database connections and schema inspection. 
20 | 21 | Provides interface for: 22 | - Database connection with multiple configuration sources 23 | - Schema inspection and export 24 | - Engine-specific connection handling 25 | """ 26 | 27 | @abstractmethod 28 | def __init__( 29 | self, 30 | host: str | None = None, 31 | port: int | None = None, 32 | database: str | None = None, 33 | username: str | None = None, 34 | password: str | None = None, 35 | config_path: Path | None = None, 36 | ) -> None: 37 | """Initialize database connection. 38 | 39 | Args: 40 | host: Database host address 41 | port: Database port 42 | database: Database name 43 | username: Database username 44 | password: Database password 45 | config_path: Path to configuration file 46 | """ 47 | pass 48 | 49 | @abstractmethod 50 | def _create_engine(self, params: dict[str, Any]) -> Engine: 51 | """Create database engine with given parameters. 52 | 53 | Args: 54 | params: Dictionary of connection parameters 55 | 56 | Returns: 57 | Database engine 58 | """ 59 | pass 60 | 61 | @abstractmethod 62 | def _load_config(self, config_path: Path | None) -> dict[str, Any]: 63 | """Load configuration from file. 64 | 65 | Args: 66 | config_path: Path to configuration file 67 | 68 | Returns: 69 | Dictionary with configuration parameters 70 | """ 71 | pass 72 | 73 | @abstractmethod 74 | def _get_env_vars(self) -> dict[str, Any]: 75 | """Get environment variables for connection. 76 | 77 | Returns: 78 | Dictionary with environment variables 79 | """ 80 | pass 81 | 82 | @abstractmethod 83 | def _resolve_params( 84 | self, 85 | host: str | None, 86 | port: int | None, 87 | database: str | None, 88 | username: str | None, 89 | password: str | None, 90 | config_path: Path | None, 91 | **kwargs: dict[str, Any], 92 | ) -> dict[str, Any]: 93 | """Resolve connection parameters from all sources. 94 | 95 | Args: 96 | host: Database host address 97 | port: Database port 98 | database: Database name 99 | username: Database username 100 | password: Database password 101 | config_path: Path to configuration file 102 | **kwargs: Additional parameters 103 | 104 | Returns: 105 | Dictionary with resolved connection parameters 106 | """ 107 | pass 108 | 109 | @abstractmethod 110 | def get_schema(self, schemas: str | list[str] | None = None) -> pd.DataFrame: 111 | """Get database schema information. 112 | 113 | Args: 114 | schemas: Optional schema name or list of schema names to filter results 115 | 116 | Returns: 117 | DataFrame with schema information 118 | """ 119 | pass 120 | 121 | def export_schema_csv( 122 | self, 123 | path: str, 124 | schemas: str | list[str] | None = None, 125 | ) -> None: 126 | """Export schema to CSV file. 127 | 128 | Args: 129 | path: Path to output CSV file 130 | schemas: Optional schema name or list of schema names to filter results 131 | 132 | Returns: 133 | None 134 | """ 135 | df = self.get_schema(schemas) 136 | df.to_csv(path, index=False) 137 | -------------------------------------------------------------------------------- /sqldeps/llm_parsers/__init__.py: -------------------------------------------------------------------------------- 1 | """LLM-based SQL parsers for dependency extraction. 2 | 3 | This package provides integrations with various LLM providers for extracting 4 | SQL dependencies, with a common interface and factory function. 
5 | """ 6 | 7 | from pathlib import Path 8 | 9 | from dotenv import load_dotenv 10 | 11 | from .base import BaseSQLExtractor 12 | from .deepseek import DeepseekExtractor 13 | from .groq import GroqExtractor 14 | from .litellm import LiteLlmExtractor 15 | from .openai import OpenaiExtractor 16 | 17 | load_dotenv() 18 | 19 | DEFAULTS = { 20 | "litellm": {"class": LiteLlmExtractor, "model": "openai/gpt-4.1"}, 21 | "groq": {"class": GroqExtractor, "model": "llama-3.3-70b-versatile"}, 22 | "openai": {"class": OpenaiExtractor, "model": "gpt-4.1"}, 23 | "deepseek": {"class": DeepseekExtractor, "model": "deepseek-chat"}, 24 | } 25 | 26 | 27 | def create_extractor( 28 | framework: str = "litellm", 29 | model: str | None = None, 30 | params: dict | None = None, 31 | prompt_path: Path | None = None, 32 | ) -> BaseSQLExtractor: 33 | """Create an appropriate SQL extractor based on the specified framework. 34 | 35 | Args: 36 | framework: The LLM framework to use ("litellm", "groq", "openai", or "deepseek") 37 | Note: Direct framework options are maintained for backward compatibility, 38 | but "litellm" is recommended as it provides integrations for all models 39 | from multiple providers 40 | model: The model name within the selected framework (uses default if None) 41 | params: Additional parameters to pass to the LLM API 42 | prompt_path: Path to a custom prompt YAML file 43 | 44 | Returns: 45 | An instance of the appropriate SQL extractor 46 | 47 | Raises: 48 | ValueError: If an unsupported framework is specified 49 | """ 50 | framework = framework.lower() 51 | if framework not in DEFAULTS: 52 | raise ValueError( 53 | f"Unsupported framework: {framework}. " 54 | f"Must be one of: {', '.join(DEFAULTS.keys())}" 55 | ) 56 | 57 | config = DEFAULTS[framework] 58 | extractor_class = config["class"] 59 | model_name = model or config["model"] 60 | 61 | return extractor_class(model=model_name, params=params, prompt_path=prompt_path) 62 | 63 | 64 | __all__ = [ 65 | "DeepseekExtractor", 66 | "GroqExtractor", 67 | "LiteLlmExtractor", 68 | "OpenaiExtractor", 69 | "create_extractor", 70 | ] 71 | -------------------------------------------------------------------------------- /sqldeps/llm_parsers/deepseek.py: -------------------------------------------------------------------------------- 1 | """DeepSeek-based SQL parser implementation. 2 | 3 | This module provides the DeepSeek-specific implementation of the BaseSQLExtractor 4 | for using DeepSeek's models to extract SQL dependencies. 5 | """ 6 | 7 | import os 8 | from pathlib import Path 9 | 10 | from openai import OpenAI 11 | 12 | from sqldeps.llm_parsers.base import BaseSQLExtractor 13 | 14 | 15 | class DeepseekExtractor(BaseSQLExtractor): 16 | """DeepSeek-based SQL dependency extractor. 17 | 18 | Attributes: 19 | ENV_VAR_NAME: Environment variable name for the API key 20 | client: OpenAI client instance configured for DeepSeek API 21 | """ 22 | 23 | # Expected environmental variable with the DeepSeek key 24 | ENV_VAR_NAME = "DEEPSEEK_API_KEY" 25 | 26 | def __init__( 27 | self, 28 | model: str = "deepseek-chat", 29 | params: dict | None = None, 30 | api_key: str | None = None, 31 | prompt_path: Path | None = None, 32 | ) -> None: 33 | """Initialize DeepSeek extractor. 
34 | 35 | Args: 36 | model: DeepSeek model name to use 37 | params: Additional parameters for the API 38 | api_key: DeepSeek API key (defaults to environment variable) 39 | prompt_path: Path to custom prompt YAML file 40 | 41 | Raises: 42 | ValueError: If API key is not provided 43 | """ 44 | super().__init__(model, params, prompt_path=prompt_path) 45 | 46 | api_key = api_key or os.getenv(self.ENV_VAR_NAME) 47 | if not api_key: 48 | raise ValueError( 49 | "No API key provided. Either pass api_key parameter or set " 50 | f"{self.ENV_VAR_NAME} environment variable." 51 | ) 52 | 53 | self.client = OpenAI(api_key=api_key, base_url="https://api.deepseek.com") 54 | 55 | def _query_llm(self, user_prompt: str) -> str: 56 | """Query the DeepSeek LLM with the generated prompt. 57 | 58 | Args: 59 | user_prompt: Generated prompt to send to DeepSeek 60 | 61 | Returns: 62 | Response content from DeepSeek 63 | """ 64 | response = self.client.chat.completions.create( 65 | model=self.model, 66 | messages=[ 67 | {"role": "system", "content": self.prompts["system_prompt"]}, 68 | {"role": "user", "content": user_prompt}, 69 | ], 70 | response_format={"type": "json_object"}, 71 | stream=False, 72 | **self.params, 73 | ) 74 | 75 | return response.choices[0].message.content 76 | -------------------------------------------------------------------------------- /sqldeps/llm_parsers/groq.py: -------------------------------------------------------------------------------- 1 | """Groq-based SQL parser implementation. 2 | 3 | This module provides the Groq-specific implementation of the BaseSQLExtractor 4 | for using Groq's models to extract SQL dependencies. 5 | """ 6 | 7 | import os 8 | from pathlib import Path 9 | 10 | from groq import Groq 11 | 12 | from sqldeps.llm_parsers.base import BaseSQLExtractor 13 | 14 | 15 | class GroqExtractor(BaseSQLExtractor): 16 | """Groq-based SQL dependency extractor. 17 | 18 | Attributes: 19 | ENV_VAR_NAME: Environment variable name for the API key 20 | client: Groq client instance 21 | """ 22 | 23 | ENV_VAR_NAME = "GROQ_API_KEY" 24 | 25 | def __init__( 26 | self, 27 | model: str = "llama-3.3-70b-versatile", 28 | params: dict | None = None, 29 | api_key: str | None = None, 30 | prompt_path: Path | None = None, 31 | ) -> None: 32 | """Initialize Groq extractor.""" 33 | super().__init__(model, params, prompt_path=prompt_path) 34 | 35 | api_key = api_key or os.getenv(self.ENV_VAR_NAME) 36 | if not api_key: 37 | raise ValueError( 38 | "No API key provided. Either pass api_key parameter or set " 39 | f"{self.ENV_VAR_NAME} environment variable." 40 | ) 41 | 42 | self.client = Groq(api_key=api_key) 43 | 44 | def _query_llm(self, user_prompt: str) -> str: 45 | """Query the Groq LLM with the generated prompt. 46 | 47 | Args: 48 | user_prompt: Generated prompt to send to Groq 49 | 50 | Returns: 51 | Response content from Groq 52 | """ 53 | response = self.client.chat.completions.create( 54 | model=self.model, 55 | messages=[ 56 | {"role": "system", "content": self.prompts["system_prompt"]}, 57 | {"role": "user", "content": user_prompt}, 58 | ], 59 | response_format={"type": "json_object"}, 60 | **self.params, 61 | ) 62 | 63 | return response.choices[0].message.content 64 | -------------------------------------------------------------------------------- /sqldeps/llm_parsers/litellm.py: -------------------------------------------------------------------------------- 1 | """LiteLLM-based SQL parser implementation. 
2 | 3 | This module provides the LiteLLM-specific implementation of the BaseSQLExtractor 4 | for using various LLM models to extract SQL dependencies. 5 | """ 6 | 7 | import os 8 | from pathlib import Path 9 | 10 | from litellm import UnsupportedParamsError, completion 11 | 12 | from sqldeps.llm_parsers.base import BaseSQLExtractor 13 | 14 | 15 | class LiteLlmExtractor(BaseSQLExtractor): 16 | """LiteLLM-based SQL dependency extractor. 17 | 18 | This extractor supports multiple LLM providers through LiteLLM. 19 | Authentication is handled by LiteLLM, which supports various methods 20 | depending on the provider (API keys, tokens, or no authentication). 21 | 22 | API keys can be provided as a dictionary mapping environment variable names 23 | to their values. For example: 24 | { 25 | "OPENAI_API_KEY": "sk-...", 26 | "ANTHROPIC_API_KEY": "sk-...", 27 | } 28 | """ 29 | 30 | def __init__( 31 | self, 32 | model: str = "openai/gpt-4.1", 33 | params: dict | None = None, 34 | api_key: dict[str, str] | None = None, 35 | prompt_path: Path | None = None, 36 | ) -> None: 37 | """Initialize LiteLLM extractor. 38 | 39 | Args: 40 | model: LLM model name to use (supports various providers through LiteLLM) 41 | params: Additional parameters for the API 42 | api_key: Optional dictionary mapping environment variable names to 43 | API key values. For example: {"OPENAI_API_KEY": "sk-..."} 44 | prompt_path: Path to custom prompt YAML file 45 | """ 46 | super().__init__(model, params, prompt_path=prompt_path) 47 | 48 | if api_key: 49 | for env_var, key_value in api_key.items(): 50 | os.environ[env_var] = key_value 51 | 52 | def _query_llm(self, user_prompt: str) -> str: 53 | """Query the LLM with the generated prompt using LiteLLM. 54 | 55 | Args: 56 | user_prompt: Generated prompt to send to the LLM 57 | 58 | Returns: 59 | Response content from the LLM 60 | """ 61 | messages = [ 62 | {"role": "system", "content": self.prompts["system_prompt"]}, 63 | {"role": "user", "content": user_prompt}, 64 | ] 65 | 66 | try: 67 | response = completion( 68 | model=self.model, 69 | messages=messages, 70 | response_format={"type": "json_object"}, 71 | **self.params, 72 | ) 73 | except UnsupportedParamsError: 74 | response = completion( 75 | model=self.model, 76 | messages=messages, 77 | response_format={"type": "json_object"}, 78 | ) 79 | 80 | return response.choices[0].message.content 81 | -------------------------------------------------------------------------------- /sqldeps/llm_parsers/openai.py: -------------------------------------------------------------------------------- 1 | """OpenAI-based SQL parser implementation. 2 | 3 | This module provides the OpenAI-specific implementation of the BaseSQLExtractor 4 | for using OpenAI's models to extract SQL dependencies. 5 | """ 6 | 7 | import os 8 | from pathlib import Path 9 | 10 | from openai import BadRequestError, OpenAI 11 | 12 | from sqldeps.llm_parsers.base import BaseSQLExtractor 13 | 14 | 15 | class OpenaiExtractor(BaseSQLExtractor): 16 | """OpenAI-based SQL dependency extractor. 17 | 18 | Attributes: 19 | ENV_VAR_NAME: Environment variable name for the API key 20 | client: OpenAI client instance 21 | """ 22 | 23 | # Expected environmental variable with the OpenAI key 24 | ENV_VAR_NAME = "OPENAI_API_KEY" 25 | 26 | def __init__( 27 | self, 28 | model: str = "gpt-4o", 29 | params: dict | None = None, 30 | api_key: str | None = None, 31 | prompt_path: Path | None = None, 32 | ) -> None: 33 | """Initialize OpenAI extractor. 
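
        Example (a minimal sketch; assumes OPENAI_API_KEY is set in the
        environment):

            from sqldeps.llm_parsers import OpenaiExtractor

            extractor = OpenaiExtractor(model="gpt-4o")
            profile = extractor.extract_from_query("SELECT * FROM orders")
            print(profile.dependencies)  # e.g. {"orders": ["*"]}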
34 | 35 | Args: 36 | model: OpenAI model name to use 37 | params: Additional parameters for the API 38 | api_key: OpenAI API key (defaults to environment variable) 39 | prompt_path: Path to custom prompt YAML file 40 | 41 | Raises: 42 | ValueError: If API key is not provided 43 | """ 44 | super().__init__(model, params, prompt_path=prompt_path) 45 | 46 | api_key = api_key or os.getenv(self.ENV_VAR_NAME) 47 | if not api_key: 48 | raise ValueError( 49 | "No API key provided. Either pass api_key parameter or set " 50 | f"{self.ENV_VAR_NAME} environment variable." 51 | ) 52 | 53 | self.client = OpenAI(api_key=api_key) 54 | 55 | def _query_llm(self, user_prompt: str) -> str: 56 | """Query the OpenAI LLM with the generated prompt. 57 | 58 | Args: 59 | user_prompt: Generated prompt to send to OpenAI 60 | 61 | Returns: 62 | Response content from OpenAI 63 | """ 64 | messages = [ 65 | {"role": "system", "content": self.prompts["system_prompt"]}, 66 | {"role": "user", "content": user_prompt}, 67 | ] 68 | 69 | try: 70 | response = self.client.chat.completions.create( 71 | model=self.model, 72 | messages=messages, 73 | response_format={"type": "json_object"}, 74 | **self.params, 75 | ) 76 | except BadRequestError as e: 77 | if any(param in str(e) for param in ["temperature", "unsupported"]): 78 | response = self.client.chat.completions.create( 79 | model=self.model, 80 | messages=messages, 81 | response_format={"type": "json_object"}, 82 | ) 83 | else: 84 | raise 85 | 86 | return response.choices[0].message.content 87 | -------------------------------------------------------------------------------- /sqldeps/models.py: -------------------------------------------------------------------------------- 1 | """Data models for SQLDeps. 2 | 3 | This module defines the core data structures used by SQLDeps for 4 | representing SQL dependencies and outputs. 5 | """ 6 | 7 | from dataclasses import dataclass 8 | 9 | import pandas as pd 10 | 11 | 12 | @dataclass 13 | class SQLProfile: 14 | """Data class to hold both SQL dependencies and outputs.""" 15 | 16 | # Dependencies (input tables/columns required by the query) 17 | dependencies: dict[str, list[str]] 18 | 19 | # Outputs (tables/columns created or modified by the query) 20 | outputs: dict[str, list[str]] 21 | 22 | def __post_init__(self) -> None: 23 | """Sort tables and columns for consistent output.""" 24 | self.dependencies = { 25 | table: sorted(set(cols)) 26 | for table, cols in sorted(self.dependencies.items()) 27 | } 28 | self.outputs = { 29 | table: sorted(set(cols)) for table, cols in sorted(self.outputs.items()) 30 | } 31 | 32 | @property 33 | def dependency_tables(self) -> list[str]: 34 | """Get list of dependency tables. 35 | 36 | Returns: 37 | list[str]: Sorted list of table names referenced as dependencies 38 | """ 39 | return sorted(self.dependencies.keys()) 40 | 41 | @property 42 | def outcome_tables(self) -> list[str]: 43 | """Get list of outcome tables. 44 | 45 | Returns: 46 | list[str]: Sorted list of table names referenced as outputs 47 | """ 48 | return sorted(self.outputs.keys()) 49 | 50 | def to_dict(self) -> dict: 51 | """Convert to dictionary format. 52 | 53 | Returns: 54 | dict: Dictionary with dependencies and outputs 55 | """ 56 | return {"dependencies": self.dependencies, "outputs": self.outputs} 57 | 58 | def to_dataframe(self) -> pd.DataFrame: 59 | """Convert to a DataFrame with type column indicating dependency or outcome. 
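
        Example (a small, self-contained illustration of the expected shape):

            from sqldeps.models import SQLProfile

            profile = SQLProfile(
                dependencies={"public.users": ["id", "name"]},
                outputs={"reports.summary": []},
            )
            df = profile.to_dataframe()
            # Rows: ("dependency", "public", "users", "id"),
            #       ("dependency", "public", "users", "name"),
            #       ("outcome", "reports", "summary", None)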
60 | 61 | Returns: 62 | pd.DataFrame: DataFrame with columns for type, schema, table, and column 63 | """ 64 | records = [] 65 | 66 | # Add dependencies 67 | for table, columns in self.dependencies.items(): 68 | schema, table_name = table.split(".") if "." in table else (None, table) 69 | if columns: 70 | for column in columns: 71 | records.append( 72 | { 73 | "type": "dependency", 74 | "schema": schema, 75 | "table": table_name, 76 | "column": column, 77 | } 78 | ) 79 | else: 80 | records.append( 81 | { 82 | "type": "dependency", 83 | "schema": schema, 84 | "table": table_name, 85 | "column": None, 86 | } 87 | ) 88 | 89 | # Add outputs 90 | for table, columns in self.outputs.items(): 91 | schema, table_name = table.split(".") if "." in table else (None, table) 92 | if columns: 93 | for column in columns: 94 | records.append( 95 | { 96 | "type": "outcome", 97 | "schema": schema, 98 | "table": table_name, 99 | "column": column, 100 | } 101 | ) 102 | else: 103 | records.append( 104 | { 105 | "type": "outcome", 106 | "schema": schema, 107 | "table": table_name, 108 | "column": None, 109 | } 110 | ) 111 | 112 | return pd.DataFrame(records) 113 | -------------------------------------------------------------------------------- /sqldeps/parallel.py: -------------------------------------------------------------------------------- 1 | """Parallel processing utilities for SQL dependency extraction. 2 | 3 | This module provides functions for extracting SQL dependencies in parallel 4 | using multiple worker processes, with shared rate limiting. 5 | """ 6 | 7 | from concurrent.futures import ProcessPoolExecutor, as_completed 8 | from functools import partial 9 | from multiprocessing import Manager, cpu_count 10 | from pathlib import Path 11 | 12 | import numpy as np 13 | from loguru import logger 14 | from tenacity import retry, stop_after_attempt, wait_exponential 15 | 16 | from sqldeps.cache import load_from_cache, save_to_cache 17 | from sqldeps.models import SQLProfile 18 | from sqldeps.rate_limiter import MultiprocessingRateLimiter 19 | 20 | 21 | def resolve_workers(n_workers: int) -> int: 22 | """Resolve the number of worker processes to use. 23 | 24 | Args: 25 | n_workers: Requested number of workers (-1 for all, >0 for specific count) 26 | 27 | Returns: 28 | int: Actual number of worker processes to use 29 | 30 | Raises: 31 | ValueError: If n_workers is invalid (not -1, or not between 1 and cpu_count) 32 | """ 33 | max_workers = cpu_count() 34 | 35 | if n_workers == -1: 36 | return max_workers 37 | if 1 <= n_workers <= max_workers: 38 | return n_workers 39 | 40 | raise ValueError( 41 | f"Invalid worker count: {n_workers}. " 42 | f"Must be -1 (all), 1 (single), or up to {max_workers}." 43 | ) 44 | 45 | 46 | def _extract_from_file( 47 | file_path: Path, 48 | rate_limiter: MultiprocessingRateLimiter, 49 | framework: str, 50 | model: str, 51 | prompt_path: Path | None = None, 52 | use_cache: bool = True, 53 | ) -> tuple[Path, object]: 54 | """Process a single file with rate limiting and extraction. 
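
    Example (a sketch; assumes an API key for the chosen framework and a shared
    limiter built from a multiprocessing Manager; the file path is illustrative):

        from multiprocessing import Manager
        from pathlib import Path

        from sqldeps.rate_limiter import MultiprocessingRateLimiter

        limiter = MultiprocessingRateLimiter(Manager(), rpm=60)
        path, profile = _extract_from_file(
            Path("data/examples/example.sql"),
            limiter,
            framework="litellm",
            model="openai/gpt-4.1",
        )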
55 | 56 | Args: 57 | file_path: Path to SQL file 58 | rate_limiter: Rate limiter instance 59 | framework: LLM framework to use 60 | model: Model name within the framework 61 | prompt_path: Optional path to custom prompt 62 | use_cache: Whether to use cache 63 | 64 | Returns: 65 | Tuple of (file_path, result) or (file_path, None) on failure 66 | """ 67 | from sqldeps.llm_parsers import create_extractor 68 | 69 | # Check cache if enabled 70 | if use_cache: 71 | result = load_from_cache(file_path) 72 | if result: 73 | return file_path, result 74 | 75 | try: 76 | # Create extractor 77 | extractor = create_extractor( 78 | framework=framework, model=model, prompt_path=prompt_path 79 | ) 80 | 81 | # Apply rate limiting and extract with retry 82 | @retry(stop=stop_after_attempt(3), wait=wait_exponential(min=2, max=10)) 83 | def extract_with_rate_limit() -> SQLProfile: 84 | rate_limiter.wait_if_needed() 85 | logger.debug(f"Extracting from file: {file_path}") 86 | return extractor.extract_from_file(file_path) 87 | 88 | result = extract_with_rate_limit() 89 | 90 | # Save to cache if enabled 91 | if use_cache: 92 | save_to_cache(result, file_path) 93 | 94 | return file_path, result 95 | except Exception as e: 96 | logger.error(f"Failed to process {file_path}: {e}") 97 | return file_path, None 98 | 99 | 100 | def _process_batch_files( 101 | batch_files: list[Path], 102 | rate_limiter: MultiprocessingRateLimiter, 103 | framework: str, 104 | model: str, 105 | prompt_path: Path | None = None, 106 | use_cache: bool = True, 107 | ) -> dict: 108 | """Process a batch of files with shared rate limiting. 109 | 110 | Args: 111 | batch_files: List of file paths to process 112 | rate_limiter: Shared rate limiter 113 | framework: LLM framework to use 114 | model: Model name 115 | prompt_path: Optional path to custom prompt 116 | use_cache: Whether to use cache 117 | 118 | Returns: 119 | Dictionary mapping file paths to results 120 | """ 121 | results = {} 122 | 123 | for file_path in batch_files: 124 | path, result = _extract_from_file( 125 | file_path, rate_limiter, framework, model, prompt_path, use_cache 126 | ) 127 | if result: 128 | results[str(path)] = result 129 | 130 | return results 131 | 132 | 133 | def process_files_in_parallel( 134 | sql_files: list[Path], 135 | framework: str = "groq", 136 | model: str | None = None, 137 | prompt_path: Path | None = None, 138 | n_workers: int = 1, 139 | rpm: int = 100, 140 | use_cache: bool = True, 141 | ) -> dict: 142 | """Extract SQL dependencies from SQL files in parallel with rate limiting. 
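
    Example (a sketch; assumes a GROQ_API_KEY for the default framework and at
    least one local SQL file):

        from pathlib import Path

        from sqldeps.parallel import process_files_in_parallel

        files = [Path("data/examples/example.sql")]
        results = process_files_in_parallel(files, n_workers=1, rpm=30)
        for path, profile in results.items():
            print(path, profile.dependency_tables)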
143 | 144 | Args: 145 | sql_files: List of Paths to SQL files to process 146 | framework: LLM framework to use (e.g., groq, openai, deepseek) 147 | model: Model name within the selected framework 148 | prompt_path: Path to custom prompt YAML file 149 | n_workers: Number of worker processes to use (-1 for all) 150 | rpm: Requests per minute limit across all workers 151 | use_cache: Whether to use cached results 152 | 153 | Returns: 154 | Dictionary mapping file paths to SQLProfile objects 155 | 156 | Raises: 157 | ValueError: If no SQL files provided or no dependencies extracted 158 | """ 159 | # Resolve number of workers 160 | n_workers = resolve_workers(n_workers) 161 | 162 | # Ensure we have a list of Path objects 163 | sql_files = [Path(f) for f in sql_files] 164 | 165 | if not sql_files: 166 | raise ValueError("No SQL files provided") 167 | 168 | logger.info(f"Processing {len(sql_files)} SQL files") 169 | logger.info( 170 | f"Using {n_workers} workers with global rate limit of {rpm} requests per minute" 171 | ) 172 | logger.info(f"Cache usage: {'enabled' if use_cache else 'disabled'}") 173 | 174 | # Calculate optimal number of workers (don't use more workers than files) 175 | n_workers = min(n_workers, len(sql_files)) 176 | 177 | # Split files into batches 178 | batches = np.array_split(sql_files, n_workers) 179 | batches = [list(batch) for batch in batches if len(batch) > 0] 180 | 181 | all_results = {} 182 | 183 | # Create shared rate limiter 184 | with Manager() as manager: 185 | rate_limiter = MultiprocessingRateLimiter(manager, rpm) 186 | 187 | # Process batches in parallel 188 | with ProcessPoolExecutor(max_workers=n_workers) as executor: 189 | process_func = partial( 190 | _process_batch_files, 191 | rate_limiter=rate_limiter, 192 | framework=framework, 193 | model=model, 194 | prompt_path=prompt_path, 195 | use_cache=use_cache, 196 | ) 197 | 198 | futures = { 199 | executor.submit(process_func, batch): i 200 | for i, batch in enumerate(batches) 201 | } 202 | 203 | for future in as_completed(futures): 204 | batch_idx = futures[future] 205 | try: 206 | batch_results = future.result() 207 | all_results.update(batch_results) 208 | logger.info( 209 | f"Completed batch {batch_idx + 1}/{len(batches)} with " 210 | f"{len(batch_results)} results" 211 | ) 212 | except Exception as e: 213 | logger.error(f"Batch {batch_idx + 1} failed: {e}") 214 | 215 | # If no results were extracted 216 | if not all_results: 217 | raise ValueError("No dependencies could be extracted from any SQL file") 218 | 219 | return all_results 220 | -------------------------------------------------------------------------------- /sqldeps/rate_limiter.py: -------------------------------------------------------------------------------- 1 | """Rate limiting utilities for API calls. 2 | 3 | This module provides classes for limiting the rate of API calls to stay 4 | within provider limits, in both single-process and multi-process contexts. 5 | """ 6 | 7 | import time 8 | from collections import deque 9 | from multiprocessing.managers import SyncManager 10 | 11 | from loguru import logger 12 | 13 | 14 | class RateLimiter: 15 | """Rate limiter to prevent exceeding API rate limits. 16 | 17 | Tracks API call timestamps and enforces waiting periods 18 | to respect the specified requests per minute (RPM) limit. 
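
    Example (single-process usage; `queries` and `call_llm` are placeholders
    for the caller's own iterable and API call):

        limiter = RateLimiter(rpm=60)
        for query in queries:
            limiter.wait_if_needed()  # blocks once 60 calls land within a minute
            call_llm(query)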
19 | 20 | Attributes: 21 | rpm: Maximum requests per minute allowed 22 | call_times: Deque storing timestamps of recent API calls 23 | window: Time window in seconds (default: 60 seconds = 1 minute) 24 | """ 25 | 26 | def __init__(self, rpm: int) -> None: 27 | """Initialize the rate limiter with an RPM limit. 28 | 29 | Args: 30 | rpm: Maximum number of API requests allowed per minute 31 | """ 32 | self.rpm = rpm 33 | self.call_times = deque() 34 | self.window = 60 # 60 seconds = 1 minute window 35 | 36 | def wait_if_needed(self) -> None: 37 | """Ensures that calls do not exceed the rate limit. 38 | 39 | If the limit is reached, it waits until a slot is available. 40 | """ 41 | if self.rpm <= 0: # Disable rate limiting if rpm is 0 42 | return 43 | 44 | now = time.time() 45 | 46 | # Remove timestamps older than our time window (60 seconds) 47 | cutoff = now - self.window 48 | while self.call_times and self.call_times[0] < cutoff: 49 | self.call_times.popleft() 50 | 51 | # If we've reached the RPM limit, wait until the oldest timestamp expires 52 | if len(self.call_times) >= self.rpm: 53 | wait_time = max(0, self.call_times[0] + self.window - now) 54 | if wait_time > 0: 55 | logger.debug(f"Rate limit reached. Waiting {wait_time:.2f} seconds") 56 | time.sleep(wait_time) 57 | 58 | # After waiting, recalculate current time and clean up again 59 | now = time.time() 60 | cutoff = now - self.window 61 | while self.call_times and self.call_times[0] < cutoff: 62 | self.call_times.popleft() 63 | 64 | # Record this API call's timestamp 65 | self.call_times.append(now) 66 | 67 | 68 | class MultiprocessingRateLimiter: 69 | """A shared rate limiter for multiprocessing environments. 70 | 71 | Uses a manager to share state between processes, ensuring 72 | all processes collectively respect the RPM limit. 73 | 74 | Attributes: 75 | call_times: A shared list of API call timestamps 76 | lock: A shared lock for thread-safe operations 77 | rpm: Maximum requests per minute allowed 78 | window: Time window in seconds 79 | """ 80 | 81 | def __init__(self, manager: SyncManager, rpm: int) -> None: 82 | """Initialize with a multiprocessing manager and RPM limit. 83 | 84 | Args: 85 | manager: A multiprocessing.Manager instance 86 | rpm: Maximum requests per minute allowed 87 | """ 88 | self.call_times = manager.list() 89 | self.lock = manager.RLock() 90 | self.rpm = rpm 91 | self.window = 60 92 | 93 | def wait_if_needed(self) -> None: 94 | """Ensures calls don't exceed the rate limit across processes.""" 95 | if self.rpm <= 0: 96 | return 97 | 98 | with self.lock: 99 | now = time.time() 100 | cutoff = now - self.window 101 | 102 | # Remove old timestamps 103 | while self.call_times and self.call_times[0] < cutoff: 104 | self.call_times.pop(0) 105 | 106 | # Wait if at RPM limit 107 | if len(self.call_times) >= self.rpm: 108 | wait_time = max(0, self.call_times[0] + self.window - now) 109 | if wait_time > 0: 110 | logger.debug(f"Rate limit reached. Waiting {wait_time:.2f} seconds") 111 | time.sleep(wait_time) 112 | 113 | # Recalculate after waiting 114 | now = time.time() 115 | cutoff = now - self.window 116 | while self.call_times and self.call_times[0] < cutoff: 117 | self.call_times.pop(0) 118 | 119 | # Record this call 120 | self.call_times.append(now) 121 | -------------------------------------------------------------------------------- /sqldeps/utils.py: -------------------------------------------------------------------------------- 1 | """Utility functions for SQLDeps. 
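
Example (merging two extraction results; a small, self-contained sketch):

    from sqldeps.models import SQLProfile
    from sqldeps.utils import merge_profiles

    a = SQLProfile(dependencies={"users": ["id"]}, outputs={})
    b = SQLProfile(dependencies={"users": ["name"], "orders": ["*"]}, outputs={})
    merged = merge_profiles([a, b])
    # merged.dependencies == {"orders": ["*"], "users": ["id", "name"]}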
2 | 3 | This module provides helper functions for finding SQL files, merging SQL profiles, 4 | and performing schema validation and comparison. 5 | """ 6 | 7 | from pathlib import Path 8 | 9 | import pandas as pd 10 | 11 | from sqldeps.models import SQLProfile 12 | 13 | 14 | def find_sql_files( 15 | folder_path: str | Path, 16 | recursive: bool = False, 17 | valid_extensions: set[str] | None = None, 18 | ) -> list[Path]: 19 | """Find SQL files in a folder. 20 | 21 | Args: 22 | folder_path: Path to the folder 23 | recursive: Whether to search recursively 24 | valid_extensions: Set of valid file extensions (default: {'sql'}) 25 | 26 | Returns: 27 | List of file paths 28 | 29 | Raises: 30 | FileNotFoundError: If folder doesn't exist 31 | NotADirectoryError: If path is not a directory 32 | ValueError: If no SQL files are found 33 | """ 34 | folder_path = Path(folder_path) 35 | 36 | # Validate folder 37 | if not folder_path.exists(): 38 | raise FileNotFoundError(f"Folder not found: {folder_path}") 39 | 40 | if not folder_path.is_dir(): 41 | raise NotADirectoryError(f"Path is not a directory: {folder_path}") 42 | 43 | # Default extensions if not provided 44 | valid_extensions = valid_extensions or {"sql"} 45 | valid_extensions = {ext.lower().lstrip(".") for ext in valid_extensions} 46 | 47 | # Find matching files 48 | pattern = "**/*" if recursive else "*" 49 | sql_files = [ 50 | f 51 | for f in folder_path.glob(pattern) 52 | if f.is_file() and f.suffix.lower().lstrip(".") in valid_extensions 53 | ] 54 | 55 | if not sql_files: 56 | raise ValueError(f"No SQL files found in {folder_path}") 57 | 58 | return sql_files 59 | 60 | 61 | def merge_profiles(analyses: list[SQLProfile]) -> SQLProfile: 62 | """Merges multiple SQLProfile objects into a single one. 63 | 64 | Args: 65 | analyses: List of SQLProfile objects to merge 66 | 67 | Returns: 68 | A new SQLProfile with merged dependencies and outputs 69 | """ 70 | merged_dependencies = {} 71 | merged_outputs = {} 72 | 73 | for analysis in analyses: 74 | # Merge dependencies 75 | for table, columns in analysis.dependencies.items(): 76 | if "*" in columns: 77 | merged_dependencies[table] = {"*"} 78 | else: 79 | merged_dependencies.setdefault(table, set()).update(columns) 80 | 81 | # Merge outputs 82 | for table, columns in analysis.outputs.items(): 83 | if "*" in columns: 84 | merged_outputs[table] = {"*"} 85 | else: 86 | merged_outputs.setdefault(table, set()).update(columns) 87 | 88 | return SQLProfile( 89 | dependencies={ 90 | table: list(columns) for table, columns in merged_dependencies.items() 91 | }, 92 | outputs={table: list(columns) for table, columns in merged_outputs.items()}, 93 | ) 94 | 95 | 96 | def merge_schemas( 97 | df_extracted_schema: pd.DataFrame, df_db_schema: pd.DataFrame 98 | ) -> pd.DataFrame: 99 | """Matches extracted SQL dependencies with the actual database schema. 100 | 101 | Handles both exact schema matches and schema-agnostic matches. 102 | Expands wildcards ('*') to match all columns from the relevant table(s). 103 | Handles tables with no columns (None). 
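
    Example (a minimal sketch):

        import pandas as pd

        extracted = pd.DataFrame(
            [{"schema": None, "table": "users", "column": "id"}]
        )
        db = pd.DataFrame(
            [
                {
                    "schema": "public",
                    "table": "users",
                    "column": "id",
                    "data_type": "integer",
                }
            ]
        )
        merged = merge_schemas(extracted, db)
        # One row: public.users.id with exact_match == False (no schema was given)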
104 | 105 | Args: 106 | df_extracted_schema: Extracted table-column dependencies 107 | df_db_schema: Actual database schema information 108 | 109 | Returns: 110 | Merged schema with an `exact_match` flag indicating whether 111 | the schema name matched exactly 112 | """ 113 | # Create copy to avoid modifying input 114 | df_extracted = df_extracted_schema.copy() 115 | df_extracted["exact_match"] = pd.Series(dtype="boolean") 116 | 117 | # Initialize empty DataFrame with correct dtypes 118 | df_no_columns = pd.DataFrame( 119 | { 120 | "schema": pd.Series(dtype="object"), 121 | "table": pd.Series(dtype="object"), 122 | "column": pd.Series(dtype="object"), 123 | "data_type": pd.Series(dtype="object"), 124 | "exact_match": pd.Series(dtype="boolean"), 125 | } 126 | ) 127 | 128 | # Handle tables with no columns (None) 129 | if (no_columns_mask := df_extracted["column"].isna()).any(): 130 | no_columns_deps = df_extracted.loc[no_columns_mask, ["schema", "table"]] 131 | df_extracted = df_extracted.loc[~no_columns_mask] 132 | 133 | # Exact schema match 134 | exact_matches = ( 135 | no_columns_deps.dropna(subset=["schema"]) 136 | .merge(df_db_schema[["schema", "table"]], on=["schema", "table"]) 137 | .assign(column=None, data_type=None, exact_match=True) 138 | ) 139 | 140 | # Schema-agnostic match 141 | schema_agnostic = no_columns_deps[no_columns_deps["schema"].isna()] 142 | matching_schemas = df_db_schema.merge(schema_agnostic[["table"]], on="table")[ 143 | ["schema", "table"] 144 | ] 145 | schema_agnostic_matches = matching_schemas.assign( 146 | column=None, data_type=None, exact_match=False 147 | ) 148 | 149 | # Combine results 150 | df_no_columns = pd.concat( 151 | [exact_matches, schema_agnostic_matches], ignore_index=True 152 | ) 153 | 154 | # Expand wildcards (*) to include all relevant columns 155 | if (wildcard_mask := df_extracted["column"] == "*").any(): 156 | regular_deps = df_extracted[~wildcard_mask] 157 | wildcard_deps = df_extracted[wildcard_mask] 158 | expanded_wildcard_deps = [] 159 | 160 | for _, row in wildcard_deps.iterrows(): 161 | mask = df_db_schema["table"] == row["table"] 162 | if pd.notna(row["schema"]): 163 | mask &= df_db_schema["schema"] == row["schema"] 164 | wildcard_schema = df_db_schema[mask][ 165 | ["schema", "table", "column"] 166 | ].assign(exact_match=True) 167 | else: 168 | wildcard_schema = df_db_schema[mask][ 169 | ["schema", "table", "column"] 170 | ].assign(exact_match=False) 171 | expanded_wildcard_deps.append(wildcard_schema) 172 | 173 | df_extracted = pd.concat( 174 | [regular_deps, *expanded_wildcard_deps], ignore_index=True 175 | ) 176 | 177 | # Exact schema matches 178 | exact_matches = ( 179 | df_extracted[df_extracted["schema"].notna()] 180 | .merge(df_db_schema, how="inner") 181 | .fillna({"exact_match": True}) 182 | ) 183 | 184 | # Schema-agnostic matches (ignoring schema column) 185 | schemaless_matches = ( 186 | df_extracted[df_extracted["schema"].isna()] 187 | .drop(columns="schema") 188 | .merge(df_db_schema, how="inner") 189 | .fillna({"exact_match": False}) 190 | ) 191 | 192 | # Combine all results & remove duplicates with priority to exact matches 193 | df_merged_schemas = ( 194 | pd.concat([exact_matches, schemaless_matches, df_no_columns], ignore_index=True) 195 | .reindex(columns=["schema", "table", "column", "data_type", "exact_match"]) 196 | # Sort values to give priority to exact matches 197 | .sort_values( 198 | by=["schema", "table", "column", "data_type", "exact_match"], 199 | ascending=[True, True, True, True, False], 200 | 
na_position="last", 201 | ) 202 | # Drop duplicates (keep exact matches) 203 | .drop_duplicates(subset=["schema", "table", "column", "data_type"]) 204 | .reset_index(drop=True) 205 | ) 206 | 207 | return df_merged_schemas 208 | 209 | 210 | def schema_diff( 211 | df_extracted_schema: pd.DataFrame, df_db_schema: pd.DataFrame, copy: bool = True 212 | ) -> pd.DataFrame: 213 | """Checks if extracted schema entries exist in the database schema. 214 | 215 | Args: 216 | df_extracted_schema: Extracted table-column dependencies 217 | df_db_schema: Actual database schema information 218 | copy: Whether to create a copy of the input DataFrame 219 | 220 | Returns: 221 | The extracted schema with an added `match_db` flag 222 | """ 223 | # Copy dataframe to avoid in-place update 224 | if copy: 225 | df_extracted_schema = df_extracted_schema.copy() 226 | 227 | # Create sets for quick lookup 228 | db_exact_matches = set( 229 | zip( 230 | df_db_schema["schema"], 231 | df_db_schema["table"], 232 | df_db_schema["column"], 233 | strict=False, 234 | ) 235 | ) 236 | db_table_matches = set( 237 | zip(df_db_schema["schema"], df_db_schema["table"], strict=False) 238 | ) 239 | db_schema_agnostic = set( 240 | zip(df_db_schema["table"], df_db_schema["column"], strict=False) 241 | ) 242 | db_table_agnostic = set(df_db_schema["table"]) 243 | 244 | def check_existence(row: pd.Series) -> bool: 245 | """Helper function to determine if a row exists in the DB schema.""" 246 | if pd.notna(row["schema"]): 247 | if row["column"] == "*": 248 | return (row["schema"], row["table"]) in db_table_matches 249 | return (row["schema"], row["table"], row["column"]) in db_exact_matches 250 | else: 251 | if row["column"] == "*": 252 | return row["table"] in db_table_agnostic 253 | return (row["table"], row["column"]) in db_schema_agnostic 254 | 255 | # Apply vectorized check 256 | df_extracted_schema["match_db"] = df_extracted_schema.apply(check_existence, axis=1) 257 | 258 | return df_extracted_schema 259 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | """Test configuration and fixtures for all tests. 2 | 3 | This module provides pytest configuration, custom command-line options, 4 | and fixtures shared across test modules. 5 | """ 6 | 7 | from pathlib import Path 8 | 9 | import pytest 10 | 11 | from sqldeps.llm_parsers import BaseSQLExtractor, create_extractor 12 | 13 | # Base paths 14 | TEST_DATA_DIR = Path(__file__).parent / "data" 15 | SQL_DIR = TEST_DATA_DIR / "sql" 16 | EXPECTED_OUTPUT_DIR = TEST_DATA_DIR / "expected_outputs" 17 | 18 | 19 | def pytest_addoption(parser: pytest.Parser) -> None: 20 | """Register custom pytest command-line options. 21 | 22 | Args: 23 | parser: Pytest command-line parser 24 | """ 25 | parser.addoption( 26 | "--framework", 27 | action="store", 28 | default="litellm", 29 | help="Specify the framework to use (litellm, openai, groq, deepseek)", 30 | ) 31 | parser.addoption( 32 | "--model", 33 | action="store", 34 | default=None, 35 | help="Specify the model to use within the selected framework", 36 | ) 37 | parser.addoption( 38 | "--prompt", 39 | action="store", 40 | default=None, 41 | help="Specify the path to the prompt yml file to use a custom prompt", 42 | ) 43 | 44 | 45 | def pytest_configure(config: pytest.Config) -> None: 46 | """Register custom markers. 
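
    Example (how a test might opt into these markers; illustrative only):

        import pytest

        @pytest.mark.llm
        @pytest.mark.slow
        def test_full_extraction(extractor):
            profile = extractor.extract_from_query("SELECT id FROM users")
            assert "users" in profile.dependencies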
47 | 48 | Args: 49 | config: Pytest configuration object 50 | """ 51 | config.addinivalue_line( 52 | "markers", 53 | "llm: mark tests that require LLM API calls (typically skipped in CI/CD)", 54 | ) 55 | config.addinivalue_line( 56 | "markers", "integration: mark tests that integrate with external services" 57 | ) 58 | config.addinivalue_line("markers", "slow: mark tests that are slow to execute") 59 | 60 | 61 | def pytest_collection_modifyitems( 62 | items: list[pytest.Item], config: pytest.Config 63 | ) -> None: 64 | """Skip slow tests when only llm marker is specified.""" 65 | # Get the value of -m if specified 66 | markexpr = config.getoption("-m", default="") 67 | 68 | # Check if "llm" is specified but "slow" is not 69 | if "llm" in markexpr and "slow" not in markexpr: 70 | skip_marker = pytest.mark.skip( 71 | reason=( 72 | "Slow tests are skipped by default. Use -m 'llm and slow' to run them." 73 | ) 74 | ) 75 | for item in items: 76 | # If the test has both llm and slow markers, skip it 77 | if "slow" in item.keywords and "llm" in item.keywords: 78 | item.add_marker(skip_marker) 79 | 80 | 81 | @pytest.fixture 82 | def extractor(request: pytest.FixtureRequest) -> BaseSQLExtractor: 83 | """Create an extractor based on command-line options. 84 | 85 | Args: 86 | request: Pytest request object 87 | 88 | Returns: 89 | A configured SQLDeps extractor 90 | """ 91 | framework = request.config.getoption("--framework") 92 | model = request.config.getoption("--model") 93 | prompt = request.config.getoption("--prompt") 94 | 95 | return create_extractor(framework, model, prompt_path=prompt) 96 | -------------------------------------------------------------------------------- /tests/data/expected_outputs/example10_expected.json: -------------------------------------------------------------------------------- 1 | { 2 | "dependencies": { 3 | "customers": ["customer_name", "id"], 4 | "employees": [], 5 | "logs": [], 6 | "my_db.orders": ["order_date", "order_id", "total_amount"], 7 | "products": ["product_id", "product_name"], 8 | "reports.sales_report": ["customer_name", "product_name", "sale_id"], 9 | "sales": ["amount", "customer_id", "id"] 10 | }, 11 | "outputs": { 12 | "logs": [], 13 | "reports.sales_report": ["customer_name", "product_name", "sale_id"] 14 | } 15 | } -------------------------------------------------------------------------------- /tests/data/expected_outputs/example1_expected.json: -------------------------------------------------------------------------------- 1 | { 2 | "dependencies": { 3 | "users": ["id", "name"] 4 | }, 5 | "outputs": { 6 | 7 | } 8 | } -------------------------------------------------------------------------------- /tests/data/expected_outputs/example2_expected.json: -------------------------------------------------------------------------------- 1 | { 2 | "dependencies": { 3 | "users": [] 4 | }, 5 | "outputs": { 6 | 7 | } 8 | } -------------------------------------------------------------------------------- /tests/data/expected_outputs/example3_expected.json: -------------------------------------------------------------------------------- 1 | { 2 | "dependencies": { 3 | "my_db.users": ["id", "name"], 4 | "orders": ["order_id", "user_id"] 5 | }, 6 | "outputs": { 7 | 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /tests/data/expected_outputs/example4_expected.json: -------------------------------------------------------------------------------- 1 | { 2 | "dependencies": { 3 | "my_db.users": ["email", "id", 
"name", "status"], 4 | "orders": ["order_date", "order_id", "order_type", "priority_level", "shipping_status", "total_amount", "user_id"] 5 | }, 6 | "outputs": { 7 | 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /tests/data/expected_outputs/example5_expected.json: -------------------------------------------------------------------------------- 1 | { 2 | "dependencies": { 3 | "orders": ["user_id"], 4 | "users": ["id", "name"] 5 | }, 6 | "outputs": { 7 | 8 | } 9 | } -------------------------------------------------------------------------------- /tests/data/expected_outputs/example6_expected.json: -------------------------------------------------------------------------------- 1 | { 2 | "dependencies": { 3 | "orders": ["user_id"], 4 | "users": ["id", "name"] 5 | }, 6 | "outputs": { 7 | 8 | } 9 | } -------------------------------------------------------------------------------- /tests/data/expected_outputs/example7_expected.json: -------------------------------------------------------------------------------- 1 | { 2 | "dependencies": { 3 | "orders": ["user_id"], 4 | "users": ["id", "name"] 5 | }, 6 | "outputs": { 7 | 8 | } 9 | } -------------------------------------------------------------------------------- /tests/data/expected_outputs/example8_expected.json: -------------------------------------------------------------------------------- 1 | { 2 | "dependencies": { 3 | "build_public.Property_Shape": ["PropertyId", "ShapeId"], 4 | "build_spatial.Shape_Defor": ["ShapeId", "Year", "areaha"], 5 | "web_import.Api_Property_Defor": ["Ha", "PropertyId", "Year"] 6 | }, 7 | "outputs": { 8 | "web_import.Api_Property_Defor": ["Ha", "PropertyId", "Year"] 9 | } 10 | } -------------------------------------------------------------------------------- /tests/data/expected_outputs/example9_expected.json: -------------------------------------------------------------------------------- 1 | { 2 | "dependencies": { 3 | "orders": ["user_id"], 4 | "pgi_shape_clusters": ["PropertyGroupId", "ShapeCluster", "ShapeGroupId"], 5 | "spatial.Shape": ["ShapeId", "geom"], 6 | "users": ["id", "name"] 7 | }, 8 | "outputs": { 9 | "pgi_shape_geom_clusters": ["PropertyGroupId", "ShapeCluster", "ShapeGroupId", "geom"] 10 | } 11 | } -------------------------------------------------------------------------------- /tests/data/oneshot.json: -------------------------------------------------------------------------------- 1 | { 2 | "dependencies": { 3 | "schema1.users": ["user_id", "username", "status", "*", "registration_date"], 4 | "schema1.orders": ["user_id", "order_id", "order_date", "total_amount", "customer_id"], 5 | "schema1.order_items": ["order_id", "product_id", "quantity", "order_date"], 6 | "schema1.products": ["product_id", "product_name", "category", "current_stock", "stock_status", "last_updated"], 7 | "schema2.customer_metrics": ["customer_id", "monthly_order_count", "monthly_spend", "last_updated"], 8 | "schema2.audit_logs": [], 9 | "schema2.daily_summary": [] 10 | }, 11 | "outputs": { 12 | "schema2.customer_metrics": ["customer_id", "monthly_order_count", "monthly_spend", "last_updated"], 13 | "schema2.audit_logs": [], 14 | "schema2.daily_summary": ["date", "total_orders", "total_revenue", "avg_order_value"], 15 | "schema1.products": ["stock_status", "last_updated"], 16 | "schema2.monthly_report": ["month", "category", "order_count", "customer_count", "total_items_sold", "total_revenue"] 17 | } 18 | } 
-------------------------------------------------------------------------------- /tests/data/oneshot.sql: -------------------------------------------------------------------------------- 1 | -- This oneshot example demonstrates key SQL dependency extraction scenarios 2 | -- including table dependencies, outputs, temporary artifacts, truncates, and more 3 | 4 | -- Scenario 1: Simple SELECT with JOIN and WHERE clauses 5 | SELECT 6 | u.user_id, 7 | u.username, 8 | o.order_date, 9 | o.total_amount, 10 | p.product_name 11 | FROM 12 | schema1.users u 13 | JOIN 14 | schema1.orders o ON u.user_id = o.user_id 15 | JOIN 16 | schema1.order_items oi ON o.order_id = oi.order_id 17 | JOIN 18 | schema1.products p ON oi.product_id = p.product_id 19 | WHERE 20 | o.order_date > '2023-01-01' 21 | AND p.category = 'Electronics' 22 | AND u.status = 'active'; 23 | 24 | -- Scenario 2: CTE and INSERT operation with columns 25 | WITH recent_orders AS ( 26 | SELECT 27 | customer_id, 28 | COUNT(*) as order_count, 29 | SUM(total_amount) as total_spent 30 | FROM 31 | schema1.orders 32 | WHERE 33 | order_date > CURRENT_DATE - INTERVAL '30 days' 34 | GROUP BY 35 | customer_id 36 | ) 37 | INSERT INTO schema2.customer_metrics (customer_id, monthly_order_count, monthly_spend, last_updated) 38 | SELECT 39 | customer_id, 40 | order_count, 41 | total_spent, 42 | CURRENT_TIMESTAMP 43 | FROM 44 | recent_orders 45 | WHERE 46 | order_count > 0; 47 | 48 | -- Scenario 3: TRUNCATE alone (should appear in both dependencies and outputs) 49 | TRUNCATE TABLE schema2.audit_logs; 50 | 51 | -- Scenario 4: TRUNCATE followed by population of specific columns 52 | TRUNCATE TABLE schema2.daily_summary; 53 | 54 | INSERT INTO schema2.daily_summary (date, total_orders, total_revenue, avg_order_value) 55 | SELECT 56 | CURRENT_DATE, 57 | COUNT(*), 58 | SUM(total_amount), 59 | AVG(total_amount) 60 | FROM 61 | schema1.orders 62 | WHERE 63 | order_date = CURRENT_DATE; 64 | 65 | -- Scenario 5: UPDATE with subquery 66 | UPDATE schema1.products 67 | SET 68 | stock_status = 69 | CASE 70 | WHEN current_stock = 0 THEN 'Out of Stock' 71 | WHEN current_stock < 10 THEN 'Low Stock' 72 | ELSE 'In Stock' 73 | END, 74 | last_updated = CURRENT_TIMESTAMP 75 | WHERE 76 | product_id IN ( 77 | SELECT 78 | product_id 79 | FROM 80 | schema1.order_items 81 | WHERE 82 | order_date > CURRENT_DATE - INTERVAL '7 days' 83 | ); 84 | 85 | -- Scenario 6: CREATE TABLE and immediate population 86 | CREATE TABLE schema2.monthly_report AS 87 | SELECT 88 | DATE_TRUNC('month', o.order_date) AS month, 89 | p.category, 90 | COUNT(DISTINCT o.order_id) AS order_count, 91 | COUNT(DISTINCT o.user_id) AS customer_count, 92 | SUM(oi.quantity) AS total_items_sold, 93 | SUM(o.total_amount) AS total_revenue 94 | FROM 95 | schema1.orders o 96 | JOIN 97 | schema1.order_items oi ON o.order_id = oi.order_id 98 | JOIN 99 | schema1.products p ON oi.product_id = p.product_id 100 | GROUP BY 101 | DATE_TRUNC('month', o.order_date), 102 | p.category; 103 | 104 | -- Scenario 7: SELECT * (should generate ["*"] in dependencies) 105 | SELECT * 106 | FROM schema1.users 107 | WHERE registration_date > CURRENT_DATE - INTERVAL '90 days'; -------------------------------------------------------------------------------- /tests/data/sql/example1.sql: -------------------------------------------------------------------------------- 1 | -- Simple query selecting a subset of columns 2 | SELECT id, name FROM users -------------------------------------------------------------------------------- 
/tests/data/sql/example10.sql: -------------------------------------------------------------------------------- 1 | -- PostgreSQL function that uses CTEs and creates a table 2 | CREATE OR REPLACE FUNCTION generate_sales_report() 3 | RETURNS void AS $$ 4 | BEGIN 5 | -- Use CTEs to process data 6 | WITH cte_sales AS ( 7 | SELECT 8 | s.id AS sale_id, 9 | s.amount, 10 | c.customer_name 11 | FROM sales s 12 | JOIN customers c ON s.customer_id = c.id 13 | ), 14 | cte_products AS ( 15 | SELECT 16 | p.product_id, 17 | p.product_name 18 | FROM products p 19 | ) 20 | -- Insert the processed data into a new table 21 | INSERT INTO reports.sales_report (sale_id, customer_name, product_name) 22 | SELECT 23 | cte_sales.sale_id, 24 | cte_sales.customer_name, 25 | cte_products.product_name 26 | FROM cte_sales 27 | JOIN cte_products ON cte_sales.sale_id = cte_products.product_id; 28 | END; 29 | $$ LANGUAGE plpgsql; 30 | 31 | -- Truncate a table 32 | TRUNCATE TABLE logs; 33 | 34 | -- Query from a specific database 35 | SELECT 36 | my_db.orders.order_id, 37 | my_db.orders.order_date, 38 | my_db.orders.total_amount 39 | FROM my_db.orders; 40 | 41 | -- Select all columns from a table 42 | SELECT * 43 | FROM employees 44 | LIMIT 10; 45 | -------------------------------------------------------------------------------- /tests/data/sql/example2.sql: -------------------------------------------------------------------------------- 1 | -- Simple query selecting all columns 2 | SELECT * FROM users LIMIT 100 -------------------------------------------------------------------------------- /tests/data/sql/example3.sql: -------------------------------------------------------------------------------- 1 | -- Query with table alias, with and without database specification, and join 2 | SELECT u.id, u.name, o.order_id 3 | FROM my_db.users u 4 | JOIN orders AS o ON u.id = o.user_id -------------------------------------------------------------------------------- /tests/data/sql/example4.sql: -------------------------------------------------------------------------------- 1 | -- Query with table alias, with and without database specification, and join, and where clauses 2 | SELECT u.id, u.name, o.order_id 3 | FROM my_db.users u 4 | JOIN orders AS o ON u.id = o.user_id 5 | WHERE u.status = 'active' 6 | AND o.order_date >= '2024-01-01' 7 | AND o.total_amount > 100.00 8 | AND u.email LIKE '%@company.com' 9 | AND o.order_type IN ('retail', 'wholesale') 10 | AND ( 11 | o.shipping_status = 'pending' 12 | OR (o.shipping_status = 'processed' AND o.priority_level = 'high') 13 | ); -------------------------------------------------------------------------------- /tests/data/sql/example5.sql: -------------------------------------------------------------------------------- 1 | -- Simple CTE 2 | WITH user_orders AS ( 3 | SELECT user_id, COUNT(*) as order_count 4 | FROM orders 5 | GROUP BY user_id 6 | ) 7 | SELECT u.name, uo.order_count 8 | FROM users u 9 | JOIN user_orders uo ON u.id = uo.user_id; -------------------------------------------------------------------------------- /tests/data/sql/example6.sql: -------------------------------------------------------------------------------- 1 | -- Simple Subquery 1 2 | SELECT 3 | u.name, 4 | ( 5 | SELECT COUNT(*) 6 | FROM orders o 7 | WHERE o.user_id = u.id 8 | GROUP BY o.user_id 9 | ) as order_count 10 | FROM users u; -------------------------------------------------------------------------------- /tests/data/sql/example7.sql: 
-------------------------------------------------------------------------------- 1 | -- Simple Subquery 2 2 | SELECT 3 | u.name, 4 | uo.order_count 5 | FROM users u 6 | JOIN ( 7 | SELECT 8 | user_id, 9 | COUNT(*) as order_count 10 | FROM orders 11 | GROUP BY user_id 12 | ) uo ON u.id = uo.user_id; -------------------------------------------------------------------------------- /tests/data/sql/example8.sql: -------------------------------------------------------------------------------- 1 | -- Postgres Function 2 | CREATE OR REPLACE FUNCTION web_import."build_Api_Property_Defor"() 3 | RETURNS void 4 | LANGUAGE plpgsql 5 | AS $function$BEGIN 6 | TRUNCATE TABLE web_import."Api_Property_Defor"; 7 | 8 | INSERT INTO web_import."Api_Property_Defor"( 9 | "PropertyId", "Year", "Ha" 10 | ) 11 | SELECT ps."PropertyId", d."Year", avg("Defor") AS "Ha" 12 | FROM build_public."Property_Shape" ps 13 | INNER JOIN ( 14 | SELECT "ShapeId", "Year"::INTEGER, SUM("areaha") AS "Defor" 15 | FROM build_spatial."Shape_Defor" 16 | WHERE "Year"::text ~ '^[0-9]+$' 17 | -- and "areaha">6.25 18 | GROUP BY "ShapeId", "Year"::INTEGER 19 | ) d 20 | ON 21 | d."ShapeId" = ps."ShapeId" 22 | WHERE ps."PropertyId" IS NOT NULL 23 | GROUP BY ps."PropertyId", d."Year"; 24 | 25 | END 26 | $function$ -------------------------------------------------------------------------------- /tests/data/sql/example9.sql: -------------------------------------------------------------------------------- 1 | -- Multiple queries with CTEs & function 2 | CREATE OR REPLACE FUNCTION make_pgi_shape_geom_clusters() 3 | RETURNS VOID 4 | LANGUAGE plpgsql 5 | AS $function$ 6 | BEGIN 7 | 8 | -- Build table with cluster + geom data 9 | DROP TABLE IF EXISTS pgi_shape_geom_clusters CASCADE; 10 | CREATE TABLE pgi_shape_geom_clusters AS 11 | SELECT 12 | pgic."PropertyGroupId", 13 | pgic."ShapeGroupId", 14 | sh.geom, 15 | pgic."ShapeCluster" 16 | FROM 17 | pgi_shape_clusters pgic 18 | LEFT JOIN 19 | spatial."Shape" sh 20 | ON 21 | pgic."PropertyGroupId" = sh."ShapeId"; 22 | 23 | -- Integrity check: A Property observation should have at most one row 24 | ALTER TABLE pgi_shape_geom_clusters ADD PRIMARY KEY ("PropertyGroupId","ShapeGroupId"); 25 | ANALYZE VERBOSE pgi_shape_geom_clusters; 26 | 27 | END 28 | $function$; 29 | 30 | WITH user_orders AS ( 31 | SELECT user_id, COUNT(*) as order_count 32 | FROM orders 33 | GROUP BY user_id 34 | ) 35 | SELECT u.name, uo.order_count 36 | FROM users u 37 | JOIN user_orders uo ON u.id = uo.user_id; -------------------------------------------------------------------------------- /tests/functional/test_sql.py: -------------------------------------------------------------------------------- 1 | """Functional tests for SQL dependency extraction. 2 | 3 | This module tests the end-to-end functionality of SQL dependency extraction 4 | against a set of predefined SQL files with expected outputs. 5 | 6 | The module provides two testing approaches: 7 | 1. A fast batch test (test_sql_dependency_extraction_batch) that extracts 8 | dependencies from all SQL files at once using parallel processing 9 | 2. A slower individual test (test_sql_dependency_extraction_individual) that 10 | processes each file separately, which is useful for debugging specific files 11 | 12 | The batch approach is more efficient as it leverages parallel processing 13 | and extracts dependencies from all files in a single operation. 
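
A batch run below boils down to roughly (sketch):

    results = extractor.extract_from_folder(SQL_DIR, recursive=False, n_workers=-1)
    profile = results[str(SQL_DIR / "example1.sql")]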
14 | """ 15 | 16 | import json 17 | from pathlib import Path 18 | 19 | import pytest 20 | 21 | from sqldeps.llm_parsers import BaseSQLExtractor 22 | 23 | TEST_DATA_DIR = Path(__file__).parent.parent / "data" 24 | SQL_DIR = TEST_DATA_DIR / "sql" 25 | EXPECTED_OUTPUT_DIR = TEST_DATA_DIR / "expected_outputs" 26 | 27 | 28 | # Create a list of test cases from data directory 29 | def get_test_cases() -> list: 30 | """Create a list of test cases from data directory. 31 | 32 | Each test case is a tuple of (sql_file, expected_output_file). 33 | 34 | Returns: 35 | list: List of test case tuples 36 | """ 37 | test_cases = [] 38 | for sql_file in SQL_DIR.glob("example*.sql"): 39 | expected_file = EXPECTED_OUTPUT_DIR / f"{sql_file.stem}_expected.json" 40 | if expected_file.exists(): 41 | test_cases.append((sql_file, expected_file)) 42 | return test_cases 43 | 44 | 45 | def load_expected_output(expected_output_file: Path) -> dict: 46 | """Load the expected output from a JSON file. 47 | 48 | Args: 49 | expected_output_file: Path to the expected output JSON file 50 | 51 | Returns: 52 | dict: The expected output as a dictionary 53 | """ 54 | with open(expected_output_file) as f: 55 | return json.load(f) 56 | 57 | 58 | @pytest.mark.llm 59 | def test_sql_dependency_extraction_batch(extractor: BaseSQLExtractor) -> None: 60 | """Test extraction of dependencies from all SQL files at once. 61 | 62 | This is more efficient than testing each file individually as it 63 | extracts dependencies from all files in a single batch. 64 | 65 | Args: 66 | extractor: SQLDeps extractor fixture 67 | """ 68 | # Get all the test cases 69 | test_cases = get_test_cases() 70 | 71 | # Extract dependencies from all SQL files at once 72 | results = extractor.extract_from_folder( 73 | SQL_DIR, recursive=False, n_workers=-1, use_cache=False, rpm=100 74 | ) 75 | 76 | # Verify each result against its expected output 77 | for sql_file, expected_output_file in test_cases: 78 | expected_output = load_expected_output(expected_output_file) 79 | extracted = results[str(sql_file)].to_dict() 80 | 81 | # Use a more descriptive assertion message 82 | assert extracted == expected_output, f"Mismatch for {sql_file.name}" 83 | 84 | 85 | # Keep the original test for backward compatibility but mark it as slow 86 | @pytest.mark.parametrize( 87 | "sql_file,expected_output_file", 88 | get_test_cases(), 89 | ids=lambda x: x.name if isinstance(x, Path) else str(x), 90 | ) 91 | @pytest.mark.llm 92 | @pytest.mark.slow 93 | def test_sql_dependency_extraction_individual( 94 | sql_file: Path, expected_output_file: Path, extractor: BaseSQLExtractor 95 | ) -> None: 96 | """Test extraction of dependencies from SQL files individually. 97 | 98 | This is slower than the batch test but useful for debugging specific files. 99 | This test will only run when both 'llm' and 'slow' markers are specified. 
100 | 101 | Args: 102 | sql_file: Path to SQL file 103 | expected_output_file: Path to expected output JSON file 104 | extractor: SQLDeps extractor fixture 105 | """ 106 | # Load SQL code 107 | with open(sql_file) as f: 108 | sql = f.read() 109 | 110 | # Load expected output 111 | with open(expected_output_file) as f: 112 | expected_output = json.load(f) 113 | 114 | # Run the extractor 115 | dependency = extractor.extract_from_query(sql) 116 | 117 | # Assert the output matches the expected 118 | assert dependency.to_dict() == expected_output, f"Mismatch for {sql_file.name}" 119 | -------------------------------------------------------------------------------- /tests/integration/test_database.py: -------------------------------------------------------------------------------- 1 | """Integration tests for database connectors. 2 | 3 | These tests connect to an actual PostgreSQL database to verify 4 | schema retrieval and validation functionality. 5 | """ 6 | 7 | import os 8 | 9 | import pandas as pd 10 | import pytest 11 | 12 | from sqldeps.database import PostgreSQLConnector 13 | 14 | # Skip all tests if no test database is configured 15 | pytestmark = [ 16 | pytest.mark.skipif( 17 | os.environ.get("TEST_DB_HOST") is None, reason="Test database not configured" 18 | ), 19 | pytest.mark.integration, 20 | ] 21 | 22 | 23 | class TestPostgreSQLIntegration: 24 | """Integration tests for PostgreSQL connector. 25 | 26 | To run these tests, set the following environment variables: 27 | - TEST_DB_HOST 28 | - TEST_DB_PORT (optional, defaults to 5432) 29 | - TEST_DB_NAME 30 | - TEST_DB_USER 31 | - TEST_DB_PASSWORD 32 | """ 33 | 34 | @pytest.fixture 35 | def db_connector(self) -> PostgreSQLConnector: 36 | """Create a database connector for testing. 37 | 38 | Returns: 39 | PostgreSQLConnector: Configured database connector 40 | """ 41 | return PostgreSQLConnector( 42 | host=os.environ.get("TEST_DB_HOST"), 43 | port=int(os.environ.get("TEST_DB_PORT", "5432")), 44 | database=os.environ.get("TEST_DB_NAME"), 45 | username=os.environ.get("TEST_DB_USER"), 46 | password=os.environ.get("TEST_DB_PASSWORD"), 47 | ) 48 | 49 | def test_connection(self, db_connector: PostgreSQLConnector) -> None: 50 | """Test that connection to database succeeds.""" 51 | # Just creating the connector should establish a connection 52 | # If it doesn't, an exception will be raised and the test will fail 53 | assert db_connector is not None 54 | assert hasattr(db_connector, "engine") 55 | assert hasattr(db_connector, "inspector") 56 | 57 | def test_get_schema(self, db_connector: PostgreSQLConnector) -> None: 58 | """Test retrieving schema information.""" 59 | # Get schema for the public schema 60 | schema = db_connector.get_schema("public") 61 | 62 | # Verify result structure 63 | assert isinstance(schema, pd.DataFrame) 64 | assert set(schema.columns) == {"schema", "table", "column", "data_type"} 65 | assert len(schema) > 0 # Should have at least some tables 66 | 67 | # Verify all rows have the correct schema 68 | assert all(schema["schema"] == "public") 69 | 70 | # Commented out because it takes too long to run 71 | # @pytest.mark.slow 72 | # def test_get_schema_multiple(self, db_connector): 73 | # """Test retrieving schema from multiple schemas.""" 74 | # # Get schema from all available schemas 75 | # schema = db_connector.get_schema() 76 | 77 | # # Verify result 78 | # assert isinstance(schema, pd.DataFrame) 79 | # assert len(schema) > 0 80 | 81 | # # Should include multiple schemas if available 82 | # schemas = 
schema["schema"].unique() 83 | # assert len(schemas) > 0 84 | 85 | def test_export_schema_csv( 86 | self, db_connector: PostgreSQLConnector, tmp_path: str 87 | ) -> None: 88 | """Test exporting schema to CSV. 89 | 90 | Args: 91 | db_connector: PostgreSQL connector fixture 92 | tmp_path: Pytest temporary path fixture 93 | """ 94 | # Export schema to a temporary file 95 | output_file = tmp_path / "schema.csv" 96 | db_connector.export_schema_csv(output_file, schemas="public") 97 | 98 | # Verify file was created 99 | assert output_file.exists() 100 | 101 | # Verify file content 102 | df = pd.read_csv(output_file) 103 | assert set(df.columns) == {"schema", "table", "column", "data_type"} 104 | assert len(df) > 0 105 | -------------------------------------------------------------------------------- /tests/unit/app/__init__.py: -------------------------------------------------------------------------------- 1 | """Unit tests for SQLDeps web application. 2 | 3 | This package contains unit tests for the Streamlit-based web application. 4 | """ 5 | -------------------------------------------------------------------------------- /tests/unit/database/test_postgresql.py: -------------------------------------------------------------------------------- 1 | """Unit tests for PostgreSQLConnector. 2 | 3 | This module contains unit tests for the PostgreSQL connector functionality. 4 | """ 5 | 6 | from pathlib import Path 7 | from unittest.mock import MagicMock, mock_open, patch 8 | 9 | import pandas as pd 10 | import pytest 11 | import yaml 12 | 13 | from sqldeps.database.postgresql import PostgreSQLConnector 14 | 15 | 16 | class TestPostgreSQLConnector: 17 | """Test suite for PostgreSQLConnector.""" 18 | 19 | def test_initialization_with_params(self) -> None: 20 | """Test initialization with direct parameters.""" 21 | # Mock both create_engine and inspect 22 | with ( 23 | patch("sqldeps.database.postgresql.create_engine") as mock_engine, 24 | patch("sqldeps.database.postgresql.inspect") as mock_inspect, 25 | ): 26 | # Set up the mock inspector that will be returned 27 | mock_inspector = MagicMock() 28 | mock_inspect.return_value = mock_inspector 29 | 30 | connector = PostgreSQLConnector( 31 | host="localhost", 32 | port=5432, 33 | database="testdb", 34 | username="user", 35 | password="pass", 36 | ) 37 | 38 | # Verify engine was created 39 | mock_engine.assert_called_once() 40 | # Verify inspector was created 41 | mock_inspect.assert_called_once() 42 | assert connector.inspector == mock_inspector 43 | 44 | def test_initialization_missing_params(self) -> None: 45 | """Test initialization fails with missing parameters.""" 46 | with ( 47 | pytest.raises(ValueError, match="Missing required database parameters"), 48 | patch("os.getenv", return_value=None), 49 | ): 50 | PostgreSQLConnector( 51 | host=None, database="testdb", username="user", password="pass" 52 | ) 53 | 54 | def test_initialization_with_config_file(self) -> None: 55 | """Test initialization with YAML config file.""" 56 | config_data = { 57 | "database": { 58 | "host": "dbhost", 59 | "port": 5432, 60 | "database": "configdb", 61 | "username": "configuser", 62 | "password": "configpass", 63 | } 64 | } 65 | 66 | with ( 67 | patch("builtins.open", mock_open(read_data=yaml.dump(config_data))), 68 | patch("sqldeps.database.postgresql.create_engine") as mock_engine, 69 | patch("sqldeps.database.postgresql.inspect") as mock_inspect, 70 | patch.object(Path, "exists", return_value=True), 71 | ): 72 | # Set up the mock inspector 73 | mock_inspector = 
MagicMock() 74 | mock_inspect.return_value = mock_inspector 75 | 76 | PostgreSQLConnector(config_path=Path("config.yml")) 77 | 78 | # Verify engine was created with correct parameters 79 | mock_engine.assert_called_once() 80 | # Verify connection string contains expected values 81 | conn_string = mock_engine.call_args[0][0] 82 | assert "dbhost" in conn_string 83 | assert "configdb" in conn_string 84 | assert "configuser" in conn_string 85 | 86 | def test_get_schema(self) -> None: 87 | """Test schema retrieval functionality.""" 88 | with ( 89 | patch("sqldeps.database.postgresql.create_engine"), 90 | patch("sqldeps.database.postgresql.inspect") as mock_inspect, 91 | ): 92 | # Create mock inspector with appropriate return values 93 | mock_inspector = MagicMock() 94 | mock_inspector.get_schema_names.return_value = ["public"] 95 | mock_inspector.get_table_names.return_value = ["users"] 96 | mock_inspector.get_columns.return_value = [ 97 | {"name": "id", "type": "INTEGER"}, 98 | {"name": "name", "type": "VARCHAR"}, 99 | ] 100 | mock_inspect.return_value = mock_inspector 101 | 102 | # Create connector with mocked components 103 | connector = PostgreSQLConnector( 104 | host="localhost", database="testdb", username="user", password="pass" 105 | ) 106 | 107 | # Test get_schema method 108 | result = connector.get_schema() 109 | 110 | # Verify the result 111 | assert isinstance(result, pd.DataFrame) 112 | assert len(result) == 2 # Two columns 113 | assert list(result.columns) == ["schema", "table", "column", "data_type"] 114 | assert list(result["column"]) == ["id", "name"] 115 | 116 | def test_get_schema_with_specific_schemas(self) -> None: 117 | """Test schema retrieval for specific schemas.""" 118 | with ( 119 | patch("sqldeps.database.postgresql.create_engine"), 120 | patch("sqldeps.database.postgresql.inspect") as mock_inspect, 121 | ): 122 | # Create mock inspector 123 | mock_inspector = MagicMock() 124 | mock_inspector.get_table_names.return_value = ["orders"] 125 | mock_inspector.get_columns.return_value = [ 126 | {"name": "order_id", "type": "INTEGER"} 127 | ] 128 | mock_inspect.return_value = mock_inspector 129 | 130 | # Create connector with mocked components 131 | connector = PostgreSQLConnector( 132 | host="localhost", database="testdb", username="user", password="pass" 133 | ) 134 | 135 | # Test get_schema method with specific schema 136 | result = connector.get_schema(schemas="sales") 137 | 138 | # Verify the result 139 | assert isinstance(result, pd.DataFrame) 140 | assert len(result) == 1 141 | assert result["schema"][0] == "sales" 142 | assert result["table"][0] == "orders" 143 | assert result["column"][0] == "order_id" 144 | 145 | def test_pgpass_lookup(self) -> None: 146 | """Test .pgpass file password lookup.""" 147 | pgpass_content = ( 148 | "localhost:5432:testdb:user:secretpass\n*:5432:*:admin:adminpass" 149 | ) 150 | 151 | with ( 152 | patch("builtins.open", mock_open(read_data=pgpass_content)), 153 | patch.object(Path, "home", return_value=Path("/home/user")), 154 | patch.object(Path, "exists", return_value=True), 155 | ): 156 | # Test exact match 157 | password = PostgreSQLConnector._get_password_from_pgpass( 158 | None, "user", "localhost", "testdb", 5432, None 159 | ) 160 | assert password == "secretpass" 161 | 162 | # Test wildcard match 163 | password = PostgreSQLConnector._get_password_from_pgpass( 164 | None, "admin", "somehost", "anydb", 5432, None 165 | ) 166 | assert password == "adminpass" 167 | 
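
The unit tests above exercise PostgreSQLConnector entirely through mocks. For orientation, a minimal usage sketch of the same connector follows, assembled only from the constructor arguments and methods these tests (and the integration tests earlier) cover; the host, credentials, and output filename are placeholder values, not part of the repository.

from pathlib import Path

from sqldeps.database import PostgreSQLConnector

# Placeholder credentials -- in practice they come from explicit arguments,
# a YAML config file, or ~/.pgpass, as the tests above demonstrate.
connector = PostgreSQLConnector(
    host="localhost",
    port=5432,
    database="mydb",
    username="analyst",
    password="secret",
)

# Retrieve the "public" schema as a DataFrame with columns
# schema / table / column / data_type.
schema_df = connector.get_schema("public")
print(schema_df.head())

# Export the same information to CSV.
connector.export_schema_csv(Path("schema.csv"), schemas="public")
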
-------------------------------------------------------------------------------- /tests/unit/llm_parsers/test_base.py: -------------------------------------------------------------------------------- 1 | """Unit tests for BaseSQLExtractor. 2 | 3 | This module contains tests for the common functionality provided by the 4 | BaseSQLExtractor abstract base class. 5 | """ 6 | 7 | import json 8 | from pathlib import Path 9 | from unittest.mock import MagicMock, mock_open, patch 10 | 11 | import pytest 12 | 13 | from sqldeps.llm_parsers import BaseSQLExtractor 14 | from sqldeps.models import SQLProfile 15 | 16 | 17 | class MockSQLExtractor(BaseSQLExtractor): 18 | """Test implementation of BaseSQLExtractor for unit testing. 19 | 20 | This class provides a concrete implementation of the abstract BaseSQLExtractor 21 | that can be used in tests with mocked LLM responses. 22 | """ 23 | 24 | def __init__( 25 | self, 26 | model: str = "test-model", 27 | params: dict | None = None, 28 | prompt_path: str | None = None, 29 | ) -> None: 30 | """Initialize the mock extractor. 31 | 32 | Args: 33 | model: Model name 34 | params: Additional parameters 35 | prompt_path: Path to custom prompt file 36 | """ 37 | super().__init__(model, params, prompt_path) 38 | 39 | def _query_llm(self, prompt: str) -> str: 40 | """Implement the abstract method from the parent class. 41 | 42 | In actual tests, this method will typically be mocked. 43 | 44 | Args: 45 | prompt: Prompt to send to the LLM 46 | 47 | Returns: 48 | Empty string (will be mocked in tests) 49 | """ 50 | return "" 51 | 52 | 53 | @pytest.fixture 54 | def mock_extractor() -> MockSQLExtractor: 55 | """Provide a mock SQL extractor for tests. 56 | 57 | Returns: 58 | MockSQLExtractor: A concrete implementation of BaseSQLExtractor 59 | """ 60 | return MockSQLExtractor() 61 | 62 | 63 | @pytest.fixture 64 | def mock_sql_response() -> callable: 65 | """Create a standard SQL dependency response. 
66 | 67 | Returns: 68 | function: A function that creates JSON responses with given dependencies/outputs 69 | """ 70 | 71 | def _create_response( 72 | dependencies: dict | None = None, outputs: dict | None = None 73 | ) -> str: 74 | return json.dumps( 75 | {"dependencies": dependencies or {}, "outputs": outputs or {}} 76 | ) 77 | 78 | return _create_response 79 | 80 | 81 | class TestBaseSQLExtractor: 82 | """Test suite for BaseSQLExtractor.""" 83 | 84 | def test_initialization(self, mock_extractor: MockSQLExtractor) -> None: 85 | """Test proper initialization of BaseSQLExtractor.""" 86 | assert mock_extractor.model == "test-model" 87 | assert mock_extractor.framework == "mocksql" 88 | assert mock_extractor.params == {"temperature": 0} 89 | 90 | def test_extract_from_query( 91 | self, mock_extractor: MockSQLExtractor, mock_sql_response: callable 92 | ) -> None: 93 | """Test extraction from a SQL query.""" 94 | response = mock_sql_response( 95 | dependencies={"table1": ["col1", "col2"]}, outputs={"table2": ["col3"]} 96 | ) 97 | 98 | mock_extractor._query_llm = MagicMock(return_value=response) 99 | result = mock_extractor.extract_from_query("SELECT col1, col2 FROM table1") 100 | 101 | assert isinstance(result, SQLProfile) 102 | assert result.dependencies == {"table1": ["col1", "col2"]} 103 | assert result.outputs == {"table2": ["col3"]} 104 | mock_extractor._query_llm.assert_called_once() 105 | 106 | def test_extract_from_file( 107 | self, mock_extractor: MockSQLExtractor, mock_sql_response: callable 108 | ) -> None: 109 | """Test extraction from a SQL file.""" 110 | mock_sql = "SELECT * FROM users" 111 | response = mock_sql_response(dependencies={"users": ["*"]}) 112 | 113 | mock_extractor._query_llm = MagicMock(return_value=response) 114 | 115 | with ( 116 | patch("builtins.open", mock_open(read_data=mock_sql)), 117 | patch.object(Path, "exists", return_value=True), 118 | ): 119 | result = mock_extractor.extract_from_file("fake_path.sql") 120 | 121 | assert result.dependencies == {"users": ["*"]} 122 | assert result.outputs == {} 123 | 124 | def test_file_not_found(self, mock_extractor: MockSQLExtractor) -> None: 125 | """Test handling of file not found.""" 126 | with ( 127 | patch.object(Path, "exists", return_value=False), 128 | pytest.raises(FileNotFoundError), 129 | ): 130 | mock_extractor.extract_from_file("nonexistent.sql") 131 | 132 | def test_extract_from_folder(self, mock_extractor: MockSQLExtractor) -> None: 133 | """Test extraction from a folder.""" 134 | with patch("sqldeps.llm_parsers.base.find_sql_files") as mock_find: 135 | # Setup mock files 136 | mock_files = [Path("file1.sql"), Path("file2.sql")] 137 | mock_find.return_value = mock_files 138 | 139 | # Mock file extraction 140 | mock_extractor.extract_from_file = MagicMock( 141 | return_value=SQLProfile(dependencies={"table1": ["col1"]}, outputs={}) 142 | ) 143 | 144 | # Explicitly disable cache usage 145 | result = mock_extractor.extract_from_folder( 146 | "test_folder", recursive=True, n_workers=1, use_cache=False 147 | ) 148 | 149 | # Verify results 150 | assert len(result) == len(mock_files) 151 | assert mock_extractor.extract_from_file.call_count == len(mock_files) 152 | 153 | @pytest.mark.parametrize( 154 | "response,error_pattern", 155 | [ 156 | ("Invalid JSON", "Failed to decode JSON"), 157 | ('{"only_dependencies": {}}', "Missing required keys"), 158 | ], 159 | ) 160 | def test_process_response_errors( 161 | self, mock_extractor: MockSQLExtractor, response: str, error_pattern: str 162 | ) -> None: 163 | """Test 
handling of different error conditions. 164 | 165 | Args: 166 | mock_extractor: Mock extractor fixture 167 | response: Response string to process 168 | error_pattern: Expected error message pattern 169 | """ 170 | with pytest.raises(ValueError, match=error_pattern): 171 | mock_extractor._process_response(response) 172 | 173 | def test_load_prompts_default(self, mock_extractor: MockSQLExtractor) -> None: 174 | """Test loading default prompts.""" 175 | # Define a dict that mimics parsed YAML 176 | mock_yaml_data = { 177 | "system_prompt": "test system prompt", 178 | "user_prompt": "test user prompt", 179 | } 180 | 181 | # Directly patch yaml.safe_load to return our mock data 182 | with patch("yaml.safe_load", return_value=mock_yaml_data): 183 | # Create a new extractor to trigger _load_prompts 184 | extractor = MockSQLExtractor() 185 | 186 | # Verify the prompts were loaded correctly 187 | assert extractor.prompts == mock_yaml_data 188 | assert extractor.prompts["system_prompt"] == "test system prompt" 189 | assert extractor.prompts["user_prompt"] == "test user prompt" 190 | 191 | def test_normalize_extensions(self) -> None: 192 | """Test normalization of file extensions.""" 193 | result = MockSQLExtractor._normalize_extensions({".SQL", "sql", ".Sql"}) 194 | assert result == {"sql"} 195 | 196 | result = MockSQLExtractor._normalize_extensions(None) 197 | assert result == {"sql"} # Default extension 198 | -------------------------------------------------------------------------------- /tests/unit/llm_parsers/test_deepseek.py: -------------------------------------------------------------------------------- 1 | """Unit tests for DeepseekExtractor. 2 | 3 | This module tests the DeepSeek-specific LLM implementation. 4 | """ 5 | 6 | from unittest.mock import MagicMock, patch 7 | 8 | import pytest 9 | 10 | from sqldeps.llm_parsers.deepseek import DeepseekExtractor 11 | 12 | 13 | class TestDeepseekExtractor: 14 | """Test suite for DeepseekExtractor.""" 15 | 16 | def test_initialization(self) -> None: 17 | """Test proper initialization with API key.""" 18 | with patch.dict("os.environ", {"DEEPSEEK_API_KEY": "fake-key"}): 19 | extractor = DeepseekExtractor(model="deepseek-chat") 20 | 21 | assert extractor.model == "deepseek-chat" 22 | assert extractor.framework == "deepseek" 23 | assert hasattr(extractor, "client") 24 | 25 | def test_initialization_without_api_key(self) -> None: 26 | """Test initialization fails without API key.""" 27 | with ( 28 | patch.dict("os.environ", clear=True), 29 | pytest.raises(ValueError, match="No API key provided"), 30 | ): 31 | DeepseekExtractor(model="deepseek-chat") 32 | 33 | def test_query_llm(self) -> None: 34 | """Test LLM query functionality.""" 35 | with patch.dict("os.environ", {"DEEPSEEK_API_KEY": "fake-key"}): 36 | extractor = DeepseekExtractor(model="deepseek-chat") 37 | 38 | # Mock the OpenAI client (which DeepseekExtractor uses) 39 | mock_response = MagicMock() 40 | mock_response.choices = [MagicMock()] 41 | mock_response.choices[ 42 | 0 43 | ].message.content = '{"dependencies": {}, "outputs": {}}' 44 | 45 | extractor.client = MagicMock() 46 | extractor.client.chat.completions.create.return_value = mock_response 47 | 48 | # Test the query 49 | result = extractor._query_llm("SELECT * FROM test") 50 | 51 | # Verify the response and method calls 52 | assert result == '{"dependencies": {}, "outputs": {}}' 53 | extractor.client.chat.completions.create.assert_called_once() 54 | 55 | # Verify correct parameters were passed 56 | call_args = 
extractor.client.chat.completions.create.call_args[1] 57 | assert call_args["model"] == "deepseek-chat" 58 | assert call_args["response_format"] == {"type": "json_object"} 59 | assert len(call_args["messages"]) == 2 60 | assert call_args["messages"][0]["role"] == "system" 61 | assert call_args["messages"][1]["role"] == "user" 62 | -------------------------------------------------------------------------------- /tests/unit/llm_parsers/test_groq.py: -------------------------------------------------------------------------------- 1 | """Unit tests for GroqExtractor. 2 | 3 | This module tests the Groq-specific LLM implementation. 4 | """ 5 | 6 | from unittest.mock import MagicMock, patch 7 | 8 | import pytest 9 | 10 | from sqldeps.llm_parsers.groq import GroqExtractor 11 | 12 | 13 | class TestGroqExtractor: 14 | """Test suite for GroqExtractor.""" 15 | 16 | def test_initialization(self) -> None: 17 | """Test proper initialization with API key.""" 18 | with patch.dict("os.environ", {"GROQ_API_KEY": "fake-key"}): 19 | extractor = GroqExtractor(model="llama-3.3-70b-versatile") 20 | 21 | assert extractor.model == "llama-3.3-70b-versatile" 22 | assert extractor.framework == "groq" 23 | assert hasattr(extractor, "client") 24 | 25 | def test_initialization_without_api_key(self) -> None: 26 | """Test initialization fails without API key.""" 27 | with ( 28 | patch.dict("os.environ", clear=True), 29 | pytest.raises(ValueError, match="No API key provided"), 30 | ): 31 | GroqExtractor(model="llama-3.3-70b-versatile") 32 | 33 | def test_query_llm(self) -> None: 34 | """Test LLM query functionality.""" 35 | with patch.dict("os.environ", {"GROQ_API_KEY": "fake-key"}): 36 | extractor = GroqExtractor(model="llama-3.3-70b-versatile") 37 | 38 | # Mock the Groq client 39 | mock_response = MagicMock() 40 | mock_response.choices = [MagicMock()] 41 | mock_response.choices[ 42 | 0 43 | ].message.content = '{"dependencies": {}, "outputs": {}}' 44 | 45 | extractor.client = MagicMock() 46 | extractor.client.chat.completions.create.return_value = mock_response 47 | 48 | # Test the query 49 | result = extractor._query_llm("SELECT * FROM test") 50 | 51 | # Verify the response and method calls 52 | assert result == '{"dependencies": {}, "outputs": {}}' 53 | extractor.client.chat.completions.create.assert_called_once() 54 | 55 | # Verify correct parameters were passed 56 | call_args = extractor.client.chat.completions.create.call_args[1] 57 | assert call_args["model"] == "llama-3.3-70b-versatile" 58 | assert call_args["response_format"] == {"type": "json_object"} 59 | assert len(call_args["messages"]) == 2 60 | assert call_args["messages"][0]["role"] == "system" 61 | assert call_args["messages"][1]["role"] == "user" 62 | -------------------------------------------------------------------------------- /tests/unit/llm_parsers/test_init.py: -------------------------------------------------------------------------------- 1 | """Unit tests for sqldeps.llm_parsers.__init__. 2 | 3 | This module tests the factory function and other initialization 4 | logic of the llm_parsers package. 
5 | """ 6 | 7 | import pytest 8 | 9 | from sqldeps.llm_parsers import create_extractor 10 | 11 | 12 | class TestLLMParsersInit: 13 | """Test suite for sqldeps.llm_parsers.__init__.""" 14 | 15 | def test_create_extractor_invalid_framework(self) -> None: 16 | """Test creating an extractor with invalid framework.""" 17 | # No need to patch anything for this test 18 | with pytest.raises(ValueError, match="Unsupported framework"): 19 | create_extractor(framework="invalid_framework") 20 | -------------------------------------------------------------------------------- /tests/unit/llm_parsers/test_openai.py: -------------------------------------------------------------------------------- 1 | """Unit tests for OpenaiExtractor. 2 | 3 | This module tests the OpenAI-specific LLM implementation. 4 | """ 5 | 6 | from unittest.mock import MagicMock, patch 7 | 8 | import pytest 9 | 10 | from sqldeps.llm_parsers.openai import OpenaiExtractor 11 | 12 | 13 | class TestOpenaiExtractor: 14 | """Test suite for OpenaiExtractor.""" 15 | 16 | def test_initialization(self) -> None: 17 | """Test proper initialization with API key.""" 18 | with patch.dict("os.environ", {"OPENAI_API_KEY": "fake-key"}): 19 | extractor = OpenaiExtractor(model="gpt-4o") 20 | 21 | assert extractor.model == "gpt-4o" 22 | assert extractor.framework == "openai" 23 | assert hasattr(extractor, "client") 24 | 25 | def test_initialization_without_api_key(self) -> None: 26 | """Test initialization fails without API key.""" 27 | with ( 28 | patch.dict("os.environ", clear=True), 29 | pytest.raises(ValueError, match="No API key provided"), 30 | ): 31 | OpenaiExtractor(model="gpt-4o") 32 | 33 | def test_query_llm(self) -> None: 34 | """Test LLM query functionality.""" 35 | with patch.dict("os.environ", {"OPENAI_API_KEY": "fake-key"}): 36 | extractor = OpenaiExtractor(model="gpt-4o") 37 | 38 | # Mock the OpenAI client 39 | mock_response = MagicMock() 40 | mock_response.choices = [MagicMock()] 41 | mock_response.choices[ 42 | 0 43 | ].message.content = '{"dependencies": {}, "outputs": {}}' 44 | 45 | extractor.client = MagicMock() 46 | extractor.client.chat.completions.create.return_value = mock_response 47 | 48 | # Test the query 49 | result = extractor._query_llm("SELECT * FROM test") 50 | 51 | # Verify the response and method calls 52 | assert result == '{"dependencies": {}, "outputs": {}}' 53 | extractor.client.chat.completions.create.assert_called_once() 54 | 55 | # Verify correct parameters were passed 56 | call_args = extractor.client.chat.completions.create.call_args[1] 57 | assert call_args["model"] == "gpt-4o" 58 | assert call_args["response_format"] == {"type": "json_object"} 59 | assert len(call_args["messages"]) == 2 60 | assert call_args["messages"][0]["role"] == "system" 61 | assert call_args["messages"][1]["role"] == "user" 62 | -------------------------------------------------------------------------------- /tests/unit/test_cache.py: -------------------------------------------------------------------------------- 1 | """Unit tests for cache.py. 2 | 3 | This module tests the caching functionality for storing and retrieving 4 | SQL dependency extraction results. 
5 | """ 6 | 7 | import json 8 | from pathlib import Path 9 | from unittest.mock import MagicMock, mock_open, patch 10 | 11 | from sqldeps.cache import cleanup_cache, get_cache_path, load_from_cache, save_to_cache 12 | from sqldeps.models import SQLProfile 13 | 14 | 15 | def test_get_cache_path() -> None: 16 | """Test generation of cache file paths based on file content.""" 17 | # Set up mock file content 18 | mock_content = b"SELECT * FROM table" 19 | mock_content_hash = "0123456789abcdef" # Simplified hash output 20 | 21 | with ( 22 | patch("pathlib.Path.resolve") as mock_resolve, 23 | patch("builtins.open", mock_open(read_data=mock_content)), 24 | patch("hashlib.md5") as mock_md5, 25 | ): 26 | # Setup mocks 27 | mock_resolve.return_value = Path("/absolute/path/to/file.sql") 28 | mock_hash_instance = MagicMock() 29 | mock_hash_instance.hexdigest.return_value = mock_content_hash 30 | mock_md5.return_value = mock_hash_instance 31 | 32 | # Test content-based hashing 33 | cache_path = get_cache_path("file.sql") 34 | 35 | # Verify results 36 | expected_path = Path(".sqldeps_cache") / f"file_{mock_content_hash[:16]}.json" 37 | assert cache_path == expected_path 38 | 39 | # Verify file was opened for reading 40 | open.assert_called_once_with(Path("/absolute/path/to/file.sql"), "rb") 41 | 42 | # Verify hash was computed with file content 43 | mock_md5.assert_called_once() 44 | mock_hash_instance.hexdigest.assert_called_once() 45 | 46 | 47 | def test_save_load_cache() -> None: 48 | """Test saving and loading from cache.""" 49 | # Create a test SQLProfile 50 | profile = SQLProfile( 51 | dependencies={"table1": ["col1"]}, outputs={"table2": ["col2"]} 52 | ) 53 | 54 | # Mock file operations 55 | mock_file_content = json.dumps(profile.to_dict()) 56 | 57 | with ( 58 | patch("sqldeps.cache.get_cache_path") as mock_get_path, 59 | patch("builtins.open", mock_open(read_data=mock_file_content)), 60 | ): 61 | # Setup mock path 62 | mock_cache_path = Path(".sqldeps_cache/test.json") 63 | mock_get_path.return_value = mock_cache_path 64 | 65 | # Test saving to cache 66 | result = save_to_cache(profile, "test.sql") 67 | assert result is True 68 | 69 | # Test loading from cache 70 | with patch("pathlib.Path.exists") as mock_exists: 71 | mock_exists.return_value = True 72 | loaded = load_from_cache("test.sql") 73 | assert loaded.dependencies == profile.dependencies 74 | assert loaded.outputs == profile.outputs 75 | 76 | 77 | def test_cleanup_cache_success() -> None: 78 | """Test successful cleanup of cache directory.""" 79 | # Setup - create a mock cache directory with some files 80 | mock_cache_dir = Path("mock_cache_dir") 81 | mock_json_files = [ 82 | Path("mock_cache_dir/file1.json"), 83 | Path("mock_cache_dir/file2.json"), 84 | ] 85 | 86 | with ( 87 | patch("pathlib.Path.exists", return_value=True), 88 | patch("pathlib.Path.glob") as mock_glob, 89 | patch("pathlib.Path.unlink") as mock_unlink, 90 | patch("pathlib.Path.iterdir", return_value=[]), 91 | patch("pathlib.Path.rmdir") as mock_rmdir, 92 | patch("sqldeps.cache.logger") as mock_logger, 93 | ): 94 | # Setup mock glob to return our json files 95 | mock_glob.return_value = mock_json_files 96 | 97 | # Call the function 98 | result = cleanup_cache(mock_cache_dir) 99 | 100 | # Verify the result 101 | assert result is True 102 | assert mock_unlink.call_count == 2 # Should unlink both JSON files 103 | mock_rmdir.assert_called_once() # Should remove the directory 104 | mock_logger.info.assert_called() # Should log success 105 | 106 | 107 | def 
test_cleanup_cache_non_empty_dir() -> None: 108 | """Test cleanup when directory still has other files.""" 109 | mock_cache_dir = Path("mock_cache_dir") 110 | 111 | with ( 112 | patch("pathlib.Path.exists", return_value=True), 113 | patch("pathlib.Path.glob") as mock_glob, 114 | patch("pathlib.Path.unlink") as mock_unlink, 115 | patch("pathlib.Path.iterdir", return_value=["other_file"]), 116 | patch("pathlib.Path.rmdir") as mock_rmdir, 117 | patch("sqldeps.cache.logger") as mock_logger, 118 | ): 119 | # Setup mock glob to return JSON files 120 | mock_glob.return_value = [Path("mock_cache_dir/file1.json")] 121 | 122 | # Call the function 123 | result = cleanup_cache(mock_cache_dir) 124 | 125 | # Verify the result 126 | assert result is True 127 | assert mock_unlink.call_count == 1 # Should unlink the JSON file 128 | mock_rmdir.assert_not_called() # Should not remove non-empty directory 129 | mock_logger.info.assert_called_with( 130 | "Cache directory cleaned but not removed (contains other files)" 131 | ) 132 | 133 | 134 | def test_cleanup_cache_error() -> None: 135 | """Test cleanup when an error occurs.""" 136 | mock_cache_dir = Path("mock_cache_dir") 137 | 138 | with ( 139 | patch("pathlib.Path.exists", return_value=True), 140 | patch("pathlib.Path.glob") as mock_glob, 141 | patch("sqldeps.cache.logger") as mock_logger, 142 | ): 143 | # Make glob raise an exception 144 | mock_glob.side_effect = Exception("Test error") 145 | 146 | # Call the function 147 | result = cleanup_cache(mock_cache_dir) 148 | 149 | # Verify the result 150 | assert result is False 151 | mock_logger.warning.assert_called_once() # Should log warning 152 | 153 | 154 | def test_cleanup_cache_nonexistent() -> None: 155 | """Test cleanup when cache directory doesn't exist.""" 156 | mock_cache_dir = Path("nonexistent_dir") 157 | 158 | with patch("pathlib.Path.exists", return_value=False): 159 | # Call the function 160 | result = cleanup_cache(mock_cache_dir) 161 | 162 | # Verify the result 163 | assert result is True # Should return True when directory doesn't exist 164 | -------------------------------------------------------------------------------- /tests/unit/test_cli.py: -------------------------------------------------------------------------------- 1 | """Unit tests for command-line interface. 2 | 3 | This module tests the functionality of the CLI commands and related functions. 4 | """ 5 | 6 | from pathlib import Path 7 | from unittest.mock import MagicMock, patch 8 | 9 | import pytest 10 | from typer.testing import CliRunner 11 | 12 | from sqldeps.cli import app, extract, extract_dependencies, save_output 13 | from sqldeps.models import SQLProfile 14 | 15 | 16 | @pytest.fixture 17 | def runner() -> CliRunner: 18 | """Create a CLI runner for testing. 19 | 20 | Returns: 21 | CliRunner: A Typer CLI test runner 22 | """ 23 | return CliRunner() 24 | 25 | 26 | @pytest.fixture 27 | def mock_sql_profile() -> SQLProfile: 28 | """Create a mock SQLProfile for testing. 
29 | 30 | Returns: 31 | SQLProfile: A sample SQLProfile for testing 32 | """ 33 | return SQLProfile( 34 | dependencies={"users": ["id", "name"]}, outputs={"reports": ["date", "total"]} 35 | ) 36 | 37 | 38 | class TestCLI: 39 | """Test suite for command-line interface.""" 40 | 41 | def test_extract_dependencies(self, mock_sql_profile: SQLProfile) -> None: 42 | """Test extraction of dependencies.""" 43 | # Mock the extractor 44 | mock_extractor = MagicMock() 45 | mock_extractor.extract_from_file.return_value = mock_sql_profile 46 | mock_extractor.extract_from_folder.return_value = {"file.sql": mock_sql_profile} 47 | 48 | # Test file extraction 49 | with patch("pathlib.Path.is_file", return_value=True): 50 | result = extract_dependencies(mock_extractor, Path("file.sql"), False) 51 | assert result == mock_sql_profile 52 | mock_extractor.extract_from_file.assert_called_once() 53 | 54 | # Test folder extraction 55 | with patch("pathlib.Path.is_file", return_value=False): 56 | result = extract_dependencies(mock_extractor, Path("folder"), True) 57 | assert result == {"file.sql": mock_sql_profile} 58 | mock_extractor.extract_from_folder.assert_called_once() 59 | 60 | def test_save_output(self, mock_sql_profile: SQLProfile, tmp_path: Path) -> None: 61 | """Test saving output to different formats.""" 62 | # Test JSON output 63 | json_path = tmp_path / "output.json" 64 | save_output(mock_sql_profile, json_path) 65 | assert json_path.exists() 66 | 67 | # Test CSV output 68 | csv_path = tmp_path / "output.csv" 69 | save_output(mock_sql_profile, csv_path) 70 | assert csv_path.exists() 71 | 72 | # Test CSV output with schema match 73 | df_mock = MagicMock() 74 | df_mock.to_csv = MagicMock() 75 | save_output(df_mock, csv_path, is_schema_match=True) 76 | df_mock.to_csv.assert_called_once() 77 | 78 | def test_cli_command(self, runner: CliRunner, mock_sql_profile: SQLProfile) -> None: 79 | """Test the CLI command execution using isolated components.""" 80 | with ( 81 | patch("sqldeps.cli.create_extractor") as mock_create_extractor, 82 | patch("sqldeps.cli.extract_dependencies") as mock_extract, 83 | patch("sqldeps.cli.save_output") as mock_save, 84 | ): 85 | # Setup mocks 86 | mock_extractor = MagicMock() 87 | mock_create_extractor.return_value = mock_extractor 88 | mock_extract.return_value = mock_sql_profile 89 | 90 | # Use the extract function directly instead of 'main' 91 | extract( 92 | fpath=Path("file.sql"), 93 | framework="groq", 94 | model=None, 95 | prompt=None, 96 | recursive=False, 97 | db_match_schema=False, 98 | db_target_schemas="public", 99 | db_credentials=None, 100 | output=Path("dependencies.json"), 101 | ) 102 | 103 | # Verify function calls 104 | mock_create_extractor.assert_called_once() 105 | mock_extract.assert_called_once() 106 | mock_save.assert_called_once() 107 | 108 | def test_cli_error_handling(self) -> None: 109 | """Test error handling in CLI using mock directly.""" 110 | with patch("sqldeps.cli.create_extractor") as mock_create_extractor: 111 | # Make the extractor creation raise an exception 112 | mock_create_extractor.side_effect = ValueError("Test error") 113 | 114 | # Call the extract function directly and catch the exception 115 | from typer import Exit 116 | 117 | with pytest.raises(Exit) as excinfo: 118 | extract( 119 | fpath=Path("file.sql"), 120 | framework="groq", 121 | model=None, 122 | prompt=None, 123 | recursive=False, 124 | db_match_schema=False, 125 | db_target_schemas="public", 126 | db_credentials=None, 127 | output=Path("dependencies.json"), 128 | ) 129 | 
130 | # Verify the exit code is 1 131 | assert excinfo.value.exit_code == 1 132 | 133 | def test_cli_database_validation(self, mock_sql_profile: SQLProfile) -> None: 134 | """Test database validation logic directly.""" 135 | with ( 136 | patch("sqldeps.cli.create_extractor") as mock_create_extractor, 137 | patch("sqldeps.cli.extract_dependencies") as mock_extract, 138 | patch("sqldeps.cli.match_dependencies_against_schema") as mock_match, 139 | patch("sqldeps.cli.save_output"), 140 | patch("builtins.open", MagicMock()), 141 | patch("yaml.safe_load", return_value={"database": {}}), 142 | ): 143 | # Setup mocks 144 | mock_extractor = MagicMock() 145 | mock_create_extractor.return_value = mock_extractor 146 | mock_extract.return_value = mock_sql_profile 147 | mock_match.return_value = MagicMock() # Mock DataFrame result 148 | 149 | # Call the extract function directly 150 | extract( 151 | fpath=Path("file.sql"), 152 | framework="groq", 153 | model=None, 154 | prompt=None, 155 | recursive=False, 156 | db_match_schema=True, 157 | db_target_schemas="public", 158 | db_credentials=Path("config.yml"), 159 | output=Path("dependencies.json"), 160 | ) 161 | 162 | # Verify function calls 163 | mock_match.assert_called_once() 164 | 165 | def test_app_version(self, runner: CliRunner) -> None: 166 | """Test CLI app version command.""" 167 | # The version command is a safer option to test CLI integration 168 | result = runner.invoke(app, ["--version"]) 169 | 170 | # Version command should not produce an error 171 | assert result.exit_code == 0 172 | assert "SQLDeps version:" in result.output 173 | 174 | def test_app_command(self) -> None: 175 | """Test the app command functionality.""" 176 | with ( 177 | patch("sqldeps.cli.subprocess.run") as mock_run, 178 | patch("sqldeps.cli.Path.exists", return_value=True), 179 | ): 180 | from sqldeps.cli import app_main 181 | 182 | app_main() 183 | mock_run.assert_called_once() 184 | 185 | def test_cache_clear_command(self) -> None: 186 | """Test the cache clear command.""" 187 | with patch("sqldeps.cli.cleanup_cache", return_value=True) as mock_cleanup: 188 | from sqldeps.cli import cache_clear 189 | 190 | cache_clear() 191 | mock_cleanup.assert_called_once() 192 | -------------------------------------------------------------------------------- /tests/unit/test_config.py: -------------------------------------------------------------------------------- 1 | """Unit tests for config.py. 2 | 3 | This module tests configuration loading functionality. 4 | """ 5 | 6 | from unittest.mock import mock_open, patch 7 | 8 | from sqldeps.config import load_config 9 | 10 | 11 | def test_load_config() -> None: 12 | """Test loading configuration from a YAML file.""" 13 | # Simple test YAML with nested keys 14 | config_yaml = """ 15 | database: 16 | host: localhost 17 | port: 5432 18 | """ 19 | 20 | # Mock file open 21 | with patch("builtins.open", mock_open(read_data=config_yaml)): 22 | config = load_config("fake_config.yml") 23 | 24 | # Verify basic parsing including nested values 25 | assert config["database"]["host"] == "localhost" 26 | assert config["database"]["port"] == 5432 27 | -------------------------------------------------------------------------------- /tests/unit/test_models.py: -------------------------------------------------------------------------------- 1 | """Unit tests for data models. 2 | 3 | This module tests the SQLProfile class and its methods. 
4 | """ 5 | 6 | import pandas as pd 7 | 8 | from sqldeps.models import SQLProfile 9 | 10 | 11 | def test_sql_profile_initialization() -> None: 12 | """Test SQLProfile initialization and sorting.""" 13 | # Create a profile with unsorted data 14 | profile = SQLProfile( 15 | dependencies={ 16 | "table_b": ["col_c", "col_a", "col_b"], 17 | "table_a": ["col_z", "col_y"], 18 | }, 19 | outputs={ 20 | "schema.out_table_b": ["out_col_b", "out_col_a"], 21 | "schema.out_table_a": ["out_col_x"], 22 | }, 23 | ) 24 | 25 | # Check that tables and columns are sorted 26 | assert list(profile.dependencies.keys()) == ["table_a", "table_b"] 27 | assert profile.dependencies["table_a"] == ["col_y", "col_z"] 28 | assert profile.dependencies["table_b"] == ["col_a", "col_b", "col_c"] 29 | 30 | assert list(profile.outputs.keys()) == ["schema.out_table_a", "schema.out_table_b"] 31 | assert profile.outputs["schema.out_table_a"] == ["out_col_x"] 32 | assert profile.outputs["schema.out_table_b"] == ["out_col_a", "out_col_b"] 33 | 34 | 35 | def test_to_dataframe_conversion() -> None: 36 | """Test conversion to DataFrame with proper structure.""" 37 | profile = SQLProfile( 38 | dependencies={"schema.users": ["id", "name"]}, 39 | outputs={"public.user_report": ["user_id", "report_date"]}, 40 | ) 41 | 42 | df = profile.to_dataframe() 43 | 44 | # Check DataFrame structure 45 | assert isinstance(df, pd.DataFrame) 46 | assert set(df.columns) == {"type", "schema", "table", "column"} 47 | 48 | # Check dependencies were properly converted 49 | deps = df[df["type"] == "dependency"] 50 | assert len(deps) == 2 # Two columns 51 | assert set(deps["schema"]) == {"schema"} 52 | assert set(deps["table"]) == {"users"} 53 | assert set(deps["column"]) == {"id", "name"} 54 | 55 | # Check outputs were properly converted 56 | outs = df[df["type"] == "outcome"] 57 | assert len(outs) == 2 # Two columns 58 | assert set(outs["schema"]) == {"public"} 59 | assert set(outs["table"]) == {"user_report"} 60 | assert set(outs["column"]) == {"user_id", "report_date"} 61 | 62 | 63 | def test_empty_columns_handling() -> None: 64 | """Test handling of tables with no specific columns.""" 65 | profile = SQLProfile( 66 | dependencies={"table_with_no_columns": []}, 67 | outputs={"output_table_no_columns": []}, 68 | ) 69 | 70 | df = profile.to_dataframe() 71 | 72 | # Check tables with no columns are properly represented 73 | deps = df[df["type"] == "dependency"] 74 | assert len(deps) == 1 75 | assert deps.iloc[0]["table"] == "table_with_no_columns" 76 | assert deps.iloc[0]["column"] is None 77 | 78 | outs = df[df["type"] == "outcome"] 79 | assert len(outs) == 1 80 | assert outs.iloc[0]["table"] == "output_table_no_columns" 81 | assert outs.iloc[0]["column"] is None 82 | 83 | 84 | def test_to_dict() -> None: 85 | """Test conversion to dictionary format.""" 86 | profile = SQLProfile( 87 | dependencies={"users": ["id", "name"]}, outputs={"reports": ["user_id"]} 88 | ) 89 | 90 | result = profile.to_dict() 91 | 92 | assert isinstance(result, dict) 93 | assert "dependencies" in result 94 | assert "outputs" in result 95 | assert result["dependencies"] == {"users": ["id", "name"]} 96 | assert result["outputs"] == {"reports": ["user_id"]} 97 | 98 | 99 | def test_property_accessors() -> None: 100 | """Test property accessor methods.""" 101 | profile = SQLProfile( 102 | dependencies={"schema1.table1": ["col1"], "schema2.table2": ["col2"]}, 103 | outputs={"schema3.table3": ["col3"], "schema4.table4": ["col4"]}, 104 | ) 105 | 106 | # Test dependency_tables property 107 | 
assert profile.dependency_tables == ["schema1.table1", "schema2.table2"] 108 | 109 | # Test outcome_tables property 110 | assert profile.outcome_tables == ["schema3.table3", "schema4.table4"] 111 | -------------------------------------------------------------------------------- /tests/unit/test_parallel.py: -------------------------------------------------------------------------------- 1 | """Unit tests for parallel processing functionality. 2 | 3 | This module tests the parallel execution of SQL dependency extraction 4 | across multiple processes. 5 | """ 6 | 7 | from concurrent.futures import Future 8 | from pathlib import Path 9 | from unittest.mock import MagicMock, patch 10 | 11 | import pytest 12 | 13 | from sqldeps.parallel import ( 14 | _extract_from_file, 15 | _process_batch_files, 16 | process_files_in_parallel, 17 | resolve_workers, 18 | ) 19 | 20 | 21 | class TestParallelProcessing: 22 | """Test suite for parallel processing functionality.""" 23 | 24 | def test_resolve_workers(self) -> None: 25 | """Test resolution of worker count.""" 26 | with patch("sqldeps.parallel.cpu_count", return_value=8): 27 | # Default (-1) should use all CPUs 28 | assert resolve_workers(-1) == 8 29 | 30 | # Specific number within range 31 | assert resolve_workers(4) == 4 32 | 33 | # Minimum 1 34 | assert resolve_workers(1) == 1 35 | 36 | # Too large should raise ValueError 37 | with pytest.raises(ValueError): 38 | resolve_workers(9) 39 | 40 | # Too small should raise ValueError 41 | with pytest.raises(ValueError): 42 | resolve_workers(0) 43 | 44 | def test_extract_from_file_with_cache(self) -> None: 45 | """Test single file extraction with caching.""" 46 | # Mock dependencies 47 | mock_limiter = MagicMock() 48 | mock_result = MagicMock() 49 | mock_path = Path("test.sql") 50 | 51 | # Mock cache hit 52 | with patch("sqldeps.parallel.load_from_cache", return_value=mock_result): 53 | # Should return cached result without extracting 54 | path, result = _extract_from_file( 55 | mock_path, mock_limiter, "groq", "model", None, True 56 | ) 57 | 58 | assert path == mock_path 59 | assert result == mock_result 60 | mock_limiter.wait_if_needed.assert_not_called() 61 | 62 | def test_extract_from_file_without_cache(self) -> None: 63 | """Test single file extraction without cache hit.""" 64 | # Mock dependencies 65 | mock_limiter = MagicMock() 66 | mock_extractor = MagicMock() 67 | mock_extractor.extract_from_file.return_value = "result" 68 | mock_path = Path("test.sql") 69 | 70 | # Setup no cache hit, extract successful 71 | with ( 72 | patch("sqldeps.parallel.load_from_cache", return_value=None), 73 | patch("sqldeps.llm_parsers.create_extractor", return_value=mock_extractor), 74 | patch("sqldeps.parallel.save_to_cache") as mock_save, 75 | ): 76 | # Should perform extraction 77 | path, result = _extract_from_file( 78 | mock_path, mock_limiter, "groq", "model", None, True 79 | ) 80 | 81 | assert path == mock_path 82 | assert result == "result" 83 | mock_limiter.wait_if_needed.assert_called_once() 84 | mock_extractor.extract_from_file.assert_called_once_with(mock_path) 85 | mock_save.assert_called_once() 86 | 87 | def test_process_batch_files(self) -> None: 88 | """Test batch processing of files.""" 89 | # Mock dependencies 90 | mock_limiter = MagicMock() 91 | mock_files = [Path("test1.sql"), Path("test2.sql")] 92 | 93 | # Setup extraction results 94 | path1_result = MagicMock() 95 | path2_result = MagicMock() 96 | 97 | # Mock the extract_from_file function to return predetermined results 98 | with patch( 99 | 
"sqldeps.parallel._extract_from_file", 100 | side_effect=[(mock_files[0], path1_result), (mock_files[1], path2_result)], 101 | ): 102 | # Process batch 103 | results = _process_batch_files( 104 | mock_files, mock_limiter, "groq", "model", None, True 105 | ) 106 | 107 | # Verify results 108 | assert len(results) == 2 109 | assert results[str(mock_files[0])] == path1_result 110 | assert results[str(mock_files[1])] == path2_result 111 | 112 | def test_process_files_in_parallel(self) -> None: 113 | """Test parallel file processing.""" 114 | with ( 115 | patch("sqldeps.parallel.ProcessPoolExecutor") as mock_executor_class, 116 | patch("sqldeps.parallel.Manager") as mock_manager, 117 | patch("sqldeps.parallel.MultiprocessingRateLimiter") as mock_limiter_class, 118 | patch("sqldeps.parallel.resolve_workers") as mock_resolve, 119 | patch("sqldeps.parallel.np.array_split") as mock_array_split, 120 | ): 121 | # Setup mocks 122 | mock_resolve.return_value = 2 123 | mock_sql_files = [ 124 | Path("test1.sql"), 125 | Path("test2.sql"), 126 | Path("test3.sql"), 127 | Path("test4.sql"), 128 | ] 129 | mock_array_split.return_value = [ 130 | [mock_sql_files[0], mock_sql_files[1]], 131 | [mock_sql_files[2], mock_sql_files[3]], 132 | ] 133 | 134 | # Mock the manager 135 | manager_instance = MagicMock() 136 | mock_manager.return_value.__enter__.return_value = manager_instance 137 | 138 | # Mock the limiter 139 | mock_limiter = MagicMock() 140 | mock_limiter_class.return_value = mock_limiter 141 | 142 | # Mock the ProcessPoolExecutor 143 | executor_instance = MagicMock() 144 | mock_executor_class.return_value.__enter__.return_value = executor_instance 145 | 146 | # Setup futures and their results 147 | future1 = MagicMock(spec=Future) 148 | future2 = MagicMock(spec=Future) 149 | future1.result.return_value = { 150 | str(mock_sql_files[0]): "result1", 151 | str(mock_sql_files[1]): "result2", 152 | } 153 | future2.result.return_value = { 154 | str(mock_sql_files[2]): "result3", 155 | str(mock_sql_files[3]): "result4", 156 | } 157 | 158 | # Mock the futures dictionary 159 | executor_instance.submit.side_effect = [future1, future2] 160 | 161 | # Mock as_completed to return futures in order 162 | with patch( 163 | "sqldeps.parallel.as_completed", return_value=[future1, future2] 164 | ): 165 | # Call the function 166 | results = process_files_in_parallel( 167 | mock_sql_files, 168 | framework="groq", 169 | model="test-model", 170 | n_workers=2, 171 | rpm=60, 172 | use_cache=True, 173 | ) 174 | 175 | # Verify results 176 | assert len(results) == 4 177 | assert results[str(mock_sql_files[0])] == "result1" 178 | assert results[str(mock_sql_files[1])] == "result2" 179 | assert results[str(mock_sql_files[2])] == "result3" 180 | assert results[str(mock_sql_files[3])] == "result4" 181 | 182 | # Verify worker resolution 183 | mock_resolve.assert_called_once_with(2) 184 | 185 | # Verify batch splitting 186 | mock_array_split.assert_called_once() 187 | 188 | # Verify executor was created with correct workers 189 | mock_executor_class.assert_called_once_with(max_workers=2) 190 | 191 | # Verify submit was called for each batch 192 | assert executor_instance.submit.call_count == 2 193 | -------------------------------------------------------------------------------- /tests/unit/test_rate_limiter.py: -------------------------------------------------------------------------------- 1 | """Unit tests for rate limiter. 
2 | 3 | This module tests the rate limiting functionality which is used to control 4 | the frequency of API calls to LLM providers. 5 | """ 6 | 7 | from unittest.mock import MagicMock, patch 8 | 9 | from sqldeps.rate_limiter import MultiprocessingRateLimiter, RateLimiter 10 | 11 | 12 | def test_rate_limiter_no_wait_under_limit() -> None: 13 | """Test rate limiter when under the RPM limit (no waiting needed).""" 14 | # Create a rate limiter with 60 RPM (1 request per second) 15 | limiter = RateLimiter(rpm=60) 16 | 17 | # Mock time.time to return controlled values 18 | with patch("time.time", return_value=100), patch("time.sleep") as mock_sleep: 19 | # Call wait_if_needed multiple times (less than rpm) 20 | for _ in range(30): 21 | limiter.wait_if_needed() 22 | 23 | # Verify sleep was not called since we're under the rate limit 24 | mock_sleep.assert_not_called() 25 | 26 | 27 | def test_rate_limiter_wait_when_limit_reached() -> None: 28 | """Test rate limiter when RPM limit is reached (should wait).""" 29 | # Create a rate limiter with 10 RPM 30 | limiter = RateLimiter(rpm=10) 31 | 32 | # Set up the call_times list with timestamps that would trigger rate limiting 33 | current_time = 100 34 | limiter.call_times = [ 35 | current_time - 50 + i for i in range(10) 36 | ] # 10 calls in last 50 seconds 37 | 38 | # Mock time functions 39 | with ( 40 | patch("time.time", return_value=current_time), 41 | patch("time.sleep") as mock_sleep, 42 | ): 43 | # This call should trigger waiting since we've reached 10 calls in the window 44 | limiter.wait_if_needed() 45 | 46 | # Verify sleep was called with the correct wait time (should wait ~10 seconds) 47 | # First timestamp is (current_time - 50), so it expires at (current_time + 10) 48 | expected_wait_time = 10 # (current_time - 50) + 60 - current_time 49 | mock_sleep.assert_called_once() 50 | actual_wait_time = mock_sleep.call_args[0][0] 51 | assert ( 52 | abs(actual_wait_time - expected_wait_time) < 0.01 53 | ) # Allow small float differences 54 | 55 | 56 | def test_rate_limiter_zero_rpm() -> None: 57 | """Test rate limiter when RPM is set to zero (disabled).""" 58 | limiter = RateLimiter(rpm=0) 59 | 60 | with patch("time.time") as mock_time, patch("time.sleep") as mock_sleep: 61 | # Call wait_if_needed multiple times 62 | for _ in range(100): 63 | limiter.wait_if_needed() 64 | 65 | # Verify that time and sleep were not called 66 | mock_time.assert_not_called() 67 | mock_sleep.assert_not_called() 68 | 69 | 70 | def test_multiprocessing_rate_limiter() -> None: 71 | """Test multiprocessing rate limiter.""" 72 | # Create a mock manager 73 | mock_manager = MagicMock() 74 | mock_manager.list.return_value = [] 75 | mock_manager.RLock.return_value = MagicMock() 76 | 77 | # Create limiter with mock manager 78 | limiter = MultiprocessingRateLimiter(mock_manager, rpm=10) 79 | 80 | # Test wait_if_needed when under the limit 81 | with patch("time.time", return_value=100), patch("time.sleep") as mock_sleep: 82 | limiter.wait_if_needed() 83 | 84 | # Since there are no previous calls, sleep should not be called 85 | mock_sleep.assert_not_called() 86 | 87 | # call_times should have been updated 88 | assert len(limiter.call_times) == 1 89 | assert limiter.call_times[0] == 100 90 | -------------------------------------------------------------------------------- /tests/unit/test_visualization.py: -------------------------------------------------------------------------------- 1 | """Unit tests for visualization.py. 
2 | 3 | This module tests the visualization functions for creating interactive 4 | network graphs of SQL dependencies. 5 | """ 6 | 7 | from sqldeps.models import SQLProfile 8 | from sqldeps.visualization import visualize_sql_dependencies 9 | 10 | 11 | def test_visualize_sql_dependencies_basic() -> None: 12 | """Test basic visualization of SQL dependencies. 13 | 14 | Verifies that the visualization function creates a valid Plotly figure 15 | with the correct title and traces. 16 | """ 17 | # Simple mock dependencies data 18 | sql_profiles = { 19 | "file1.sql": SQLProfile( 20 | dependencies={"table1": ["col1", "col2"]}, outputs={"table2": ["col3"]} 21 | ), 22 | } 23 | 24 | # Call the visualization function 25 | figure = visualize_sql_dependencies(sql_profiles) 26 | 27 | # Basic assertions to verify the figure was created properly 28 | assert figure is not None 29 | assert len(figure.data) > 0 # Should have at least some traces 30 | 31 | # Verify title contains expected information 32 | assert "SQL Dependency Graph" in figure.layout.title.text 33 | assert "1 files" in figure.layout.title.text 34 | 35 | 36 | def test_visualize_sql_dependencies_empty() -> None: 37 | """Test visualization with empty dependencies. 38 | 39 | Verifies that the function handles empty input gracefully. 40 | """ 41 | # Call with empty dependencies 42 | figure = visualize_sql_dependencies({}) 43 | 44 | # Should still create a figure 45 | assert figure is not None 46 | assert "0 files" in figure.layout.title.text 47 | --------------------------------------------------------------------------------
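
Taken together, the suite above documents the package's public surface: extractors return SQLProfile objects, whole folders can be processed (optionally in parallel and with caching), and the resulting profiles can be matched against a live schema or rendered as a dependency graph. A minimal end-to-end sketch built only from interfaces the tests exercise is shown below; the Groq model name mirrors the one used in test_groq.py, the folder path is a placeholder, and a GROQ_API_KEY environment variable is assumed to be set.

from sqldeps.llm_parsers.groq import GroqExtractor
from sqldeps.visualization import visualize_sql_dependencies

# Requires GROQ_API_KEY in the environment (initialization fails otherwise,
# as test_initialization_without_api_key verifies).
extractor = GroqExtractor(model="llama-3.3-70b-versatile")

# Analyze every .sql file under a folder (placeholder path), reusing cached
# results where available; returns a mapping of file path -> SQLProfile.
profiles = extractor.extract_from_folder(
    "data/examples", recursive=True, n_workers=1, use_cache=True
)

# Inspect each profile as a DataFrame of type / schema / table / column rows.
for path, profile in profiles.items():
    print(path)
    print(profile.to_dataframe())

# Render the dependency graph as an interactive Plotly figure.
figure = visualize_sql_dependencies(profiles)
figure.show()
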