├── tests
│   ├── test_output
│   │   └── .gitkeep
│   ├── test_set_one
│   │   ├── data.csv
│   │   ├── plot.png
│   │   ├── lalala.csv
│   │   ├── output.xlsx
│   │   ├── raw
│   │   │   └── raw_data_in.dta
│   │   ├── processing.py
│   │   ├── report_gen.py
│   │   ├── analysis.py
│   │   └── ins_and_outs_file.py
│   ├── test_set_two
│   │   ├── data
│   │   │   └── input.csv
│   │   ├── visualisation.py
│   │   ├── model_solver.py
│   │   ├── data_processing.py
│   │   ├── slides.qmd
│   │   └── nb_example.ipynb
│   ├── test_set_three
│   │   └── db_analysis.py
│   ├── test_all.py
│   ├── test_reporter_integration.py
│   ├── test_qmd_parser.py
│   ├── test_qmd_integration.py
│   ├── test_jupyter_integration.py
│   └── test_analyser.py
├── docs
│   ├── styles.css
│   ├── favicon.png
│   ├── _quarto.yml
│   ├── logo.svg
│   ├── objects.json
│   ├── output_options.ipynb
│   ├── contributing.qmd
│   ├── index.ipynb
│   └── output.svg
├── codecov.yml
├── .github
│   ├── ISSUE_TEMPLATE.md
│   ├── release-drafter.yml
│   ├── workflows
│   │   ├── labeler.yml
│   │   ├── release.yml
│   │   └── tests.yml
│   └── labels.yml
├── Makefile
├── LICENSE
├── src
│   └── smartrappy
│       ├── __init__.py
│       ├── notebook_parser.py
│       ├── __main__.py
│       ├── qmd_parser.py
│       ├── models.py
│       └── reporters.py
├── version_bumper.py
├── pyproject.toml
├── .pre-commit-config.yaml
├── create_readme.py
├── README.md
├── .gitignore
└── noxfile.py
/tests/test_output/.gitkeep:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/test_set_one/data.csv:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/test_set_one/plot.png:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/test_set_one/lalala.csv:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/test_set_one/output.xlsx:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/docs/styles.css:
--------------------------------------------------------------------------------
1 | /* css styles */
2 |
--------------------------------------------------------------------------------
/tests/test_set_one/raw/raw_data_in.dta:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/test_set_two/data/input.csv:
--------------------------------------------------------------------------------
1 | ,value
2 | 1,1
3 | 2,2
4 | 3,3
5 |
--------------------------------------------------------------------------------
/docs/favicon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aeturrell/smartrappy/HEAD/docs/favicon.png
--------------------------------------------------------------------------------
/tests/test_set_one/processing.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 |
3 | df = pd.read_csv("data.csv")
4 | df.to_excel("output.xlsx")
5 |
--------------------------------------------------------------------------------
/codecov.yml:
--------------------------------------------------------------------------------
1 | comment: false
2 | coverage:
3 | status:
4 | project:
5 | default:
6 | target: "96"
7 | patch:
8 | default:
9 | target: "96"
10 |
--------------------------------------------------------------------------------
/tests/test_set_one/report_gen.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import pandas as pd
3 |
4 | df = pd.read_excel("output.xlsx")
5 | plt.plot(df["x"], df["y"])
6 | plt.savefig("plot.png")
7 |
--------------------------------------------------------------------------------
/tests/test_set_one/analysis.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 |
3 | import pandas as pd
4 |
5 | df = pd.read_csv("data.csv")
6 |
7 |
8 | def an_example_that_is_imported():
9 | print("hello")
10 |
11 |
12 | df_raw_in_data = pd.read_stata(Path("raw/raw_data_in.dta"))
13 |
--------------------------------------------------------------------------------
/tests/test_set_two/visualisation.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import pandas as pd
3 | from data_processing import process_data
4 |
5 |
6 | def create_plots():
7 | process_data()
8 | df = pd.read_csv("data/processed.csv")
9 | plt.plot(df["processed"])
10 | plt.savefig("output.png")
11 |
--------------------------------------------------------------------------------
/tests/test_set_one/ins_and_outs_file.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import pandas as pd
3 |
4 | df = pd.read_csv("lalala.csv")
5 |
6 | with open("text.txt", "w") as f:
7 | f.write("blah")
8 |
9 | df.to_csv("out.csv")
10 |
11 | fig, ax = plt.subplots()
12 | ax.plot([1, 2, 4], [3, 4, 5])
13 | plt.savefig("out_figure.svg")
14 |
--------------------------------------------------------------------------------
/tests/test_set_two/model_solver.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 |
3 | # Raw string: stops backslashes in the LaTeX being treated as escape sequences
4 | with open(Path("equation.tex"), "w") as f:
5 |     f.write(
6 |         r"$${\displaystyle {\frac {\partial f_{\alpha }}{\partial t}}+\mathbf {v} _{\alpha }\cdot {\frac {\partial f_{\alpha }}{\partial \mathbf {x} }}+{\frac {q_{\alpha }\mathbf {E} }{m_{\alpha }}}\cdot {\frac {\partial f_{\alpha }}{\partial \mathbf {v} }}=0,}$$"
7 |     )
8 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE.md:
--------------------------------------------------------------------------------
1 | ## Issue
2 |
3 | - smartrappy version:
4 | - Python version:
5 | - Operating System:
6 |
7 | ### Description
8 |
9 | Describe what you were trying to get done.
10 | Tell us what happened, what went wrong, and what you expected to happen.
11 |
12 | ### What I Did
13 |
14 | ```python
15 | Paste the command(s) you ran and the output.
16 | If there was a crash, please include the traceback here.
17 | ```
18 |
--------------------------------------------------------------------------------
/tests/test_set_two/data_processing.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | import pyodbc
4 |
5 |
6 | def process_data():
7 | df = pd.read_csv("data/input.csv")
8 | df["processed"] = df["value"].apply(np.sqrt)
9 | df.to_csv("data/processed.csv")
10 |
11 |
12 | mssql_conn = pyodbc.connect(
13 | "DRIVER={SQL Server};SERVER=myserver;DATABASE=mydatabase;UID=user;PWD=password"
14 | )
15 | df_db = pd.read_sql("SELECT TOP 10 * FROM customers", mssql_conn)
16 |
--------------------------------------------------------------------------------
/tests/test_set_two/slides.qmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Example slides"
3 | subtitle: ""
4 | format:
5 | clean-revealjs:
6 | output-file: "paper_slides.html"
7 | ---
8 |
9 | # Motivation {background-color="#770077" transition="fade-in fade-out"}
10 |
11 | ## A slide with a figure
12 |
13 | 
14 |
15 | ## A slide with a latex include
16 |
17 | {{< include /equation.tex >}}
18 |
19 | ## A slide with an alternative equation
20 |
21 | {{< include /alternative_equation.tex >}}
22 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | # This makes the documentation for smartrappy
2 | # In practice, though, done in the GitHub Action (release)
3 | .PHONY: all site publish
4 |
5 | all: site
6 |
7 | # Build the github pages site
8 | site:
9 | uv pip install -e .
10 | uv run quartodoc build --config docs/_quarto.yml
11 | cd docs; uv run quarto render --execute
12 | rm docs/.gitignore
13 | uv run python create_readme.py
14 | uv run nbstripout docs/*.ipynb
15 | uv run pre-commit run --all-files
16 |
17 |
18 | publish:
19 | uv pip install -e .
20 | uv run quartodoc build --config docs/_quarto.yml
21 | cd docs; uv run quarto render --execute
22 | cd docs; uv run quarto publish gh-pages --no-render
23 | rm docs/.gitignore
24 | uv run python create_readme.py
25 | uv run nbstripout docs/*.ipynb
26 | uv run pre-commit run --all-files
27 |
--------------------------------------------------------------------------------
/.github/release-drafter.yml:
--------------------------------------------------------------------------------
1 | categories:
2 | - title: ":boom: Breaking Changes"
3 | label: "breaking"
4 | - title: ":rocket: Features"
5 | label: "enhancement"
6 | - title: ":fire: Removals and Deprecations"
7 | label: "removal"
8 | - title: ":beetle: Fixes"
9 | label: "bug"
10 | - title: ":racehorse: Performance"
11 | label: "performance"
12 | - title: ":rotating_light: Testing"
13 | label: "testing"
14 | - title: ":construction_worker: Continuous Integration"
15 | label: "ci"
16 | - title: ":books: Documentation"
17 | label: "documentation"
18 | - title: ":hammer: Refactoring"
19 | label: "refactoring"
20 | - title: ":lipstick: Style"
21 | label: "style"
22 | - title: ":package: Dependencies"
23 | labels:
24 | - "dependencies"
25 | - "build"
26 | template: |
27 | ## Changes
28 |
29 | $CHANGES
30 |
--------------------------------------------------------------------------------
/.github/workflows/labeler.yml:
--------------------------------------------------------------------------------
1 |
2 | name: labeler
3 |
4 | on:
5 | push:
6 | branches:
7 | - 'main'
8 | paths:
9 | - '.github/labels.yml'
10 | - '.github/workflows/labeler.yml'
11 | pull_request:
12 | paths:
13 | - '.github/labels.yml'
14 | - '.github/workflows/labeler.yml'
15 |
16 | jobs:
17 | labeler:
18 | runs-on: ubuntu-latest
19 | permissions:
20 | contents: read
21 | issues: write
22 | steps:
23 | - name: Checkout
24 | uses: actions/checkout@v4
25 | - name: Run Labeler
26 | uses: crazy-max/ghaction-github-labeler@v5
27 | with:
28 | skip-delete: true
29 | github-token: ${{ secrets.GITHUB_TOKEN }}
30 | yaml-file: .github/labels.yml
31 | dry-run: ${{ github.event_name == 'pull_request' }}
32 | exclude: |
33 | help*
34 | *issue
35 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2025 Arthur Turrell
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/docs/_quarto.yml:
--------------------------------------------------------------------------------
1 | project:
2 | type: website
3 | execute-dir: project
4 |
5 | website:
6 | title: "smartrappy"
7 | favicon: favicon.png
8 | twitter-card: true
9 | navbar:
10 | left:
11 | - href: index.ipynb
12 | text: Home
13 | - href: output_options.ipynb
14 | text: Other output options
15 | - text: "Reference"
16 | file: reference/index.qmd
17 | - contributing.qmd
18 |
19 | format:
20 | html:
21 | theme: flatly
22 | css: styles.css
23 | toc: true
24 |
25 | # tell quarto to read the generated sidebar
26 | metadata-files:
27 | - _sidebar.yml
28 |
29 |
30 | quartodoc:
31 | # the name used to import the package you want to create reference docs for
32 | package: smartrappy
33 | parser: google
34 |
35 | # write sidebar data to this file
36 | sidebar: _sidebar.yml
37 |
38 | sections:
39 | - title: "Function reference"
40 | desc: "What smartrappy's functions do"
41 | contents:
42 | # the functions being documented in the package.
43 | # you can refer to anything: class methods, modules, etc..
44 | - analyse_project
45 | - ConsoleReporter
46 | - JsonReporter
47 | - MermaidReporter
48 | - GraphvizReporter
49 |
--------------------------------------------------------------------------------
/tests/test_set_three/db_analysis.py:
--------------------------------------------------------------------------------
1 | import sqlite3
2 |
3 | import pandas as pd
4 | import pyodbc
5 | from sqlalchemy import create_engine
6 |
7 | # SQLite connection (simplest to test with since it doesn't require a server)
8 | sqlite_conn = sqlite3.connect("example.db")
9 | df1 = pd.read_sql("SELECT * FROM users", sqlite_conn)
10 |
11 | # Write to SQLite using pandas
12 | df2 = pd.DataFrame({"col1": [1, 2], "col2": [3, 4]})
13 | df2.to_sql("new_table", sqlite_conn, if_exists="replace")
14 |
15 | # SQLAlchemy with SQLite
16 | engine = create_engine("sqlite:///another_example.db")
17 | df3 = pd.read_sql_table("some_table", engine)
18 |
19 | # MS SQL Server via pyodbc
20 | mssql_conn = pyodbc.connect(
21 | "DRIVER={SQL Server};SERVER=myserver;DATABASE=mydatabase;UID=user;PWD=password"
22 | )
23 | df4 = pd.read_sql("SELECT TOP 10 * FROM customers", mssql_conn)
24 |
25 | # MS SQL Server via SQLAlchemy
26 | mssql_engine = create_engine(
27 | "mssql+pyodbc://user:password@myserver/mydatabase?driver=SQL+Server"
28 | )
29 | df5 = pd.DataFrame({"col1": [1, 2], "col2": [3, 4]})
30 | df5.to_sql("new_mssql_table", mssql_engine, if_exists="replace")
31 |
32 | # PostgreSQL via SQLAlchemy
33 | pg_engine = create_engine("postgresql://user:password@localhost:5432/pgdb")
34 | df6 = pd.read_sql_query("SELECT * FROM pg_tables", pg_engine)
35 |
36 | df6.to_csv("out.csv")
37 |
--------------------------------------------------------------------------------
/src/smartrappy/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | smartrappy
3 | ------------------------------------
4 | Smart reproducible analytical pipeline inspection
5 | """
6 |
7 | from importlib.metadata import PackageNotFoundError, version
8 |
9 | try:
10 | __version__ = version("smartrappy")
11 | except PackageNotFoundError:
12 | __version__ = "unknown"
13 |
14 | # Import core components only; the CLI entry point lives in __main__,
15 | # which imports from this package, so importing it here would be circular
16 | from smartrappy.analyser import analyse_project
17 | from smartrappy.models import (
18 | DatabaseInfo,
19 | Edge,
20 | FileInfo,
21 | FileStatus,
22 | ModuleImport,
23 | Node,
24 | NodeType,
25 | ProjectModel,
26 | )
27 | from smartrappy.qmd_parser import (
28 | analyse_qmd_file,
29 | extract_markdown_resources,
30 | extract_python_chunks,
31 | )
32 | from smartrappy.reporters import (
33 | ConsoleReporter,
34 | GraphvizReporter,
35 | JsonReporter,
36 | MermaidReporter,
37 | Reporter,
38 | get_reporter,
39 | )
40 |
41 | __all__ = [
42 | # Main functions
43 | "analyse_project",
44 | "analyse_qmd_file",
45 | "extract_python_chunks",
46 | "extract_markdown_resources", # New export
47 | # Models
48 | "DatabaseInfo",
49 | "Edge",
50 | "FileInfo",
51 | "FileStatus",
52 | "ModuleImport",
53 | "Node",
54 | "NodeType",
55 | "ProjectModel",
56 | # Reporters
57 | "Reporter",
58 | "ConsoleReporter",
59 | "GraphvizReporter",
60 | "MermaidReporter",
61 | "JsonReporter",
62 | "get_reporter",
63 | ]
64 |
--------------------------------------------------------------------------------
/version_bumper.py:
--------------------------------------------------------------------------------
1 | # /// script
2 | # dependencies = [
3 | # "toml>=0.10.2"
4 | # ]
5 | # ///
6 | import subprocess
7 | from typing import Literal
8 |
9 | import toml
10 |
11 |
12 | def bump_version(part: Literal["major", "minor", "patch"] = "patch") -> None:
13 | """Bump version in pyproject.toml file.
14 |
15 | Args:
16 | part (Literal["major", "minor", "patch"], optional): Version part to increment. Defaults to "patch".
17 |
18 | Raises:
19 | ValueError: If part is not 'major', 'minor', or 'patch'.
20 | """
21 | file_path = "pyproject.toml"
22 |
23 | with open(file_path, "r") as f:
24 | pyproject = toml.load(f)
25 |
26 | version = pyproject["project"]["version"]
27 | major, minor, patch = map(int, version.split("."))
28 |
29 | if part == "major":
30 | major += 1
31 | minor = 0
32 | patch = 0
33 | elif part == "minor":
34 | minor += 1
35 | patch = 0
36 | elif part == "patch":
37 | patch += 1
38 | else:
39 | raise ValueError("Invalid part value. Choose 'major', 'minor', or 'patch'.")
40 |
41 | new_version = f"{major}.{minor}.{patch}"
42 | subprocess.run(
43 | [
44 | "uvx",
45 | "--from=toml-cli",
46 | "toml",
47 | "set",
48 | "--toml-path=pyproject.toml",
49 | "project.version",
50 | new_version,
51 | ]
52 | )
53 |
54 | print(f"Version bumped to {major}.{minor}.{patch}")
55 |
56 |
57 | if __name__ == "__main__":
58 | bump_version()
59 |
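60 | # Example invocation (as the release workflow runs it; the inline script
61 | # metadata above lets uv supply the toml dependency):
62 | #     uv run version_bumper.py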
--------------------------------------------------------------------------------
/tests/test_set_two/nb_example.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "3fb05507",
6 | "metadata": {},
7 | "source": [
8 | "This is a markdown cell."
9 | ]
10 | },
11 | {
12 | "cell_type": "code",
13 | "execution_count": null,
14 | "id": "af374a24",
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "from pathlib import Path\n",
19 | "\n",
20 | "import pandas as pd\n",
21 | "\n",
22 | "df = pd.read_csv(Path(\"data/input.csv\"))"
23 | ]
24 | },
25 | {
26 | "cell_type": "code",
27 | "execution_count": null,
28 | "id": "dc383f23",
29 | "metadata": {},
30 | "outputs": [],
31 | "source": [
32 | "with open(Path(\"alternative_equation.tex\"), \"w\") as f:\n",
33 | " f.write(\n",
34 | " \"$${\\displaystyle {\\frac {\\partial f_{\\alpha }}{\\partial t}}+\\mathbf {v} _{\\alpha }\\cdot {\\frac {\\partial f_{\\alpha }}{\\partial \\mathbf {x} }}+{\\frac {q_{\\alpha }\\mathbf {E} }{m_{\\alpha }}}\\cdot {\\frac {\\partial f_{\\alpha }}{\\partial \\mathbf {v} }}=0,}$$\"\n",
35 | " )\n",
36 | "f.close()"
37 | ]
38 | }
39 | ],
40 | "metadata": {
41 | "kernelspec": {
42 | "display_name": ".venv",
43 | "language": "python",
44 | "name": "python3"
45 | },
46 | "language_info": {
47 | "codemirror_mode": {
48 | "name": "ipython",
49 | "version": 3
50 | },
51 | "file_extension": ".py",
52 | "mimetype": "text/x-python",
53 | "name": "python",
54 | "nbconvert_exporter": "python",
55 | "pygments_lexer": "ipython3",
56 | "version": "3.12.0"
57 | }
58 | },
59 | "nbformat": 4,
60 | "nbformat_minor": 5
61 | }
62 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "smartrappy"
3 | version = "0.0.8"
4 | description = "Smart reproducible analytical pipeline visualisation."
5 | authors = [{name="Arthur Turrell", email="anon@anon.gmail.com"}]
6 | readme = "README.md"
7 | license = "MIT"
8 | classifiers = [
9 | "Development Status :: 2 - Pre-Alpha",
10 | "Programming Language :: Python :: 3.11",
11 | "Programming Language :: Python :: 3.12",
12 | "Programming Language :: Python :: 3.10",
13 | ]
14 | requires-python = ">=3.10"
15 | dependencies = [
16 | "click>=8.1.8",
17 | "graphviz>=0.20.3",
18 | "matplotlib>=3.10.0",
19 | "pandas>=2.2.3",
20 | "rich>=13.9.4",
21 | ]
22 |
23 | [dependency-groups]
24 | dev = [
25 | "autopep8>=2.3.1",
26 | "coverage[toml]>=7.6.9",
27 | "jupyter>=1.1.1",
28 | "nbstripout>=0.8.1",
29 | "nox>=2024.10.9",
30 | "pre-commit>=4.0.1",
31 | "pre-commit-hooks>=5.0.0",
32 | "pygments>=2.18.0",
33 | "pytest>=8.3.4",
34 | "quartodoc>=0.9.1",
35 | "ruff>=0.8.3",
36 | "toml>=0.10.2",
37 | "typing-extensions>=4.12.2",
38 | "xdoctest[colors]>=1.2.0",
39 | "ipykernel>=6.29.5",
40 | "pydoclint>=0.6.0",
41 | "typeguard>=4.4.2",
42 | "pyodbc>=5.2.0",
43 | "sqlalchemy>=2.0.40",
44 | ]
45 |
46 | [project.scripts]
47 | smartrappy = "smartrappy.__main__:main"
48 |
49 | [tool.uv]
50 | package = true
51 |
52 | [tool.mypy]
53 | strict = false
54 | pretty = true
55 | show_column_numbers = true
56 | show_error_codes = true
57 | show_error_context = true
58 | ignore_missing_imports = true
59 | disallow_untyped_calls = false
60 |
61 | [tool.pydoclint]
62 | style = 'google'
63 | exclude = ["noxfile.py", "tests/", "docs/"]
64 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | repos:
2 | - repo: https://github.com/astral-sh/ruff-pre-commit
3 | # Ruff version.
4 | rev: v0.11.6
5 | hooks:
6 | # Run the linter.
7 | - id: ruff
8 | types_or: [python, pyi, jupyter]
9 | args: [ --fix ]
10 | - id: ruff
11 | types_or: [python, pyi, jupyter]
12 | name: sort imports with ruff
13 | args: [--select, I, --fix]
14 | # Run the formatter.
15 | - id: ruff-format
16 | types_or: [python, pyi, jupyter]
17 | - repo: local
18 | hooks:
19 | - id: check-added-large-files
20 | name: Check for added large files
21 | entry: check-added-large-files
22 | language: system
23 | - id: check-toml
24 | name: Check Toml
25 | entry: check-toml
26 | language: system
27 | types: [toml]
28 | - id: check-yaml
29 | exclude: docs/
30 | name: Check Yaml
31 | entry: check-yaml
32 | language: system
33 | types: [yaml]
34 | - id: end-of-file-fixer
35 | exclude: docs/
36 | name: Fix End of Files
37 | entry: end-of-file-fixer
38 | language: system
39 | types: [text]
40 | stages: [pre-commit, pre-push, manual]
41 | - id: trailing-whitespace
42 | exclude: docs/
43 | name: Trim Trailing Whitespace
44 | entry: trailing-whitespace-fixer
45 | language: system
46 | types: [text]
47 | stages: [pre-commit, pre-push, manual]
48 | - repo: https://github.com/kynan/nbstripout
49 | rev: 0.4.0
50 | hooks:
51 | - id: nbstripout
52 | name: nbstripout
53 | description: "nbstripout: strip output from Jupyter and IPython notebooks"
54 | entry: nbstripout
55 | language: python
56 | types: [jupyter]
57 | - repo: https://github.com/jsh9/pydoclint
58 | rev: 0.6.0
59 | hooks:
60 | - id: pydoclint
61 | args: [--style=google, --config=pyproject.toml, src/]
62 |
--------------------------------------------------------------------------------
/.github/labels.yml:
--------------------------------------------------------------------------------
1 | ---
2 | # Labels names are important as they are used by Release Drafter to decide
3 | # regarding where to record them in changelog or if to skip them.
4 | #
5 | # The repository labels will be automatically configured using this file and
6 | # the GitHub Action https://github.com/marketplace/actions/github-labeler.
7 | - name: breaking
8 | description: Breaking Changes
9 | color: bfd4f2
10 | - name: bug
11 | description: Something isn't working
12 | color: d73a4a
13 | - name: build
14 | description: Build System and Dependencies
15 | color: bfdadc
16 | - name: ci
17 | description: Continuous Integration
18 | color: 4a97d6
19 | - name: dependencies
20 | description: Pull requests that update a dependency file
21 | color: 0366d6
22 | - name: documentation
23 | description: Improvements or additions to documentation
24 | color: 0075ca
25 | - name: duplicate
26 | description: This issue or pull request already exists
27 | color: cfd3d7
28 | - name: enhancement
29 | description: New feature or request
30 | color: a2eeef
31 | - name: github_actions
32 | description: Pull requests that update Github_actions code
33 | color: "000000"
34 | - name: good first issue
35 | description: Good for newcomers
36 | color: 7057ff
37 | - name: help wanted
38 | description: Extra attention is needed
39 | color: 008672
40 | - name: invalid
41 | description: This doesn't seem right
42 | color: e4e669
43 | - name: performance
44 | description: Performance
45 | color: "016175"
46 | - name: python
47 | description: Pull requests that update Python code
48 | color: 2b67c6
49 | - name: question
50 | description: Further information is requested
51 | color: d876e3
52 | - name: refactoring
53 | description: Refactoring
54 | color: ef67c4
55 | - name: removal
56 | description: Removals and Deprecations
57 | color: 9ae7ea
58 | - name: style
59 | description: Style
60 | color: c120e5
61 | - name: testing
62 | description: Testing
63 | color: b1fc6f
64 | - name: wontfix
65 | description: This will not be worked on
66 | color: ffffff
67 |
--------------------------------------------------------------------------------
/docs/logo.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/test_all.py:
--------------------------------------------------------------------------------
1 | """Tests for the refactored smartrappy architecture."""
2 |
3 | import tempfile
4 | from pathlib import Path
5 |
6 | from smartrappy import analyse_project
7 | from smartrappy.reporters import (
8 | ConsoleReporter,
9 | GraphvizReporter,
10 | JsonReporter,
11 | MermaidReporter,
12 | )
13 |
14 |
15 | def test_analyse_project():
16 | """Test that analyse_project works with the test directories."""
17 | # Analyse the test set
18 | model = analyse_project("tests/test_set_one")
19 |
20 | # Check that the model contains expected data
21 | assert len(model.nodes) > 0
22 | assert len(model.edges) > 0
23 | assert "data.csv" in model.file_operations
24 |
25 | # Test with a different directory
26 | model2 = analyse_project("tests/test_set_two")
27 | assert len(model2.nodes) > 0
28 | assert "data/input.csv" in model2.file_operations
29 |
30 |
31 | def test_reporters():
32 | """Test that all reporters can generate output."""
33 | # Analyse a test set
34 | model = analyse_project("tests/test_set_one")
35 |
36 | with tempfile.TemporaryDirectory() as tmpdir:
37 | # Test console reporter
38 | console_reporter = ConsoleReporter()
39 | console_reporter.generate_report(model) # No output file needed
40 |
41 | # Test graphviz reporter
42 | graphviz_output = Path(tmpdir) / "graphviz_test"
43 | graphviz_reporter = GraphvizReporter()
44 | graphviz_reporter.generate_report(model, str(graphviz_output))
45 | assert (graphviz_output.with_suffix(".pdf")).exists()
46 |
47 | # Test mermaid reporter
48 | mermaid_output = Path(tmpdir) / "mermaid_test.md"
49 | mermaid_reporter = MermaidReporter()
50 | mermaid_reporter.generate_report(model, str(mermaid_output))
51 | assert mermaid_output.exists()
52 |
53 | # Test JSON reporter with console output
54 | json_reporter = JsonReporter()
55 | json_reporter.generate_report(model) # Should print to console
56 |
57 | # Test JSON reporter with file output
58 | json_output = Path(tmpdir) / "json_test.json"
59 | json_reporter.generate_report(model, str(json_output))
60 | assert json_output.exists()
61 |
62 |
63 | if __name__ == "__main__":
64 | # Simple manual test
65 | test_analyse_project()
66 | test_reporters()
67 | print("All tests passed!")
68 |
--------------------------------------------------------------------------------
/create_readme.py:
--------------------------------------------------------------------------------
1 | import re
2 | from pathlib import Path
3 |
4 | import nbformat
5 |
6 |
7 | def convert_notebook_to_markdown(
8 | notebook_path: Path, output_path: Path, num_cells: int = 5
9 | ) -> None:
10 | """Converts a Jupyter notebook to a markdown file, including only the first N cells.
11 |
12 | This function reads a Jupyter notebook, extracts a specified number of cells (default 5),
13 | and converts them to markdown format. Markdown cells are preserved as-is, while code cells
14 | are wrapped in Python code blocks.
15 | Args:
16 | notebook_path (Path): Path to the input Jupyter notebook file (.ipynb)
17 | output_path (Path): Path where the output markdown file will be saved
18 | num_cells (int, optional): Number of cells to include from the start of the notebook. Defaults to 5.
19 |
20 | Returns:
21 | None: The function prints a confirmation message but does not return any value
22 |
23 | Notes:
24 | - The function creates the output directory if it doesn't exist
25 | - Code cells are wrapped in ```python blocks
26 | - Non-markdown and non-code cells are replaced with HTML comments
27 | """
28 | # Load the notebook
29 | with open(notebook_path, "r", encoding="utf-8") as f:
30 | notebook = nbformat.read(f, as_version=4)
31 |
32 | # Get the first `num_cells` cells
33 | cells = notebook.cells[:num_cells]
34 |
35 | # Convert cells to markdown text
36 | md_lines = []
37 | for cell in cells:
38 | if cell.cell_type == "markdown":
39 | md_lines.append(cell.source)
40 | elif cell.cell_type == "code":
41 | md_lines.append("```python\n" + cell.source + "\n```")
42 | else:
43 | md_lines.append(f"<!-- {cell.cell_type} cell -->")
44 |
45 | # Join the lines
46 | markdown_text = "\n\n".join(md_lines)
47 |
48 | # Strip extraneous.
49 | # Remove special frontmatter
50 | markdown_text = markdown_text.replace("---\nexecute:\n echo: false\n---\n", "")
51 |
52 | # Remove width specifications
53 | markdown_text = re.sub(r"{width=\d+%}", "", markdown_text)
54 |
55 | # Remove leading whitespace and newlines before first hash
56 | markdown_text = re.sub(r"^\s*(?=#)", "", markdown_text)
57 |
58 | # Replace logo.svg with docs/logo.svg
59 | markdown_text = markdown_text.replace("logo.svg", "docs/logo.svg")
60 |
61 | # Write to output markdown file
62 | with open(output_path, "w", encoding="utf-8") as f:
63 | f.write(markdown_text)
64 |
65 | print(f"Markdown saved to: {output_path}")
66 |
67 |
68 | if __name__ == "__main__":
69 | # Example usage
70 | convert_notebook_to_markdown(
71 | Path("docs/index.ipynb"), Path("README.md"), num_cells=3
72 | )
73 |
--------------------------------------------------------------------------------
/.github/workflows/release.yml:
--------------------------------------------------------------------------------
1 |
2 | name: Release
3 |
4 | permissions:
5 | contents: write
6 | pages: write
7 | id-token: write
8 |
9 | on:
10 | push:
11 | branches:
12 | - main
13 | - master
14 |
15 | jobs:
16 | release:
17 | name: Release
18 | runs-on: ubuntu-latest
19 | environment: pypi
20 | steps:
21 | - name: Check out the repository
22 | uses: actions/checkout@v4
23 | with:
24 | fetch-depth: 2
25 |
26 | - name: Set up Python
27 | uses: actions/setup-python@v5.4.0
28 | with:
29 | python-version: "3.10"
30 |
31 | - name: Install uv
32 | uses: astral-sh/setup-uv@v5
33 | with:
34 | # Install a specific version of uv.
35 | version: "0.5.2"
36 |
37 | - name: Check if there is a parent commit
38 | id: check-parent-commit
39 | run: |
40 | echo "::set-output name=sha::$(git rev-parse --verify --quiet HEAD^)"
41 |
42 | - name: Detect and tag new version
43 | id: check-version
44 | if: steps.check-parent-commit.outputs.sha
45 | uses: salsify/action-detect-and-tag-new-version@v2.0.3
46 | with:
47 | version-command: |
48 | uvx --from=toml-cli toml get --toml-path=pyproject.toml project.version
49 |
50 | - name: Bump version for developmental release
51 | if: "! steps.check-version.outputs.tag"
52 | run: |
53 | uv run version_bumper.py &&
54 | version=$(uvx --from=toml-cli toml get --toml-path=pyproject.toml project.version) &&
55 | uvx --from=toml-cli toml set --toml-path=pyproject.toml project.version $version.dev$(date +%s)
56 |
57 | - name: Build package
58 | run: |
59 | uv build
60 |
61 | - name: Publish package on PyPI
62 | if: steps.check-version.outputs.tag
63 | uses: pypa/gh-action-pypi-publish@release/v1
64 |
65 | - name: Publish the release notes
66 | uses: release-drafter/release-drafter@v6.1.0
67 | with:
68 | publish: ${{ steps.check-version.outputs.tag != '' }}
69 | tag: ${{ steps.check-version.outputs.tag }}
70 | env:
71 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
72 |
73 | - name: Install dependencies
74 | run: |
75 | uv sync --group dev
76 | uv pip install -e .
77 |
78 | - name: Install Quarto
79 | uses: quarto-dev/quarto-actions/setup@v2
80 | with:
81 | version: 1.6.39
82 |
83 | - name: install graphviz
84 | uses: ts-graphviz/setup-graphviz@v2
85 |
86 | - name: Build autodocs
87 | run: uv run quartodoc build --config docs/_quarto.yml
88 |
89 | - name: Build docs
90 | run: cd docs; uv run quarto render --execute
91 |
92 | - name: git config
93 | run: |
94 | git config user.name "$(git log -n 1 --pretty=format:%an)" &&
95 | git config user.email "$(git log -n 1 --pretty=format:%ae)"
96 |
97 | - name: Publish
98 | if: steps.check-version.outputs.tag
99 | run: cd docs; uv run quarto publish gh-pages --no-render --no-browser
100 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # smartrappy
2 |
3 | Smart reproducible analytical pipeline inspection.
4 |
5 | ![](docs/logo.svg)
6 |
7 | [](https://pypi.org/project/smartrappy/)
8 | [](https://pypi.org/project/smartrappy/)
9 | [](https://pypi.org/project/smartrappy)
10 | [](https://opensource.org/licenses/MIT)
11 | [](https://aeturrell.github.io/smartrappy/)
12 | [](https://github.com/aeturrell/smartrappy/actions?workflow=Tests)
13 | [](https://codecov.io/gh/aeturrell/smartrappy)
14 | [](https://github.com/pre-commit/pre-commit)
15 | [](https://github.com/astral-sh/ruff)
16 | [](https://pepy.tech/project/smartrappy)
17 | [](https://github.com/aeturrell/smartrappy)
18 |
19 | 
20 | 
21 | 
22 |
23 |
24 |
25 | ## Introduction
26 |
27 | ### What does this package do?
28 |
29 | **smartrappy** analyses a Python project and infers the directed acyclic graph (DAG) of its code and data dependencies, including when any data were last refreshed and whether the data exist on disk at all. It is not perfect, and will miss a lot in complex projects, but for simple projects using, say, `pd.read_csv()`, it does a good job of inferring the steps. It can also infer reads from and writes to most databases. The inferred DAG is then visualised; there are several options for doing this, the default being a visualisation in the terminal.
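30 |
31 | For instance, given a two-line script like `tests/test_set_one/processing.py` from this repository's test fixtures, **smartrappy** infers that the script reads `data.csv` and writes `output.xlsx`:
32 |
33 | ```python
34 | import pandas as pd
35 |
36 | df = pd.read_csv("data.csv")
37 | df.to_excel("output.xlsx")
38 | ```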
30 |
31 | ### What is **smartrappy** for?
32 |
33 | **smartrappy** is designed to help you understand the dependencies in a project, especially in a context where there may be a lot of legacy code that resembles tangled spaghetti.
34 |
35 | ### Quickstart
36 |
37 | To use **smartrappy** as a command-line tool:
38 |
39 | ```bash
40 | smartrappy /path/to/your/project
41 | ```
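42 |
43 | The CLI also accepts options used elsewhere in these docs, such as `--format` (for example `--format=graphviz` or `--format=mermaid`) and `--internal`:
44 |
45 | ```bash
46 | smartrappy /path/to/your/project --internal --format=mermaid
47 | ```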
42 |
43 | Or to use it within a Python script:
44 |
45 | ```python
46 | from smartrappy import analyse_project
47 | from smartrappy.reporters import ConsoleReporter
48 |
49 |
50 | model = analyse_project("/path/to/your/project")
51 | reporter = ConsoleReporter()
52 | reporter.generate_report(model)
53 | ```
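54 |
55 | Other reporters are exported too (`JsonReporter`, `MermaidReporter`, `GraphvizReporter`), along with a `get_reporter` factory. Here is a short sketch of writing the model to a JSON file (the output filename is illustrative):
56 |
57 | ```python
58 | from smartrappy import analyse_project
59 | from smartrappy.reporters import get_reporter
60 |
61 | model = analyse_project("/path/to/your/project")
62 | reporter = get_reporter("json")
63 | reporter.generate_report(model, "project_model.json")
64 | ```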
54 |
55 | ### Installation
56 |
57 | To install **smartrappy**, you can use `pip install smartrappy` or `uv add smartrappy` if you are using [Astral's uv](https://docs.astral.sh/uv/). You can also use it as a standalone command-line tool with uv and the `uvx` command:
58 |
59 | ```bash
60 | uvx smartrappy path/to/your/project
61 | ```
62 |
63 | ### Documentation
64 |
65 | You can find the full documentation for **smartrappy** at [https://aeturrell.github.io/smartrappy/](https://aeturrell.github.io/smartrappy/).
66 |
--------------------------------------------------------------------------------
/tests/test_reporter_integration.py:
--------------------------------------------------------------------------------
1 | """Integration tests for reporters to improve coverage."""
2 |
3 | import json
4 | import os
5 | import tempfile
6 |
7 | from smartrappy.analyser import analyse_project
8 | from smartrappy.reporters import (
9 | ConsoleReporter,
10 | JsonReporter,
11 | MermaidReporter,
12 | get_reporter,
13 | )
14 |
15 |
16 | def test_console_reporter_with_real_project():
17 | """Test console reporter with actual project analysis."""
18 | # Use test_set_one for a simple project
19 | test_dir = "tests/test_set_one"
20 | if not os.path.exists(test_dir):
21 | # Skip if test directory doesn't exist
22 | return
23 |
24 | model = analyse_project(test_dir, internal_only=False)
25 | reporter = ConsoleReporter()
26 |
27 | # This should not raise an exception
28 | reporter.generate_report(model)
29 |
30 |
31 | def test_mermaid_reporter_with_real_project():
32 | """Test mermaid reporter with actual project analysis."""
33 | test_dir = "tests/test_set_one"
34 | if not os.path.exists(test_dir):
35 | return
36 |
37 | model = analyse_project(test_dir, internal_only=False)
38 |
39 | with tempfile.TemporaryDirectory() as tmpdir:
40 | output_path = os.path.join(tmpdir, "diagram.md")
41 | reporter = MermaidReporter()
42 | reporter.generate_report(model, output_path)
43 |
44 | # Verify the file was created and has content
45 | assert os.path.exists(output_path)
46 | with open(output_path, "r") as f:
47 | content = f.read()
48 | assert "```mermaid" in content
49 | assert "graph TD" in content
50 |
51 |
52 | def test_json_reporter_with_real_project():
53 | """Test JSON reporter with actual project analysis."""
54 | test_dir = "tests/test_set_one"
55 | if not os.path.exists(test_dir):
56 | return
57 |
58 | model = analyse_project(test_dir, internal_only=False)
59 |
60 | with tempfile.TemporaryDirectory() as tmpdir:
61 | output_path = os.path.join(tmpdir, "output.json")
62 | reporter = JsonReporter()
63 | reporter.generate_report(model, output_path)
64 |
65 | # Verify the file was created and is valid JSON
66 | assert os.path.exists(output_path)
67 | with open(output_path, "r") as f:
68 | data = json.load(f)
69 | assert "nodes" in data
70 | assert "edges" in data
71 |
72 |
73 | def test_json_reporter_internal_only():
74 | """Test JSON reporter with internal_only flag."""
75 | test_dir = "tests/test_set_one"
76 | if not os.path.exists(test_dir):
77 | return
78 |
79 | model = analyse_project(test_dir, internal_only=True)
80 |
81 | with tempfile.TemporaryDirectory() as tmpdir:
82 | output_path = os.path.join(tmpdir, "output.json")
83 | reporter = JsonReporter()
84 | reporter.generate_report(model, output_path)
85 |
86 | # Verify the file was created
87 | assert os.path.exists(output_path)
88 | with open(output_path, "r") as f:
89 | data = json.load(f)
90 | # Should have filtered nodes in internal-only mode
91 | assert "nodes" in data
92 |
93 |
94 | def test_get_reporter_factory():
95 | """Test the reporter factory function."""
96 | console = get_reporter("console")
97 | assert isinstance(console, ConsoleReporter)
98 |
99 | mermaid = get_reporter("mermaid")
100 | assert isinstance(mermaid, MermaidReporter)
101 |
102 | json_rep = get_reporter("json")
103 | assert isinstance(json_rep, JsonReporter)
104 |
--------------------------------------------------------------------------------
/docs/objects.json:
--------------------------------------------------------------------------------
1 | {"project": "smartrappy", "version": "0.0.9999", "count": 18, "items": [{"name": "smartrappy.analyse_project", "domain": "py", "role": "function", "priority": "1", "uri": "reference/analyse_project.html#smartrappy.analyse_project", "dispname": "-"}, {"name": "smartrappy.analyser.analyse_project", "domain": "py", "role": "function", "priority": "1", "uri": "reference/analyse_project.html#smartrappy.analyse_project", "dispname": "smartrappy.analyse_project"}, {"name": "smartrappy.ConsoleReporter.generate_report", "domain": "py", "role": "function", "priority": "1", "uri": "reference/ConsoleReporter.html#smartrappy.ConsoleReporter.generate_report", "dispname": "-"}, {"name": "smartrappy.reporters.ConsoleReporter.generate_report", "domain": "py", "role": "function", "priority": "1", "uri": "reference/ConsoleReporter.html#smartrappy.ConsoleReporter.generate_report", "dispname": "smartrappy.ConsoleReporter.generate_report"}, {"name": "smartrappy.ConsoleReporter", "domain": "py", "role": "class", "priority": "1", "uri": "reference/ConsoleReporter.html#smartrappy.ConsoleReporter", "dispname": "-"}, {"name": "smartrappy.reporters.ConsoleReporter", "domain": "py", "role": "class", "priority": "1", "uri": "reference/ConsoleReporter.html#smartrappy.ConsoleReporter", "dispname": "smartrappy.ConsoleReporter"}, {"name": "smartrappy.JsonReporter.generate_report", "domain": "py", "role": "function", "priority": "1", "uri": "reference/JsonReporter.html#smartrappy.JsonReporter.generate_report", "dispname": "-"}, {"name": "smartrappy.reporters.JsonReporter.generate_report", "domain": "py", "role": "function", "priority": "1", "uri": "reference/JsonReporter.html#smartrappy.JsonReporter.generate_report", "dispname": "smartrappy.JsonReporter.generate_report"}, {"name": "smartrappy.JsonReporter", "domain": "py", "role": "class", "priority": "1", "uri": "reference/JsonReporter.html#smartrappy.JsonReporter", "dispname": "-"}, {"name": "smartrappy.reporters.JsonReporter", "domain": "py", "role": "class", "priority": "1", "uri": "reference/JsonReporter.html#smartrappy.JsonReporter", "dispname": "smartrappy.JsonReporter"}, {"name": "smartrappy.MermaidReporter.generate_report", "domain": "py", "role": "function", "priority": "1", "uri": "reference/MermaidReporter.html#smartrappy.MermaidReporter.generate_report", "dispname": "-"}, {"name": "smartrappy.reporters.MermaidReporter.generate_report", "domain": "py", "role": "function", "priority": "1", "uri": "reference/MermaidReporter.html#smartrappy.MermaidReporter.generate_report", "dispname": "smartrappy.MermaidReporter.generate_report"}, {"name": "smartrappy.MermaidReporter", "domain": "py", "role": "class", "priority": "1", "uri": "reference/MermaidReporter.html#smartrappy.MermaidReporter", "dispname": "-"}, {"name": "smartrappy.reporters.MermaidReporter", "domain": "py", "role": "class", "priority": "1", "uri": "reference/MermaidReporter.html#smartrappy.MermaidReporter", "dispname": "smartrappy.MermaidReporter"}, {"name": "smartrappy.GraphvizReporter.generate_report", "domain": "py", "role": "function", "priority": "1", "uri": "reference/GraphvizReporter.html#smartrappy.GraphvizReporter.generate_report", "dispname": "-"}, {"name": "smartrappy.reporters.GraphvizReporter.generate_report", "domain": "py", "role": "function", "priority": "1", "uri": "reference/GraphvizReporter.html#smartrappy.GraphvizReporter.generate_report", "dispname": "smartrappy.GraphvizReporter.generate_report"}, {"name": "smartrappy.GraphvizReporter", "domain": "py", "role": "class", "priority": 
"1", "uri": "reference/GraphvizReporter.html#smartrappy.GraphvizReporter", "dispname": "-"}, {"name": "smartrappy.reporters.GraphvizReporter", "domain": "py", "role": "class", "priority": "1", "uri": "reference/GraphvizReporter.html#smartrappy.GraphvizReporter", "dispname": "smartrappy.GraphvizReporter"}]}
--------------------------------------------------------------------------------
/src/smartrappy/notebook_parser.py:
--------------------------------------------------------------------------------
1 | """Parser for Jupyter notebook files (.ipynb)."""
2 |
3 | import ast
4 | import json
5 | from typing import List, Set, Tuple
6 |
7 | from smartrappy.models import DatabaseInfo, FileInfo, ModuleImport
8 |
9 |
10 | def extract_notebook_cells(notebook_content: str) -> List[str]:
11 | """
12 | Extract Python code cells from a Jupyter notebook file.
13 |
14 | Args:
15 | notebook_content: The content of the .ipynb file as a string
16 |
17 | Returns:
18 | A list of Python code cell contents found in the notebook
19 | """
20 | try:
21 | # Parse the notebook JSON
22 | notebook = json.loads(notebook_content)
23 |
24 | # Extract code cells
25 | code_cells = []
26 |
27 | # Jupyter notebooks have a 'cells' key containing a list of cell objects
28 | cells = notebook.get("cells", [])
29 |
30 | for cell in cells:
31 | # Only process cells with type 'code'
32 | if cell.get("cell_type") == "code":
33 | # The source can be a string or a list of strings
34 | source = cell.get("source", [])
35 |
36 | # Convert to a single string
37 | if isinstance(source, list):
38 | cell_code = "".join(source)
39 | else:
40 | cell_code = source
41 |
42 | # Only add non-empty cells
43 | if cell_code.strip():
44 | code_cells.append(cell_code)
45 |
46 | return code_cells
47 |
48 | except (json.JSONDecodeError, KeyError) as e:
49 | print(f"Error parsing notebook JSON: {str(e)}")
50 | return []
51 |
52 |
53 | def analyse_notebook_file(
54 | file_path: str,
55 | project_modules: Set[str],
56 | FileOperationFinder,
57 | ModuleImportFinder,
58 | DatabaseOperationFinder,
59 | ) -> Tuple[List[FileInfo], List[ModuleImport], List[DatabaseInfo]]:
60 | """
61 | Analyse a Jupyter notebook file for Python code cells.
62 |
63 | Args:
64 | file_path: Path to the .ipynb file
65 | project_modules: Set of known project module names
66 | FileOperationFinder: Class to find file operations
67 | ModuleImportFinder: Class to find module imports
68 | DatabaseOperationFinder: Class to find database operations
69 |
70 | Returns:
71 | A tuple of (file_operations, imports, database_operations)
72 | """
73 | try:
74 | # Read the notebook file content
75 | with open(file_path, "r", encoding="utf-8") as f:
76 | notebook_content = f.read()
77 |
78 | # Extract Python code cells
79 | code_cells = extract_notebook_cells(notebook_content)
80 |
81 | # Initialize result lists
82 | all_file_ops = []
83 | all_imports = []
84 | all_db_ops = []
85 |
86 | # Process each code cell separately
87 | for i, cell_code in enumerate(code_cells):
88 | try:
89 | # Parse the cell as Python code
90 | tree = ast.parse(cell_code)
91 |
92 | # Find file operations
93 | file_finder = FileOperationFinder(file_path)
94 | file_finder.visit(tree)
95 | all_file_ops.extend(file_finder.file_operations)
96 |
97 | # Find imports
98 | import_finder = ModuleImportFinder(file_path, project_modules)
99 | import_finder.visit(tree)
100 | all_imports.extend(import_finder.imports)
101 |
102 | # Find database operations
103 | db_finder = DatabaseOperationFinder(file_path)
104 | db_finder.visit(tree)
105 | all_db_ops.extend(db_finder.database_operations)
106 |
107 | except SyntaxError as e:
108 | print(f"Syntax error in code cell {i + 1} of {file_path}: {str(e)}")
109 |
110 | return all_file_ops, all_imports, all_db_ops
111 |
112 | except (UnicodeDecodeError, IOError) as e:
113 | print(f"Error processing notebook file {file_path}: {str(e)}")
114 | return [], [], []
115 |
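116 | # A minimal usage sketch (illustrative only, not part of the package API):
117 | # print the code cells of a notebook passed on the command line, using
118 | # extract_notebook_cells above.
119 | if __name__ == "__main__":
120 |     import sys
121 |
122 |     with open(sys.argv[1], "r", encoding="utf-8") as f:
123 |         for i, cell in enumerate(extract_notebook_cells(f.read()), start=1):
124 |             print(f"--- code cell {i} ---")
125 |             print(cell)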
--------------------------------------------------------------------------------
/.github/workflows/tests.yml:
--------------------------------------------------------------------------------
1 |
2 | name: Tests
3 |
4 | on:
5 | - push
6 | - pull_request
7 |
8 | jobs:
9 | tests:
10 | name: ${{ matrix.session }} ${{ matrix.python-version }} / ${{ matrix.os }}
11 | runs-on: ${{ matrix.os }}
12 | strategy:
13 | fail-fast: false
14 | matrix:
15 | include:
16 | - { python-version: "3.10", os: ubuntu-latest, session: "pre-commit" }
17 | - { python-version: "3.10", os: ubuntu-latest, session: "tests" }
18 | - { python-version: "3.11", os: ubuntu-latest, session: "tests" }
19 | - { python-version: "3.12", os: ubuntu-latest, session: "tests" }
20 | - { python-version: "3.10", os: windows-latest, session: "tests" }
21 | - { python-version: "3.10", os: macos-latest, session: "tests" }
22 | - { python-version: "3.10", os: ubuntu-latest, session: "typeguard" }
23 | - { python-version: "3.10", os: ubuntu-latest, session: "xdoctest" }
24 |
25 | env:
26 | NOXSESSION: ${{ matrix.session }}
27 |
28 | steps:
29 | - name: Check out the repository
30 | uses: actions/checkout@v4
31 |
32 | - name: Set up Python ${{ matrix.python-version }}
33 | uses: actions/setup-python@v5.4.0
34 | with:
35 | python-version: ${{ matrix.python-version }}
36 |
37 | - name: Install uv
38 | uses: astral-sh/setup-uv@v5
39 | with:
40 | # Install a specific version of uv.
41 | version: "0.5.2"
42 |
43 | - name: install graphviz
44 | uses: ts-graphviz/setup-graphviz@v2
45 |
46 | - name: Compute pre-commit cache key
47 | if: matrix.session == 'pre-commit'
48 | id: pre-commit-cache
49 | shell: python
50 | run: |
51 | import hashlib
52 | import os
53 | import sys
54 |
55 | python = "py{}.{}".format(*sys.version_info[:2])
56 | payload = sys.version.encode() + sys.executable.encode()
57 | digest = hashlib.sha256(payload).hexdigest()
58 | result = "${{ runner.os }}-{}-{}-pre-commit".format(python, digest[:8])
59 | with open(os.environ["GITHUB_OUTPUT"], "a") as fh: fh.write("result={}\n".format(result))
60 |
61 | - name: Restore pre-commit cache
62 | uses: actions/cache@v4.2.0
63 | if: matrix.session == 'pre-commit'
64 | with:
65 | path: ~/.cache/pre-commit
66 | key: ${{ steps.pre-commit-cache.outputs.result }}-${{ hashFiles('.pre-commit-config.yaml') }}
67 | restore-keys: |
68 | ${{ steps.pre-commit-cache.outputs.result }}-
69 |
70 | - name: Run Nox
71 | run: |
72 | uv run nox --force-color --python=${{ matrix.python-version }}
73 |
74 | - name: Upload coverage data
75 | if: always() && matrix.session == 'tests'
76 | uses: "actions/upload-artifact@v4"
77 | with:
78 | name: coverage-data-${{ matrix.session }}-${{ matrix.python-version }}-${{ matrix.os }}
79 | path: ".coverage.*"
80 | if-no-files-found: ignore
81 | include-hidden-files: true
82 |
83 | coverage:
84 | runs-on: ubuntu-latest
85 | needs: tests
86 | steps:
87 | - name: Check out the repository
88 | uses: actions/checkout@v4
89 |
90 | - name: Set up Python 3.11
91 | uses: actions/setup-python@v5.4.0
92 | with:
93 | python-version: 3.11
94 |
95 | - name: Install uv
96 | uses: astral-sh/setup-uv@v5
97 | with:
98 | # Install a specific version of uv.
99 | version: "0.5.2"
100 |
101 | - name: Install dependencies
102 | run: |
103 | uv sync --group dev
104 |
105 | - name: Download coverage data
106 | uses: actions/download-artifact@v4
107 | with:
108 | pattern: coverage-data-*
109 | merge-multiple: true
110 |
111 | - name: Combine coverage data and display human readable report
112 | run: |
113 | uv run nox --force-color --session=coverage
114 |
115 | - name: Create coverage report
116 | run: |
117 | uv run nox --force-color --session=coverage -- xml
118 |
119 | - name: Upload coverage report
120 | uses: codecov/codecov-action@v5.3.1
121 | with:
122 | token: ${{ secrets.CODECOV_TOKEN }}
123 | slug: aeturrell/smartrappy
124 |
--------------------------------------------------------------------------------
/docs/output_options.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "---\n",
8 | "title: Other output options\n",
9 | "execute:\n",
10 | " echo: false\n",
11 | "---\n"
12 | ]
13 | },
14 | {
15 | "cell_type": "markdown",
16 | "metadata": {},
17 | "source": [
18 | "## Graphviz\n",
19 | "\n",
20 | "```bash\n",
21 | "uv run smartrappy . --internal --format=graphviz\n",
22 | "```\n"
23 | ]
24 | },
25 | {
26 | "cell_type": "markdown",
27 | "metadata": {},
28 | "source": [
29 | ""
30 | ]
31 | },
32 | {
33 | "cell_type": "markdown",
34 | "metadata": {},
35 | "source": [
36 | "## Mermaid\n",
37 | "\n",
38 | "```bash\n",
39 | "uv run smartrappy . --internal --format=mermaid\n",
40 | "```"
41 | ]
42 | },
43 | {
44 | "cell_type": "markdown",
45 | "metadata": {},
46 | "source": [
47 | "```{mermaid}\n",
48 | "graph TD\n",
49 | " %% Style definitions\n",
50 | " classDef scriptNode fill:#90EE90,stroke:#333,stroke-width:2px;\n",
51 | " classDef fileNode fill:#FFB6C1,stroke:#333,stroke-width:2px;\n",
52 | " classDef quartoNode fill:#00CED1,stroke:#333,stroke-width:2px;\n",
53 | " classDef notebookNode fill:#FFD700,stroke:#333,stroke-width:2px;\n",
54 | " classDef missingFile fill:#FFB6C1,stroke:#FF0000,stroke-width:3px,stroke-dasharray: 5 5;\n",
55 | " classDef internalModule fill:#ADD8E6,stroke:#333,stroke-width:2px;\n",
56 | " classDef externalModule fill:#FFA07A,stroke:#333,stroke-width:2px;\n",
57 | " classDef importedItem fill:#ADD8E6,stroke:#333,stroke-width:2px,shape:circle;\n",
58 | " classDef externalImportedItem fill:#FFA07A,stroke:#333,stroke-width:2px,shape:circle;\n",
59 | " classDef databaseNode fill:#B19CD9,stroke:#333,stroke-width:2px,shape:cylinder;\n",
60 | "\n",
61 | " %% Nodes\n",
62 | " quarto_document_13558783[\"slides.qmd\"]:::quartoNode\n",
63 | " data_file_14384327[\"output.png
File does not exist\"]:::missingFile\n",
64 | " data_file_7682433[\"equation.tex
File does not exist\"]:::missingFile\n",
65 | " data_file_5741772[\"alternative_equation.tex
File does not exist\"]:::missingFile\n",
66 | " jupyter_notebook_2961208[\"nb_example.ipynb\"]:::notebookNode\n",
67 | " data_file_1692456[\"data/input.csv
Modified: 2025-04-20 17:31:30\"]:::fileNode\n",
68 | " script_5034441[\"model_solver.py\"]:::scriptNode\n",
69 | " script_10331269[\"visualisation.py\"]:::scriptNode\n",
70 | " data_file_9084974[\"data/processed.csv
File does not exist\"]:::missingFile\n",
71 | " script_3375286[\"data_processing.py\"]:::scriptNode\n",
72 | " database_3520401[\"mydatabase
Type: mssql\"]:::databaseNode\n",
73 | " internal_module_3617581((\"data_processing:process_data\")):::importedItem\n",
74 | "\n",
75 | " %% Relationships\n",
76 | " data_file_14384327 --> quarto_document_13558783\n",
77 | " data_file_7682433 --> quarto_document_13558783\n",
78 | " data_file_5741772 --> quarto_document_13558783\n",
79 | " data_file_1692456 --> jupyter_notebook_2961208\n",
80 | " jupyter_notebook_2961208 --> data_file_5741772\n",
81 | " script_5034441 --> data_file_7682433\n",
82 | " script_10331269 --> data_file_14384327\n",
83 | " data_file_9084974 --> script_10331269\n",
84 | " script_3375286 --> data_file_9084974\n",
85 | " data_file_1692456 --> script_3375286\n",
86 | " database_3520401 --> script_3375286\n",
87 | " script_3375286 --> database_3520401\n",
88 | " internal_module_3617581 --> script_10331269\n",
89 | "```\n"
90 | ]
91 | }
92 | ],
93 | "metadata": {
94 | "kernelspec": {
95 | "display_name": ".venv",
96 | "language": "python",
97 | "name": "python3"
98 | },
99 | "language_info": {
100 | "codemirror_mode": {
101 | "name": "ipython",
102 | "version": 3
103 | },
104 | "file_extension": ".py",
105 | "mimetype": "text/x-python",
106 | "name": "python",
107 | "nbconvert_exporter": "python",
108 | "pygments_lexer": "ipython3",
109 | "version": "3.12.0"
110 | }
111 | },
112 | "nbformat": 4,
113 | "nbformat_minor": 4
114 | }
115 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | tests/test_output/**
2 | !tests/test_output/.gitkeep
3 |
4 | # Package related files
5 | /build/
6 | .mypy_cache/
7 | /.coverage
8 | /.coverage.*
9 | /.nox/
10 | /.python-version
11 | /.pytype/
12 | /dist/
13 | /docs/_build/
14 | /src/*.egg-info/
15 | __pycache__/
16 | settings.json
17 | .DS_store
18 | /docs/.quarto/
19 | /.quarto/
20 | .venv/
21 | /docs_old/
22 | docs/reference/*
23 | docs/_sidebar.yml
24 | /.luarc.json
25 | docs/_site/
26 |
27 | # Byte-compiled / optimized / DLL files
28 | __pycache__/
29 | *.py[cod]
30 | *$py.class
31 |
32 | # C extensions
33 | *.so
34 |
35 | # Distribution / packaging
36 | .Python
37 | build/
38 | develop-eggs/
39 | dist/
40 | downloads/
41 | eggs/
42 | .eggs/
43 | lib/
44 | lib64/
45 | parts/
46 | sdist/
47 | var/
48 | wheels/
49 | share/python-wheels/
50 | *.egg-info/
51 | .installed.cfg
52 | *.egg
53 | MANIFEST
54 |
55 | # PyInstaller
56 | # Usually these files are written by a python script from a template
57 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
58 | *.manifest
59 | *.spec
60 |
61 | # Installer logs
62 | pip-log.txt
63 | pip-delete-this-directory.txt
64 |
65 | # Unit test / coverage reports
66 | htmlcov/
67 | .tox/
68 | .nox/
69 | .coverage
70 | .coverage.*
71 | .cache
72 | nosetests.xml
73 | coverage.xml
74 | *.cover
75 | *.py,cover
76 | .hypothesis/
77 | .pytest_cache/
78 | cover/
79 |
80 | # Translations
81 | *.mo
82 | *.pot
83 |
84 | # Django stuff:
85 | *.log
86 | local_settings.py
87 | db.sqlite3
88 | db.sqlite3-journal
89 |
90 | # Flask stuff:
91 | instance/
92 | .webassets-cache
93 |
94 | # Scrapy stuff:
95 | .scrapy
96 |
97 | # Sphinx documentation
98 | docs/_build/
99 |
100 | # PyBuilder
101 | .pybuilder/
102 | target/
103 |
104 | # Jupyter Notebook
105 | .ipynb_checkpoints
106 |
107 | # IPython
108 | profile_default/
109 | ipython_config.py
110 |
111 | # pyenv
112 | # For a library or package, you might want to ignore these files since the code is
113 | # intended to run in multiple environments; otherwise, check them in:
114 | # .python-version
115 |
116 | # pipenv
117 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
118 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
119 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
120 | # install all needed dependencies.
121 | #Pipfile.lock
122 |
123 | # UV
124 | # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
125 | # This is especially recommended for binary packages to ensure reproducibility, and is more
126 | # commonly ignored for libraries.
127 | #uv.lock
128 |
129 | # poetry
130 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
131 | # This is especially recommended for binary packages to ensure reproducibility, and is more
132 | # commonly ignored for libraries.
133 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
134 | #poetry.lock
135 |
136 | # pdm
137 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
138 | #pdm.lock
139 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
140 | # in version control.
141 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
142 | .pdm.toml
143 | .pdm-python
144 | .pdm-build/
145 |
146 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
147 | __pypackages__/
148 |
149 | # Celery stuff
150 | celerybeat-schedule
151 | celerybeat.pid
152 |
153 | # SageMath parsed files
154 | *.sage.py
155 |
156 | # Environments
157 | .env
158 | .venv
159 | env/
160 | venv/
161 | ENV/
162 | env.bak/
163 | venv.bak/
164 |
165 | # Spyder project settings
166 | .spyderproject
167 | .spyproject
168 |
169 | # Rope project settings
170 | .ropeproject
171 |
172 | # mkdocs documentation
173 | /site
174 |
175 | # mypy
176 | .mypy_cache/
177 | .dmypy.json
178 | dmypy.json
179 |
180 | # Pyre type checker
181 | .pyre/
182 |
183 | # pytype static type analyser
184 | .pytype/
185 |
186 | # Cython debug symbols
187 | cython_debug/
188 |
189 | # PyCharm
190 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
191 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
192 | # and can be added to the global gitignore or merged into this file. For a more nuclear
193 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
194 | #.idea/
195 |
196 | # PyPI configuration file
197 | .pypirc
198 |
--------------------------------------------------------------------------------
/docs/contributing.qmd:
--------------------------------------------------------------------------------
1 | # Contributing
2 |
3 | Thank you for your interest in improving this project. This project is
4 | open-source under the [MIT license](https://opensource.org/licenses/MIT)
5 | and welcomes contributions in the form of bug reports, feature requests,
6 | and pull requests.
7 |
8 | Here is a list of important resources for contributors:
9 |
10 | - [Source Code](https://github.com/aeturrell/smartrappy)
11 | - [Documentation](https://aeturrell.github.io/smartrappy/)
12 | - [Issue Tracker](https://github.com/aeturrell/smartrappy/issues)
13 |
14 | ## How to report a bug
15 |
16 | Report bugs on the [Issue Tracker](https://github.com/aeturrell/smartrappy/issues).
17 |
18 | When filing an issue, make sure to answer these questions:
19 |
20 | - Which operating system and Python version are you using?
21 | - Which version of this project are you using?
22 | - What did you do?
23 | - What did you expect to see?
24 | - What did you see instead?
25 |
26 | The best way to get your bug fixed is to provide a test case, and/or
27 | steps to reproduce the issue.
28 |
29 | ## How to request a feature
30 |
31 | Request features on the [Issue Tracker](https://github.com/aeturrell/smartrappy/issues).
32 |
33 | ## How to set up your development environment
34 |
35 | You need Python and the following tools:
36 |
37 | - [uv](https://docs.astral.sh/uv/)
38 | - [Nox](https://nox.thea.codes/)
39 | - [Make](https://www.gnu.org/software/make/)
40 | - [Quarto](https://quarto.org/)
41 |
42 | Install the package with the existing development requirements:
43 |
44 | ```bash
45 | $ uv sync --frozen
46 | ```
47 |
48 | To also update packages, do not use the `--frozen` flag.
49 |
50 | To build the documentation locally, you will also need Make and Quarto from the tools listed above (these are non-Python dependencies).
51 | 
52 | You can build the docs locally to look at them with `make`, which runs one command to build the README and then another to build the website, which can then be found in `docs/_site/`. Use `make clean` to remove the existing README.
53 | 
54 | To publish new docs to GitHub Pages (where the documentation is displayed as web pages), run `make publish`; only devs with admin rights will be able to execute this.
55 |
56 | ## How to test the project
57 |
58 | Run the full test suite:
59 |
60 | ```bash
61 | $ uv run nox
62 | ```
63 |
64 | List the available Nox sessions:
65 |
66 | ```bash
67 | $ uv run nox --list-sessions
68 | ```
69 |
70 | You can also run a specific Nox session. For example, invoke the unit
71 | test suite like this:
72 |
73 | ```bash
74 | $ uv run nox --session=tests
75 | ```
76 |
77 | Unit tests are located in the `tests` directory, and are written using
78 | the [pytest](https://pytest.readthedocs.io/) testing framework.
79 |
80 | Prefix commands with `uv run`, as above, to ensure that the
81 | tests are run in the project's virtual environment.
82 |
83 | For the pre-commit checks, use
84 |
85 | ```bash
86 | $ uv run pre-commit run --all-files
87 | ```
88 |
89 | ## How to submit changes
90 |
91 | Open a [pull request](https://github.com/aeturrell/smartrappy/pulls) to
92 | submit changes to this project.
93 |
94 | Your pull request needs to meet the following guidelines for acceptance:
95 |
96 | - The Nox test suite must pass without errors or warnings.
97 | - Include unit tests. This project aims to maintain 96% code
98 | coverage.
99 | - If your changes add functionality, update the documentation
100 | accordingly.
101 | - Run `make` to generate the new documentation.
102 | - Run the pre-commit suite before committing.
103 |
104 | Feel free to submit early, though---we can always iterate on this.
105 |
106 | To have linting and code formatting checks run automatically before each
107 | commit, install the pre-commit hooks with the following
108 | command:
109 |
110 | ```bash
111 | $ uv run nox --session=pre-commit -- install
112 | ```
113 |
114 | It is recommended to open an issue before starting work on anything.
115 | This will allow a chance to talk it over with the owners and validate
116 | your approach.
117 |
118 | ## How to create a package release
119 |
120 | - Open a new branch with the version name
121 |
122 | - Change the version in pyproject.toml (you can run `uv run version_bumper.py`, which has script-level dependencies)
123 |
124 | - Commit the change with a new version label as the commit message (checking the tests pass)
125 |
126 | - Head to GitHub and merge into main (again, once the CI passes)
127 |
128 | - Confirm the release draft on GitHub
129 |
130 | - The automatic release GitHub Action will push to PyPI.
131 |
132 | If you ever need distributable files, you can use the `uv build` command locally.
133 |
134 | ## How to build the documentation manually and locally
135 |
136 | You shouldn't need to publish the documentation manually because there's a GitHub Action that covers it automatically whenever there's a new release. But to upload the documentation by hand, it's:
137 |
138 | - Run `make` to build the documentation
139 | - Run `make publish` to publish the documentation
140 |
--------------------------------------------------------------------------------
/noxfile.py:
--------------------------------------------------------------------------------
1 | """Nox sessions."""
2 |
3 | from pathlib import Path
4 | from textwrap import dedent
5 |
6 | import nox
7 |
8 | package = "smartrappy"
9 | python_versions = ["3.10", "3.11", "3.12"]
10 | nox.needs_version = ">= 2021.6.6"
11 | nox.options.default_venv_backend = "uv"
12 | nox.options.sessions = (
13 | "pre-commit",
14 | "tests",
15 | "typeguard",
16 | "xdoctest",
17 | )
18 |
19 |
20 | def activate_virtualenv_in_precommit_hooks(session: nox.Session) -> None:
21 | """Activate virtualenv in hooks installed by pre-commit.
22 |
23 | This function patches git hooks installed by pre-commit to activate the
24 | session's virtual environment. This allows pre-commit to locate hooks in
25 | that environment when invoked from git.
26 |
27 | Args:
28 | session: The Session object.
29 | """
30 | if session.bin is None:
31 | return
32 |
33 | virtualenv = session.env.get("VIRTUAL_ENV")
34 | if virtualenv is None:
35 | return
36 |
37 | hookdir = Path(".git") / "hooks"
38 | if not hookdir.is_dir():
39 | return
40 |
41 | for hook in hookdir.iterdir():
42 | if hook.name.endswith(".sample") or not hook.is_file():
43 | continue
44 |
45 | text = hook.read_text()
46 | bindir = repr(session.bin)[1:-1] # strip quotes
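47 |         # Skip hooks that do not reference this session's bindir; the
48 |         # Path("A") == Path("a") test detects platforms with case-insensitive
49 |         # paths (e.g. Windows), where the match is made case-insensitively.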
47 | if not (
48 | Path("A") == Path("a") and bindir.lower() in text.lower() or bindir in text
49 | ):
50 | continue
51 |
52 | lines = text.splitlines()
53 | if not (lines[0].startswith("#!") and "python" in lines[0].lower()):
54 | continue
55 |
56 | header = dedent(
57 | f"""\
58 | import os
59 | os.environ["VIRTUAL_ENV"] = {virtualenv!r}
60 | os.environ["PATH"] = os.pathsep.join((
61 | {session.bin!r},
62 | os.environ.get("PATH", ""),
63 | ))
64 | """
65 | )
66 |
67 | lines.insert(1, header)
68 | hook.write_text("\n".join(lines))
69 |
70 |
71 | @nox.session(python=python_versions)
72 | def tests(session: nox.Session) -> None:
73 | """Run the test suite."""
74 | session.run_install(
75 | "uv",
76 | "sync",
77 | "--group",
78 | "dev",
79 | env={"UV_PROJECT_ENVIRONMENT": session.virtualenv.location},
80 | )
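81 |     # Point PYTHONPATH at the src/ layout so the test run imports the local package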
81 | session.env["PYTHONPATH"] = "src"
82 |
83 | try:
84 | session.run(
85 | "coverage",
86 | "run",
87 | "--parallel",
88 | "-m",
89 | "pytest",
90 | "--cache-clear",
91 | external=True,
92 | env={"UV_PROJECT_ENVIRONMENT": session.virtualenv.location},
93 | *session.posargs,
94 | )
95 | finally:
96 | if session.interactive:
97 | session.notify("coverage", posargs=[])
98 |
99 |
100 | @nox.session(python=python_versions[0])
101 | def coverage(session: nox.Session) -> None:
102 | """Produce the coverage report."""
103 | args = session.posargs or ["report"]
104 | session.run(
105 | "uv",
106 | "pip",
107 | "install",
108 | "coverage[toml]",
109 | env={"UV_PROJECT_ENVIRONMENT": session.virtualenv.location},
110 | external=True,
111 | )
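112 |     # Combine the parallel coverage files written by the test sessions,
113 |     # clearing any stale combined data file first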
112 | if not session.posargs and any(Path().glob(".coverage.*")):
113 | session.run("coverage", "erase", "--data-file=.coverage")
114 | session.run("coverage", "combine")
115 |
116 | session.run("coverage", *args, "-i")
117 |
118 |
119 | @nox.session(name="pre-commit", python=python_versions[0], venv_backend="uv")
120 | def precommit(session: nox.Session) -> None:
121 | """Lint using pre-commit."""
122 | args = session.posargs or ["run", "--all-files", "--show-diff-on-failure"]
123 | session.run_install(
124 | "uv",
125 | "sync",
126 | "--extra=dev",
127 | env={"UV_PROJECT_ENVIRONMENT": session.virtualenv.location},
128 | )
129 | session.run("pre-commit", *args)
130 | if args and args[0] == "install":
131 | activate_virtualenv_in_precommit_hooks(session)
132 |
133 |
134 | @nox.session(venv_backend="uv", python=python_versions)
135 | def typeguard(session: nox.Session) -> None:
136 | """Runtime type checking using Typeguard."""
137 | # Install project and dependencies using uv
138 | session.run_install(
139 | "uv",
140 | "sync",
141 | "--extra=dev",
142 | env={"UV_PROJECT_ENVIRONMENT": session.virtualenv.location},
143 | )
144 | session.run_install("uv", "pip", "install", "-e", ".")
145 | session.run("pytest", f"--typeguard-packages={package}", *session.posargs)
146 |
147 |
148 | @nox.session(venv_backend="uv", python=python_versions)
149 | def xdoctest(session: nox.Session) -> None:
150 | """Run examples with xdoctest."""
151 | args = session.posargs or ["all"]
152 |
153 | # Install project and dependencies using uv
154 | session.run_install(
155 | "uv",
156 | "sync",
157 | "--extra=dev",
158 | env={"UV_PROJECT_ENVIRONMENT": session.virtualenv.location},
159 | )
160 | session.run_install("uv", "pip", "install", "-e", ".")
161 | session.run("python", "-m", "xdoctest", package, *args)
162 |
--------------------------------------------------------------------------------
/src/smartrappy/__main__.py:
--------------------------------------------------------------------------------
1 | """Command-line interface for smartrappy."""
2 |
3 | import os
4 | import sys
5 | from datetime import datetime
6 |
7 | import click
8 |
9 | from smartrappy import __version__
10 | from smartrappy.analyser import analyse_project
11 | from smartrappy.reporters import get_reporter
12 |
13 |
14 | def validate_repo_path(ctx, param, value):
15 | """Validate that the input path exists and is a directory."""
16 | if not os.path.exists(value):
17 | raise click.BadParameter(f"Path does not exist: {value}")
18 | if not os.path.isdir(value):
19 | raise click.BadParameter(f"Path is not a directory: {value}")
20 | return value
21 |
22 |
23 | def validate_output_path(ctx, param, value):
24 | """Validate that the output path is writable."""
25 | if value is None:
26 | return None
27 |
28 | try:
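29 |         # A bare filename has no directory part, so fall back to the CWD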
29 | directory = os.path.dirname(value) or "."
30 | if not os.path.exists(directory):
31 | os.makedirs(directory)
32 | # Check if we can write to this location
33 | test_file = f"{value}_test"
34 | with open(test_file, "w") as f:
35 | f.write("")
36 | os.remove(test_file)
37 | return value
38 | except (OSError, IOError) as e:
39 | raise click.BadParameter(f"Cannot write to output location: {value}\n{str(e)}")
40 |
41 |
42 | @click.command(context_settings=dict(help_option_names=["-h", "--help"]))
43 | @click.argument(
44 | "repo_path",
45 | callback=validate_repo_path,
46 | type=click.Path(exists=True, file_okay=False, dir_okay=True),
47 | )
48 | @click.option(
49 | "-o",
50 | "--output",
51 | callback=validate_output_path,
52 | help="Output path for the analysis files (without extension)",
53 | type=click.Path(dir_okay=False),
54 | )
55 | @click.option(
56 | "-f",
57 | "--format",
58 | "format_type",
59 | type=click.Choice(["console", "graphviz", "mermaid", "json"], case_sensitive=False),
60 | default="console",
61 | help="Output format for the analysis (default: console)",
62 | )
63 | @click.option(
64 | "--all-formats",
65 | is_flag=True,
66 | help="Generate all output formats",
67 | )
68 | @click.option(
69 | "--internal",
70 | is_flag=True,
71 | help="Only include internal modules in the visualisation (exclude external packages)",
72 | )
73 | @click.version_option(version=__version__, prog_name="smartrappy")
74 | def main(repo_path, output, format_type, all_formats, internal):
75 | """Smart reproducible analytical pipeline execution analyser.
76 |
77 | Analyses Python projects to create a visual representation of file operations
78 | and module dependencies.
79 |
80 | Examples:
81 |
82 | \b
83 | # Analyse current directory with default console output
84 | smartrappy .
85 |
86 | \b
87 | # Analyse specific project with graphviz output
88 |     smartrappy /path/to/project --format graphviz --output /path/to/output/analysis
89 |
90 | \b
91 | # Generate all output formats
92 | smartrappy /path/to/project --all-formats --output /path/to/output/analysis
93 |
94 | \b
95 | # Show only internal module dependencies
96 | smartrappy /path/to/project --internal
97 | """
98 | try:
99 | # Analyse the project
100 | click.echo(f"Analysing project at: {repo_path}")
101 | model = analyse_project(repo_path, internal_only=internal)
102 |
103 | # Generate reports
104 | formats_to_generate = (
105 | ["console", "graphviz", "mermaid", "json"] if all_formats else [format_type]
106 | )
107 |
108 | for fmt in formats_to_generate:
109 | try:
110 | reporter = get_reporter(fmt)
111 |
112 | # Handle output paths based on format
113 | fmt_output = None
114 |
115 | # Only use output path for formats that need files
116 | if fmt in ["graphviz", "mermaid"]:
117 | # Generate default output path if none provided
118 | if output is None:
119 | timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
120 | base_output = f"smartrappy_analysis_{timestamp}"
121 | else:
122 | base_output = output
123 |
124 | # Append format type to output path when generating multiple formats
125 | fmt_output = (
126 | f"{base_output}_{fmt}"
127 | if len(formats_to_generate) > 1
128 | else base_output
129 | )
130 |
131 | # For JSON, only use output path if explicitly provided by the user
132 | elif fmt == "json" and output is not None:
133 | fmt_output = (
134 | f"{output}_{fmt}" if len(formats_to_generate) > 1 else output
135 | )
136 |
137 | reporter.generate_report(model, fmt_output)
138 | except Exception as e:
139 | click.secho(
140 | f"Error generating {fmt} report: {str(e)}", fg="yellow", err=True
141 | )
142 |
143 | except Exception as e:
144 | click.secho(f"Error during analysis: {str(e)}", fg="red", err=True)
145 | sys.exit(1)
146 |
147 |
148 | if __name__ == "__main__":
149 | main(prog_name="smartrappy") # pragma: no cover
150 |
--------------------------------------------------------------------------------
/tests/test_qmd_parser.py:
--------------------------------------------------------------------------------
1 | """Tests for QMD parsing functionality."""
2 |
3 | from smartrappy.qmd_parser import extract_markdown_resources, extract_python_chunks
4 |
5 |
6 | def test_extract_python_chunks():
7 | """Test that Python chunks are extracted correctly from QMD files."""
8 | # Sample QMD content with Python chunks
9 | qmd_content = """# Test QMD File
10 |
11 | This is a test QMD file with Python chunks.
12 |
13 | ```{python}
14 | import pandas as pd
15 | df = pd.read_csv("data.csv")
16 | ```
17 |
18 | Some markdown text between chunks.
19 |
20 | ```{python}
21 | df.to_excel("output.xlsx")
22 | ```
23 |
24 | ```{r}
25 | # This is an R chunk that should be ignored
26 | print("Hello from R")
27 | ```
28 |
29 | ```{python}
30 | import matplotlib.pyplot as plt
31 | plt.plot(df["x"], df["y"])
32 | plt.savefig("plot.png")
33 | ```
34 | """
35 |
36 | # Extract Python chunks
37 | chunks = extract_python_chunks(qmd_content)
38 |
39 | # Check that we found the right number of chunks
40 | assert len(chunks) == 3
41 |
42 | # Check that the chunks have the right content
43 | assert "import pandas as pd" in chunks[0]
44 | assert "df.to_excel(" in chunks[1]
45 | assert "import matplotlib.pyplot" in chunks[2]
46 |
47 | # Check that the R chunk was ignored
48 | for chunk in chunks:
49 | assert "Hello from R" not in chunk
50 |
51 |
52 | def test_empty_qmd_file():
53 | """Test handling of QMD files with no Python chunks."""
54 | qmd_content = """# Empty QMD File
55 |
56 | This QMD file has no Python chunks.
57 |
58 | ```{r}
59 | print("Hello from R")
60 | ```
61 | """
62 | chunks = extract_python_chunks(qmd_content)
63 | assert len(chunks) == 0
64 |
65 |
66 | def test_malformed_chunks():
67 | """Test handling of malformed Python chunks."""
68 | qmd_content = """# Malformed QMD File
69 |
70 | ```{python
71 | # Missing closing brace
72 | x = 1
73 | ```
74 |
75 | ```{python}
76 | # This one is fine
77 | y = 2
78 | ```
79 | """
80 |     # The malformed chunk (missing closing brace) should not come through as a separate chunk
81 | chunks = extract_python_chunks(qmd_content)
82 | assert len(chunks) == 1
83 | assert "y = 2" in chunks[0]
84 |
85 |
86 | def test_with_metadata():
87 | """Test handling of Python chunks with metadata."""
88 | qmd_content = """# QMD with metadata
89 |
90 | ```{python echo=false, eval=true}
91 | import pandas as pd
92 | df = pd.read_csv("data.csv")
93 | ```
94 | """
95 | chunks = extract_python_chunks(qmd_content)
96 | assert len(chunks) == 1
97 | assert "import pandas as pd" in chunks[0]
98 |
99 |
100 | def test_with_actual_file(tmp_path):
101 | """Test extraction from an actual file."""
102 | # Create a temporary QMD file
103 | qmd_file = tmp_path / "test.qmd"
104 | qmd_content = """# Test QMD File
105 |
106 | ```{python}
107 | import pandas as pd
108 | df = pd.read_csv("data.csv")
109 | df.to_excel("output.xlsx")
110 | ```
111 |
112 | ```{python}
113 | import matplotlib.pyplot as plt
114 | plt.savefig("plot.png")
115 | ```
116 | """
117 | qmd_file.write_text(qmd_content)
118 |
119 | # Extract chunks from the file
120 | with open(qmd_file, "r") as f:
121 | chunks = extract_python_chunks(f.read())
122 |
123 | assert len(chunks) == 2
124 | assert "import pandas as pd" in chunks[0]
125 | assert "import matplotlib.pyplot as plt" in chunks[1]
126 |
127 |
128 | def test_extract_markdown_resources():
129 | """Test that markdown resources are extracted correctly from QMD files."""
130 | # Sample QMD content with both image references and include directives
131 | qmd_content = """# Test QMD File
132 |
133 | This is a test QMD file with markdown image references and includes.
134 |
135 | 
136 |
137 | Some text between resources.
138 |
139 | {{< include /outputs/equation.tex >}}
140 |
141 | 
142 |
143 | {{< include "/outputs/table.html" >}}
144 |
145 | 
146 |
147 | {{< include 'outputs/data.csv' >}}
148 |
149 | 
150 | """
151 |
152 | # Extract markdown resources
153 | resources = extract_markdown_resources(qmd_content)
154 |
155 | # Check that we found the right resources (excluding external URLs)
156 | assert len(resources) == 6 # 3 images (excluding external URL) + 3 includes
157 |
158 | # Check image resources
159 | image_resources = [path for path, type_ in resources if type_ == "image"]
160 | assert len(image_resources) == 3
161 | assert "path/to/image.png" in image_resources
162 | assert "outputs/my diagram.svg" in image_resources
163 | assert "outputs/chart.png" in image_resources
164 |
165 | # Check include resources
166 | include_resources = [path for path, type_ in resources if type_ == "include"]
167 | assert len(include_resources) == 3
168 | assert "outputs/equation.tex" in include_resources
169 | assert "outputs/table.html" in include_resources
170 | assert "outputs/data.csv" in include_resources
171 |
172 |
173 | def test_complex_quarto_includes():
174 | """Test handling of complex Quarto include directives."""
175 | qmd_content = """# Complex cases
176 |
177 | Standard include:
178 | {{< include /outputs/equation.tex >}}
179 |
180 | Include with options:
181 | {{< include /outputs/report.md echo=true >}}
182 |
183 | Include with multiple options:
184 | {{< include /outputs/data.R echo=true eval=false >}}
185 |
186 | Include with whitespace:
187 | {{< include /outputs/whitespace.txt >}}
188 | """
189 |
190 | resources = extract_markdown_resources(qmd_content)
191 |
192 | # Extract just the include paths
193 | include_paths = [path for path, type_ in resources if type_ == "include"]
194 |
195 | # Check that we found all includes
196 | assert len(include_paths) == 4
197 | assert "outputs/equation.tex" in include_paths
198 | assert "outputs/report.md" in include_paths # Should strip options
199 | assert "outputs/data.R" in include_paths
200 | assert "outputs/whitespace.txt" in include_paths
201 |
--------------------------------------------------------------------------------
/src/smartrappy/qmd_parser.py:
--------------------------------------------------------------------------------
1 | import ast
2 | import re
3 | from typing import List, Set, Tuple
4 |
5 | from smartrappy.models import DatabaseInfo, FileInfo, ModuleImport
6 |
7 |
8 | def extract_python_chunks(qmd_content: str) -> List[str]:
9 | """
10 | Extract Python code chunks from a Quarto markdown file.
11 |
12 | Args:
13 | qmd_content: The content of the QMD file as a string
14 |
15 | Returns:
16 | A list of Python code chunks found in the file
17 | """
18 | # Pattern to match Python code chunks in QMD files
19 | # Matches ```{python} ... ``` blocks, including those with parameters
20 | pattern = r"```\{python[^}]*\}(.*?)```"
21 |
22 | # Find all matches using re.DOTALL to match across multiple lines
23 | matches = re.findall(pattern, qmd_content, re.DOTALL)
24 |
25 | # Clean up the chunks (remove leading/trailing whitespace)
26 | cleaned_chunks = [chunk.strip() for chunk in matches]
27 |
28 | return cleaned_chunks
29 |
30 |
31 | def extract_markdown_resources(qmd_content: str) -> List[Tuple[str, str]]:
32 | """
33 | Extract markdown resource references from a Quarto markdown file.
34 |
35 | Extracts:
36 | 1. Image references: 
37 | 2. Include directives: {{< include /path/to/file.ext >}}
38 |
39 | Args:
40 | qmd_content: The content of the QMD file as a string
41 |
42 | Returns:
43 | A list of tuples containing (file_path, resource_type)
44 | """
45 | resources = []
46 |
47 | # Pattern to match markdown image syntax 
48 | image_pattern = r"!\[.*?\]\(([^)]+)\)"
49 | image_matches = re.findall(image_pattern, qmd_content)
50 |
51 | # Pattern to match Quarto include directives {{< include /path/to/file >}} or {{< include /path/to/file param=value >}}
52 | include_pattern = r"\{\{<\s*include\s+([^\s>]+)(?:\s+[^>]+?)?\s*>\}\}"
53 | include_matches = re.findall(include_pattern, qmd_content)
54 |
55 | # Process image paths
56 | for path in image_matches:
57 | # Remove query parameters if present
58 | clean_path = path.split("?")[0].strip()
59 | # Remove any fragment identifiers
60 | clean_path = clean_path.split("#")[0].strip()
61 | # Remove any surrounding quotation marks
62 | if (clean_path.startswith('"') and clean_path.endswith('"')) or (
63 | clean_path.startswith("'") and clean_path.endswith("'")
64 | ):
65 | clean_path = clean_path[1:-1]
66 |
67 | # Ignore external URLs
68 | if not clean_path.startswith(("http://", "https://", "ftp://")):
69 | # Remove leading slash if present
70 | if clean_path.startswith("/"):
71 | clean_path = clean_path[1:]
72 | resources.append((clean_path, "image"))
73 |
74 | # Process include directives
75 | for path in include_matches:
76 | clean_path = path.strip()
77 |
78 | # The regex might capture additional parameters after the path,
79 | # so ensure we just get the file path by splitting on whitespace
80 | # and taking the first part (the file path)
81 | clean_path = clean_path.split()[0] if " " in clean_path else clean_path
82 |
83 | # Remove any surrounding quotation marks
84 | if (clean_path.startswith('"') and clean_path.endswith('"')) or (
85 | clean_path.startswith("'") and clean_path.endswith("'")
86 | ):
87 | clean_path = clean_path[1:-1]
88 |
89 | # Remove leading slash if present
90 | if clean_path.startswith("/"):
91 | clean_path = clean_path[1:]
92 |
93 | resources.append((clean_path, "include"))
94 |
95 | return resources
96 |
97 |
98 | def analyse_qmd_file(
99 | file_path: str,
100 | project_modules: Set[str],
101 | FileOperationFinder,
102 | ModuleImportFinder,
103 | DatabaseOperationFinder,
104 | ) -> Tuple[List[FileInfo], List[ModuleImport], List[DatabaseInfo]]:
105 | """
106 | Analyse a Quarto markdown file for Python code chunks and external resources.
107 |
108 | Detects:
109 | - Python code chunks
110 | - Markdown image references ()
111 | - Quarto include directives ({{< include /path/to/file.ext >}})
112 |
113 | Args:
114 | file_path: Path to the QMD file
115 | project_modules: Set of known project module names
116 | FileOperationFinder: Class to find file operations
117 | ModuleImportFinder: Class to find module imports
118 | DatabaseOperationFinder: Class to find database operations
119 |
120 | Returns:
121 | A tuple of (file_operations, imports, database_operations)
122 | """
123 | try:
124 | # Read the QMD file content
125 | with open(file_path, "r", encoding="utf-8") as f:
126 | qmd_content = f.read()
127 |
128 | # Extract Python code chunks
129 | python_chunks = extract_python_chunks(qmd_content)
130 |
131 | # Extract markdown resources (images and includes)
132 | resources = extract_markdown_resources(qmd_content)
133 |
134 | # Create FileInfo objects for resource inputs (images, includes, etc.)
135 | resource_file_ops = []
136 | for resource_path, resource_type in resources:
137 | # All external resources are considered read operations in QMD files
138 | resource_file_ops.append(
139 | FileInfo(
140 | filename=resource_path,
141 | is_read=True,
142 | is_write=False,
143 | source_file=file_path,
144 | )
145 | )
146 |
147 | # Initialize result lists
148 | all_file_ops = resource_file_ops # Start with external resource operations
149 | all_imports = []
150 | all_db_ops = []
151 |
152 | # Process each Python chunk separately
153 | for i, chunk in enumerate(python_chunks):
154 | try:
155 | # Parse the chunk as Python code
156 | tree = ast.parse(chunk)
157 |
158 | # Find file operations
159 | file_finder = FileOperationFinder(file_path)
160 | file_finder.visit(tree)
161 | all_file_ops.extend(file_finder.file_operations)
162 |
163 | # Find imports
164 | import_finder = ModuleImportFinder(file_path, project_modules)
165 | import_finder.visit(tree)
166 | all_imports.extend(import_finder.imports)
167 |
168 | # Find database operations
169 | db_finder = DatabaseOperationFinder(file_path)
170 | db_finder.visit(tree)
171 | all_db_ops.extend(db_finder.database_operations)
172 |
173 | except SyntaxError as e:
174 | print(f"Syntax error in Python chunk {i + 1} of {file_path}: {str(e)}")
175 |
176 | return all_file_ops, all_imports, all_db_ops
177 |
178 | except (UnicodeDecodeError, IOError) as e:
179 | print(f"Error processing QMD file {file_path}: {str(e)}")
180 | return [], [], []
181 |
--------------------------------------------------------------------------------
/docs/index.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "---\n",
8 | "execute:\n",
9 | " echo: false\n",
10 | "---\n",
11 | "\n",
12 | "# smartrappy\n",
13 | "\n",
14 | "Smart reproducible analytical pipeline inspection.\n",
15 | "\n",
16 | "{width=40%}"
17 | ]
18 | },
19 | {
20 | "cell_type": "markdown",
21 | "metadata": {},
22 | "source": [
23 | "[](https://pypi.org/project/smartrappy/)\n",
24 | "[](https://pypi.org/project/smartrappy/)\n",
25 | "[](https://pypi.org/project/smartrappy)\n",
26 | "[](https://opensource.org/licenses/MIT)\n",
27 | "[](https://github.com/astral-sh/ruff)\n",
28 | "[](https://github.com/pre-commit/pre-commit)\n",
29 | "[](https://github.com/aeturrell/smartrappy/actions?workflow=Tests)\n",
30 | "[](https://codecov.io/gh/aeturrell/smartrappy)\n",
31 | "[](https://aeturrell.github.io/smartrappy/)\n",
32 | "[](https://pepy.tech/project/smartrappy)\n",
33 | "\n",
34 | "\n",
35 | "\n",
36 | "\n",
37 | "\n",
38 | "[](https://github.com/aeturrell/smartrappy)"
39 | ]
40 | },
41 | {
42 | "cell_type": "markdown",
43 | "metadata": {},
44 | "source": [
45 | "## Introduction\n",
46 | "\n",
47 | "### What does this package do?\n",
48 | "\n",
49 | "**smartrappy** analyses a Python project and infers the directed acyclic graph (DAG) of the code and data dependencies, including the last time any data were refreshed and whether the data exist at all on disk. It is not perfect, and will miss a lot in complex projects: but for simple projects using, say, `pd.read_csv()`, it does a good job of inferring the steps. It can also infer writing to and from most databases. The inferred DAG is then visualised, and there are several options for doing that—the default being to produce a visualisation in the terminal.\n",
50 | "\n",
51 | "### What is **smartrappy** for?\n",
52 | "\n",
53 | "**smartrappy** is designed to help you understand the dependencies in a project, especially in a context where there may be a lot of legacy code that resembles tangled spaghetti.\n",
54 | "\n",
55 | "### Quickstart\n",
56 | "\n",
57 | "To use **smartrappy** as a command-line tool:\n",
58 | "\n",
59 | "```bash\n",
60 | "smartrappy /path/to/your/project\n",
61 | "```\n",
62 | "\n",
63 | "Or to use it within a Python script:\n",
64 | "\n",
65 | "```python\n",
66 | "from smartrappy import analyse_project\n",
67 | "from smartrappy.reporters import ConsoleReporter\n",
68 | "\n",
69 | "\n",
70 | "model = analyse_project(\"/path/to/your/project\")\n",
71 | "reporter = ConsoleReporter()\n",
72 | "reporter.generate_report(model)\n",
73 | "```\n",
74 | "\n",
75 | "### Installation\n",
76 | "\n",
77 | "To install **smartrappy**, you can use `pip install smartrappy` or `uv add smartrappy` if you are using [Astral's uv](https://docs.astral.sh/uv/). You can also use it as a standalone command-line tool with uv and the `uvx` command:\n",
78 | "\n",
79 | "```bash\n",
80 | "uvx smartrappy path/to/your/project\n",
81 | "```\n",
82 | "\n",
83 | "### Documentation\n",
84 | "\n",
85 | "You can find the full documentation for **smartrappy** at [https://aeturrell.github.io/smartrappy/](https://aeturrell.github.io/smartrappy/)."
86 | ]
87 | },
88 | {
89 | "cell_type": "markdown",
90 | "metadata": {},
91 | "source": [
92 | "## Example of output\n",
93 | "\n",
94 | "```bash\n",
95 | "smartrappy .\n",
96 | "```"
97 | ]
98 | },
99 | {
100 | "cell_type": "code",
101 | "execution_count": null,
102 | "metadata": {},
103 | "outputs": [],
104 | "source": [
105 | "! uv run smartrappy ../tests/test_set_two"
106 | ]
107 | },
108 | {
109 | "cell_type": "markdown",
110 | "metadata": {},
111 | "source": [
112 | "The graphviz and mermaid options are equally as aesthetically pleasing! [Head to the docs](https://aeturrell.github.io/smartrappy/output_options.html) to see those."
113 | ]
114 | },
115 | {
116 | "cell_type": "markdown",
117 | "metadata": {},
118 | "source": [
119 | "## Options and configuration\n",
120 | "\n",
121 | "### Command-line\n",
122 | "\n",
123 | "```bash\n",
124 | "smartrappy [OPTIONS] path/to/your/project\n",
125 | "```\n",
126 | "\n",
127 | "### Arguments\n",
128 | "\n",
129 | "- `path/to/your/project`: Path to the Python project directory to analyse (required)\n",
130 | "\n",
131 | "### Options\n",
132 | "\n",
133 | "- `-o, --output PATH`: Output path for the analysis files (without extension)\n",
134 | "- `-f, --format [console|graphviz|mermaid|json]`: Output format (default: console)\n",
135 | "- `--all-formats`: Generate all output formats. Because why shouldn't you have it all?\n",
136 | "- `-h, --help`: Show help message\n",
137 | "- `--internal`: Show only internal code dependencies. (default: false)\n",
138 | "- `--version`: Show version information\n",
139 | "\n",
140 | "### Output formats\n",
141 | "\n",
142 | "- `console`: Output in terminal (default)\n",
143 | "- `graphviz`: Generate Graphviz visualisation, saved as a PDF\n",
144 | "- `mermaid`: Generate Mermaid diagram, which can be embedded in Markdown\n",
145 | "- `json`: Generate JSON representation, printed to the terminal if no output path is specified\n",
146 | "\n",
147 | "By default, outputs are stored in the directory from where the `smartrappy` command is run."
148 | ]
149 | },
150 | {
151 | "cell_type": "markdown",
152 | "metadata": {},
153 | "source": [
154 | "## Requirements\n",
155 | "\n",
156 | "You can find a full list of requirements in the [pyproject.toml](https://github.com/aeturrell/smartrappy/blob/main/pyproject.toml) file.\n",
157 | "\n",
158 | "This package also requires that you have [GraphViz](https://graphviz.org/) installed."
159 | ]
160 | },
161 | {
162 | "cell_type": "markdown",
163 | "metadata": {},
164 | "source": [
165 | "## License\n",
166 | "\n",
167 | "Distributed under the terms of the [MIT license](https://opensource.org/licenses/MIT), *smartrappy* is free and open source software.\n",
168 | "\n",
169 | "## Issues\n",
170 | "\n",
171 | "If you encounter any problems, please [file an issue](https://github.com/aeturrell/smartrappy/issues) along with a detailed description."
172 | ]
173 | }
174 | ],
175 | "metadata": {
176 | "kernelspec": {
177 | "display_name": ".venv",
178 | "language": "python",
179 | "name": "python3"
180 | },
181 | "language_info": {
182 | "codemirror_mode": {
183 | "name": "ipython",
184 | "version": 3
185 | },
186 | "file_extension": ".py",
187 | "mimetype": "text/x-python",
188 | "name": "python",
189 | "nbconvert_exporter": "python",
190 | "pygments_lexer": "ipython3",
191 | "version": "3.12.0"
192 | }
193 | },
194 | "nbformat": 4,
195 | "nbformat_minor": 4
196 | }
197 |
--------------------------------------------------------------------------------
/tests/test_qmd_integration.py:
--------------------------------------------------------------------------------
1 | import os
2 | import tempfile
3 | from pathlib import Path
4 |
5 | from smartrappy import analyse_project
6 | from smartrappy.models import NodeType
7 | from smartrappy.reporters import ConsoleReporter
8 |
9 |
10 | def test_qmd_integration():
11 | """Test that QMD files are properly analyzed in a project."""
12 | with tempfile.TemporaryDirectory() as tmpdir:
13 | # Create a simple project structure with Python and QMD files
14 | tmpdir_path = Path(tmpdir)
15 |
16 | # Create a Python file
17 | py_file = tmpdir_path / "process.py"
18 | py_file.write_text("""
19 | import pandas as pd
20 |
21 | df = pd.read_csv("input.csv")
22 | df.to_excel("output.xlsx")
23 | """)
24 |
25 | # Create a QMD file
26 | qmd_file = tmpdir_path / "analysis.qmd"
27 | qmd_file.write_text("""# Analysis Document
28 |
29 | This is a Quarto document with Python code chunks.
30 |
31 | ```{python}
32 | import pandas as pd
33 | import matplotlib.pyplot as plt
34 |
35 | df = pd.read_excel("output.xlsx")
36 | plt.plot(df["x"], df["y"])
37 | plt.savefig("plot.png")
38 | ```
39 |
40 | ```{python}
41 | # Another code chunk
42 | import sqlite3
43 |
44 | conn = sqlite3.connect("data.db")
45 | df_db = pd.read_sql("SELECT * FROM mytable", conn)
46 | df_db.to_csv("db_export.csv")
47 | ```
48 | """)
49 |
50 | # Create a dummy data file to make it exist on disk
51 | (tmpdir_path / "input.csv").touch()
52 |
53 | # Analyze the project
54 | model = analyse_project(str(tmpdir_path))
55 |
56 | # Check that nodes were created for both files
57 | py_script_found = False
58 | qmd_doc_found = False
59 |
60 | for node_id, node in model.nodes.items():
61 | if node.name == "process.py" and node.type == NodeType.SCRIPT:
62 | py_script_found = True
63 | elif node.name == "analysis.qmd" and node.type == NodeType.QUARTO_DOCUMENT:
64 | qmd_doc_found = True
65 |
66 | assert py_script_found, "Python script node not found in the model"
67 | assert qmd_doc_found, "Quarto document node not found in the model"
68 |
69 | # Check that file operations were detected in the QMD file
70 | qmd_file_ops = []
71 | for filename, ops in model.file_operations.items():
72 | for op in ops:
73 | if os.path.basename(op.source_file) == "analysis.qmd":
74 | qmd_file_ops.append((filename, op.is_read, op.is_write))
75 |
76 | # Verify expected file operations in the QMD file
77 | assert ("output.xlsx", True, False) in qmd_file_ops # Read operation
78 | assert ("plot.png", False, True) in qmd_file_ops # Write operation
79 | assert ("db_export.csv", False, True) in qmd_file_ops # Write operation
80 |
81 | # Check that database operations were detected
82 | db_ops_found = False
83 | for db_name, ops in model.database_operations.items():
84 | for op in ops:
85 | if os.path.basename(op.source_file) == "analysis.qmd":
86 | db_ops_found = True
87 | break
88 |
89 | assert db_ops_found, "Database operations not found for QMD file"
90 |
91 | # Test that the console reporter can handle QMD files without errors
92 | reporter = ConsoleReporter()
93 | reporter.generate_report(model) # This should not raise exceptions
94 |
95 |
96 | def test_empty_qmd():
97 | """Test that QMD files without Python chunks are handled correctly."""
98 | with tempfile.TemporaryDirectory() as tmpdir:
99 | tmpdir_path = Path(tmpdir)
100 |
101 | # Create a QMD file without Python chunks
102 | qmd_file = tmpdir_path / "empty.qmd"
103 | qmd_file.write_text("""# Empty Document
104 |
105 | This Quarto document has no Python code chunks.
106 |
107 | ```{r}
108 | # R code that should be ignored
109 | print("Hello from R")
110 | ```
111 | """)
112 |
113 | # Analyze the project
114 | model = analyse_project(str(tmpdir_path))
115 |
116 | # Since there are no Python chunks, the QMD file should not appear in the model
117 | qmd_found = False
118 | for _, node in model.nodes.items():
119 | if node.name == "empty.qmd" and node.type == NodeType.QUARTO_DOCUMENT:
120 | qmd_found = True
121 | break
122 |
123 | assert not qmd_found, "Empty QMD file should not create nodes"
124 |
125 |
126 | def test_qmd_integration_with_all_resources():
127 | """Test that QMD files with images and include directives are properly analyzed."""
128 | with tempfile.TemporaryDirectory() as tmpdir:
129 | # Create a simple project structure with a Quarto document containing various resources
130 | tmpdir_path = Path(tmpdir)
131 |
132 | # Create a QMD file with markdown images and include directives
133 | qmd_file = tmpdir_path / "report.qmd"
134 | qmd_file.write_text("""# Comprehensive Quarto Document
135 |
136 | This document includes various types of resources and Python code.
137 |
138 | ## Images
139 | 
140 |
141 | ## LaTeX Equation
142 | {{< include /outputs/equation.tex >}}
143 |
144 | ## Python Analysis
145 | ```{python}
146 | import pandas as pd
147 | df = pd.read_csv("data.csv")
148 | df.to_excel("processed_data.xlsx")
149 | ```
150 |
151 | ## Results Visualization
152 | 
153 |
154 | ## Data Table
155 | {{< include /outputs/table.html >}}
156 |
157 | ```{python}
158 | import matplotlib.pyplot as plt
159 | plt.figure()
160 | plt.plot(df["x"], df["y"])
161 | plt.savefig("output_plot.png")
162 | ```
163 |
164 | ## Appendix
165 | {{< include /outputs/appendix.md >}}
166 | """)
167 |
168 | # Create dummy files to make them exist on disk
169 | outputs_dir = tmpdir_path / "outputs"
170 | outputs_dir.mkdir()
171 | (outputs_dir / "figure1.png").touch()
172 | (outputs_dir / "results.svg").touch()
173 | (outputs_dir / "equation.tex").touch()
174 | (outputs_dir / "table.html").touch()
175 | (outputs_dir / "appendix.md").touch()
176 |
177 | # Create input data file
178 | (tmpdir_path / "data.csv").touch()
179 |
180 | # Analyze the project
181 | model = analyse_project(str(tmpdir_path))
182 |
183 | # Check that the QMD document was properly processed
184 | qmd_doc_found = False
185 | qmd_node_id = None
186 | for node_id, node in model.nodes.items():
187 | if node.name == "report.qmd" and node.type == NodeType.QUARTO_DOCUMENT:
188 | qmd_doc_found = True
189 | qmd_node_id = node_id
190 | break
191 |
192 | assert qmd_doc_found, "Quarto document node not found in the model"
193 |
194 | # Collect all file operations from the QMD document
195 | qmd_file_ops = []
196 | for filename, ops in model.file_operations.items():
197 | for op in ops:
198 | if os.path.basename(op.source_file) == "report.qmd":
199 | qmd_file_ops.append((filename, op.is_read, op.is_write))
200 |
201 | # Verify all resource types were detected
202 | # Python code operations
203 | assert ("data.csv", True, False) in qmd_file_ops # Read operation
204 | assert ("processed_data.xlsx", False, True) in qmd_file_ops # Write operation
205 | assert ("output_plot.png", False, True) in qmd_file_ops # Write operation
206 |
207 | # Image references (read operations)
208 | assert ("outputs/figure1.png", True, False) in qmd_file_ops
209 | assert ("outputs/results.svg", True, False) in qmd_file_ops
210 |
211 | # Include directives (read operations)
212 | assert ("outputs/equation.tex", True, False) in qmd_file_ops
213 | assert ("outputs/table.html", True, False) in qmd_file_ops
214 | assert ("outputs/appendix.md", True, False) in qmd_file_ops
215 |
216 | # Verify edges in the graph
217 | image_nodes_with_edges = 0
218 | include_nodes_with_edges = 0
219 |
220 | for edge in model.edges:
221 | if edge.target == qmd_node_id and edge.type == "read":
222 | source_node = model.nodes[edge.source]
223 | source_name = source_node.name
224 |
225 | # Count image and include nodes with edges to the QMD document
226 | if source_name in ["outputs/figure1.png", "outputs/results.svg"]:
227 | image_nodes_with_edges += 1
228 | elif source_name in [
229 | "outputs/equation.tex",
230 | "outputs/table.html",
231 | "outputs/appendix.md",
232 | ]:
233 | include_nodes_with_edges += 1
234 |
235 | # Verify we have the right number of edges for each resource type
236 | assert image_nodes_with_edges == 2, (
237 | "Not all image nodes have edges to the QMD document"
238 | )
239 | assert include_nodes_with_edges == 3, (
240 | "Not all include nodes have edges to the QMD document"
241 | )
242 |
243 | # Test that the console reporter works with these resources
244 | reporter = ConsoleReporter()
245 | reporter.generate_report(model) # This should not raise exceptions
246 |
--------------------------------------------------------------------------------
/tests/test_jupyter_integration.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 | import tempfile
4 | from pathlib import Path
5 |
6 | from smartrappy import analyse_project
7 | from smartrappy.models import NodeType
8 | from smartrappy.reporters import ConsoleReporter
9 |
10 |
11 | def test_jupyter_integration():
12 | """Test that Jupyter notebook files are properly analyzed in a project."""
13 | with tempfile.TemporaryDirectory() as tmpdir:
14 | # Create a simple project structure with Python and notebook files
15 | tmpdir_path = Path(tmpdir)
16 |
17 | # Create a Python file
18 | py_file = tmpdir_path / "process.py"
19 | py_file.write_text("""
20 | import pandas as pd
21 |
22 | df = pd.read_csv("input.csv")
23 | df.to_excel("output.xlsx")
24 | """)
25 |
26 | # Create a Jupyter notebook file
27 | notebook_file = tmpdir_path / "analysis.ipynb"
28 | notebook_content = {
29 | "cells": [
30 | {
31 | "cell_type": "markdown",
32 | "metadata": {},
33 | "source": [
34 | "# Analysis Notebook\n",
35 | "\n",
36 | "This is a Jupyter notebook with Python code cells.",
37 | ],
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": 1,
42 | "metadata": {},
43 | "outputs": [],
44 | "source": [
45 | "import pandas as pd\n",
46 | "import matplotlib.pyplot as plt\n",
47 | "\n",
48 | "df = pd.read_excel('output.xlsx')\n",
49 | "plt.plot(df['x'], df['y'])\n",
50 | "plt.savefig('plot.png')",
51 | ],
52 | },
53 | {
54 | "cell_type": "code",
55 | "execution_count": 2,
56 | "metadata": {},
57 | "outputs": [],
58 | "source": [
59 | "# Another code cell\n",
60 | "import sqlite3\n",
61 | "\n",
62 | "conn = sqlite3.connect('data.db')\n",
63 | "df_db = pd.read_sql('SELECT * FROM mytable', conn)\n",
64 | "df_db.to_csv('db_export.csv')",
65 | ],
66 | },
67 | ],
68 | "metadata": {
69 | "kernelspec": {
70 | "display_name": "Python 3",
71 | "language": "python",
72 | "name": "python3",
73 | },
74 | "language_info": {
75 | "name": "python",
76 | "version": "3.11.0",
77 | },
78 | },
79 | "nbformat": 4,
80 | "nbformat_minor": 5,
81 | }
82 | notebook_file.write_text(json.dumps(notebook_content, indent=2))
83 |
84 | # Create a dummy data file to make it exist on disk
85 | (tmpdir_path / "input.csv").touch()
86 |
87 | # Analyze the project
88 | model = analyse_project(str(tmpdir_path))
89 |
90 | # Check that nodes were created for both files
91 | py_script_found = False
92 | notebook_found = False
93 |
94 | for node_id, node in model.nodes.items():
95 | if node.name == "process.py" and node.type == NodeType.SCRIPT:
96 | py_script_found = True
97 | elif (
98 | node.name == "analysis.ipynb" and node.type == NodeType.JUPYTER_NOTEBOOK
99 | ):
100 | notebook_found = True
101 |
102 | assert py_script_found, "Python script node not found in the model"
103 | assert notebook_found, "Jupyter notebook node not found in the model"
104 |
105 | # Check that file operations were detected in the notebook file
106 | notebook_file_ops = []
107 | for filename, ops in model.file_operations.items():
108 | for op in ops:
109 | if os.path.basename(op.source_file) == "analysis.ipynb":
110 | notebook_file_ops.append((filename, op.is_read, op.is_write))
111 |
112 | # Verify expected file operations in the notebook file
113 | assert ("output.xlsx", True, False) in notebook_file_ops # Read operation
114 | assert ("plot.png", False, True) in notebook_file_ops # Write operation
115 | assert ("db_export.csv", False, True) in notebook_file_ops # Write operation
116 |
117 | # Check that database operations were detected
118 | db_ops_found = False
119 | for db_name, ops in model.database_operations.items():
120 | for op in ops:
121 | if os.path.basename(op.source_file) == "analysis.ipynb":
122 | db_ops_found = True
123 | break
124 |
125 | assert db_ops_found, "Database operations not found for notebook file"
126 |
127 | # Test that the console reporter can handle notebook files without errors
128 | reporter = ConsoleReporter()
129 | reporter.generate_report(model) # This should not raise exceptions
130 |
131 |
132 | def test_empty_jupyter_notebook():
133 | """Test that Jupyter notebooks without code cells are handled correctly."""
134 | with tempfile.TemporaryDirectory() as tmpdir:
135 | tmpdir_path = Path(tmpdir)
136 |
137 | # Create a notebook file without code cells
138 | notebook_file = tmpdir_path / "empty.ipynb"
139 | notebook_content = {
140 | "cells": [
141 | {
142 | "cell_type": "markdown",
143 | "metadata": {},
144 | "source": [
145 | "# Empty Notebook\n",
146 | "\n",
147 | "This notebook has no code cells.",
148 | ],
149 | },
150 | ],
151 | "metadata": {
152 | "kernelspec": {
153 | "display_name": "Python 3",
154 | "language": "python",
155 | "name": "python3",
156 | },
157 | },
158 | "nbformat": 4,
159 | "nbformat_minor": 5,
160 | }
161 | notebook_file.write_text(json.dumps(notebook_content, indent=2))
162 |
163 | # Analyze the project
164 | model = analyse_project(str(tmpdir_path))
165 |
166 | # Since there are no code cells, the notebook file should not appear in the model
167 | notebook_found = False
168 | for _, node in model.nodes.items():
169 | if node.name == "empty.ipynb" and node.type == NodeType.JUPYTER_NOTEBOOK:
170 | notebook_found = True
171 | break
172 |
173 | assert not notebook_found, "Empty notebook file should not create nodes"
174 |
175 |
176 | def test_jupyter_integration_with_complex_operations():
177 | """Test that Jupyter notebooks with complex operations are properly analyzed."""
178 | with tempfile.TemporaryDirectory() as tmpdir:
179 | # Create a simple project structure with a notebook containing various operations
180 | tmpdir_path = Path(tmpdir)
181 |
182 | # Create a Jupyter notebook file with various operations
183 | notebook_file = tmpdir_path / "analysis.ipynb"
184 | notebook_content = {
185 | "cells": [
186 | {
187 | "cell_type": "markdown",
188 | "metadata": {},
189 | "source": [
190 | "# Comprehensive Jupyter Notebook\n",
191 | "\n",
192 | "This notebook includes various types of operations.",
193 | ],
194 | },
195 | {
196 | "cell_type": "code",
197 | "execution_count": 1,
198 | "metadata": {},
199 | "outputs": [],
200 | "source": [
201 | "import pandas as pd\n",
202 | "df = pd.read_csv('data.csv')\n",
203 | "df.to_excel('processed_data.xlsx')",
204 | ],
205 | },
206 | {
207 | "cell_type": "code",
208 | "execution_count": 2,
209 | "metadata": {},
210 | "outputs": [],
211 | "source": [
212 | "import matplotlib.pyplot as plt\n",
213 | "plt.figure()\n",
214 | "plt.plot(df['x'], df['y'])\n",
215 | "plt.savefig('output_plot.png')",
216 | ],
217 | },
218 | {
219 | "cell_type": "code",
220 | "execution_count": 3,
221 | "metadata": {},
222 | "outputs": [],
223 | "source": [
224 | "# Multiple file operations in one cell\n",
225 | "df2 = pd.read_csv('data2.csv')\n",
226 | "df3 = pd.read_parquet('data3.parquet')\n",
227 | "df_combined = pd.concat([df, df2, df3])\n",
228 | "df_combined.to_json('combined.json')",
229 | ],
230 | },
231 | ],
232 | "metadata": {
233 | "kernelspec": {
234 | "display_name": "Python 3",
235 | "language": "python",
236 | "name": "python3",
237 | },
238 | },
239 | "nbformat": 4,
240 | "nbformat_minor": 5,
241 | }
242 | notebook_file.write_text(json.dumps(notebook_content, indent=2))
243 |
244 | # Create input data files
245 | (tmpdir_path / "data.csv").touch()
246 | (tmpdir_path / "data2.csv").touch()
247 | (tmpdir_path / "data3.parquet").touch()
248 |
249 | # Analyze the project
250 | model = analyse_project(str(tmpdir_path))
251 |
252 | # Check that the notebook was properly processed
253 | notebook_found = False
254 | notebook_node_id = None
255 | for node_id, node in model.nodes.items():
256 | if node.name == "analysis.ipynb" and node.type == NodeType.JUPYTER_NOTEBOOK:
257 | notebook_found = True
258 | notebook_node_id = node_id
259 | break
260 |
261 | assert notebook_found, "Jupyter notebook node not found in the model"
262 |
263 | # Collect all file operations from the notebook
264 | notebook_file_ops = []
265 | for filename, ops in model.file_operations.items():
266 | for op in ops:
267 | if os.path.basename(op.source_file) == "analysis.ipynb":
268 | notebook_file_ops.append((filename, op.is_read, op.is_write))
269 |
270 | # Verify all operations were detected
271 | # Read operations
272 | assert ("data.csv", True, False) in notebook_file_ops
273 | assert ("data2.csv", True, False) in notebook_file_ops
274 | assert ("data3.parquet", True, False) in notebook_file_ops
275 |
276 | # Write operations
277 | assert ("processed_data.xlsx", False, True) in notebook_file_ops
278 | assert ("output_plot.png", False, True) in notebook_file_ops
279 | assert ("combined.json", False, True) in notebook_file_ops
280 |
281 | # Verify edges in the graph
282 | read_edges = 0
283 | write_edges = 0
284 |
285 | for edge in model.edges:
286 | if edge.target == notebook_node_id and edge.type == "read":
287 | read_edges += 1
288 | elif edge.source == notebook_node_id and edge.type == "write":
289 | write_edges += 1
290 |
291 | # We should have read edges for the input files
292 | assert read_edges >= 3, "Not all read operations created edges"
293 | # We should have write edges for the output files
294 | assert write_edges >= 3, "Not all write operations created edges"
295 |
296 | # Test that the console reporter works with these operations
297 | reporter = ConsoleReporter()
298 | reporter.generate_report(model) # This should not raise exceptions
299 |
--------------------------------------------------------------------------------
/src/smartrappy/models.py:
--------------------------------------------------------------------------------
1 | """Data models for smartrappy."""
2 |
3 | import os
4 | from datetime import datetime
5 | from pathlib import Path
6 | from typing import Dict, List, NamedTuple, Optional
7 |
8 |
9 | class FileInfo(NamedTuple):
10 | """Information about a file operation found in Python code."""
11 |
12 | filename: str
13 | is_read: bool
14 | is_write: bool
15 | source_file: str
16 |
17 |
18 | class FileStatus(NamedTuple):
19 | """Information about a file's status on disk."""
20 |
21 | exists: bool
22 | last_modified: Optional[datetime] = None
23 |
24 |
25 | class ModuleImport(NamedTuple):
26 | """Information about a module import found in Python code."""
27 |
28 | module_name: str
29 | source_file: str
30 | is_from_import: bool
31 | imported_names: List[str]
32 | is_internal: bool
33 |
34 |
35 | class DatabaseInfo(NamedTuple):
36 | """Information about a database operation found in Python code."""
37 |
38 | db_name: str # Name or identifier of the database
39 | connection_string: Optional[str] # Connection string (if available)
40 | db_type: str # Type of database (e.g., "postgresql", "mysql", "sqlite")
41 | is_read: bool # Whether data is read from the database
42 | is_write: bool # Whether data is written to the database
43 | source_file: str # File containing the database operation
44 |     conn_var_name: Optional[str] = None  # Variable the connection is assigned to, if any
45 |     uses_conn_var: Optional[str] = None  # Name of the connection variable this operation uses, if any
46 |
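# Illustrative example of the two fields above (names are hypothetical): for
#     conn = sqlite3.connect("app.db")
# the analyser records a DatabaseInfo with conn_var_name="conn"; a later
#     pd.read_sql("SELECT * FROM t", conn)
# yields one with uses_conn_var="conn", which ProjectModel.add_database_operation
# resolves so that both operations attach to "app.db".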
47 |
48 | class NodeType:
49 | """Enumeration of node types in the project graph."""
50 |
51 | SCRIPT = "script"
52 | DATA_FILE = "data_file"
53 | EXTERNAL_MODULE = "external_module"
54 | INTERNAL_MODULE = "internal_module"
55 | DATABASE = "database"
56 | QUARTO_DOCUMENT = "quarto_document"
57 | JUPYTER_NOTEBOOK = "jupyter_notebook"
58 |
59 |
60 | class Node(NamedTuple):
61 | """A node in the project dependency graph."""
62 |
63 | id: str
64 | name: str
65 | type: str
66 | metadata: dict
67 |
68 |
69 | class Edge(NamedTuple):
70 | """An edge in the project dependency graph."""
71 |
72 | source: str
73 | target: str
74 | type: str
75 |
76 |
77 | class ProjectModel:
78 | """A complete model of the project's structure and dependencies."""
79 |
80 | def __init__(self, base_path: str, internal_only: bool = False):
81 | self.base_path = Path(base_path)
82 | self.internal_only = internal_only
83 | self.nodes: Dict[str, Node] = {}
84 | self.edges: List[Edge] = []
85 | self.file_operations: Dict[str, List[FileInfo]] = {}
86 | self.imports: Dict[str, List[ModuleImport]] = {}
87 | self.file_statuses: Dict[str, FileStatus] = {}
88 | self.database_operations: Dict[str, List[DatabaseInfo]] = {}
89 |
90 | def get_node_id(self, name: str, node_type: str) -> str:
91 | """Generate a consistent node ID based on name and type."""
92 | return f"{node_type}_{hash(name) & 0xFFFFFF}"
93 |
94 | def add_node(
95 | self, name: str, node_type: str, metadata: Optional[dict] = None
96 | ) -> str:
97 | """Add a node to the model and return its ID."""
98 | metadata = metadata or {}
99 | node_id = self.get_node_id(name, node_type)
100 |
101 | if node_id not in self.nodes:
102 | self.nodes[node_id] = Node(
103 | id=node_id, name=name, type=node_type, metadata=metadata
104 | )
105 |
106 | return node_id
107 |
108 | def add_edge(
109 | self, source_id: str, target_id: str, edge_type: str = "dependency"
110 | ) -> None:
111 | """Add an edge between two nodes."""
112 | # Prevent duplicate edges
113 | for edge in self.edges:
114 | if (
115 | edge.source == source_id
116 | and edge.target == target_id
117 | and edge.type == edge_type
118 | ):
119 | return
120 |
121 | self.edges.append(Edge(source=source_id, target=target_id, type=edge_type))
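        # Design note: the duplicate scan above is O(E) per insert, O(E^2)
        # overall. If that ever matters, keeping a set of (source, target,
        # type) keys beside self.edges would make the check O(1); a sketch
        # under that assumption, not the shipped code.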
122 |
123 | def add_file_operation(self, operation: FileInfo) -> None:
124 | """Add a file operation to the model."""
125 | if operation.filename not in self.file_operations:
126 | self.file_operations[operation.filename] = []
127 |
128 | # Prevent duplicate operations
129 | for op in self.file_operations[operation.filename]:
130 | if (
131 | op.source_file == operation.source_file
132 | and op.is_read == operation.is_read
133 | and op.is_write == operation.is_write
134 | ):
135 | return
136 |
137 | self.file_operations[operation.filename].append(operation)
138 |
139 | # Update file status if not already stored
140 | if operation.filename not in self.file_statuses:
141 | filepath = self.base_path / operation.filename
142 | self.file_statuses[operation.filename] = get_file_status(filepath)
143 |
144 | def add_import(self, import_info: ModuleImport) -> None:
145 | """Add a module import to the model."""
146 | if import_info.source_file not in self.imports:
147 | self.imports[import_info.source_file] = []
148 |
149 | # Prevent duplicate imports
150 | for imp in self.imports[import_info.source_file]:
151 | if (
152 | imp.module_name == import_info.module_name
153 | and imp.is_from_import == import_info.is_from_import
154 | ):
155 | return
156 |
157 | self.imports[import_info.source_file].append(import_info)
158 |
159 | def add_database_operation(self, operation: DatabaseInfo) -> None:
160 | """Add a database operation to the model, handling connection variables."""
161 | db_name_to_use = operation.db_name
162 |
163 | # If this operation uses a connection variable, prioritize its database name
164 |         if operation.uses_conn_var:  # always a field on DatabaseInfo, so no hasattr needed
165 | conn_var = operation.uses_conn_var
166 | # Look through existing operations to find the referenced connection
167 | for existing_ops in self.database_operations.values():
168 | for op in existing_ops:
169 |                 if op.conn_var_name == conn_var:
170 | db_name_to_use = op.db_name
171 | break
172 |
173 | # Now add the operation under the appropriate database name
174 | if db_name_to_use not in self.database_operations:
175 | self.database_operations[db_name_to_use] = []
176 |
177 | # Prevent duplicate operations
178 | for op in self.database_operations[db_name_to_use]:
179 | if (
180 | op.source_file == operation.source_file
181 | and op.is_read == operation.is_read
182 | and op.is_write == operation.is_write
183 | and op.db_type == operation.db_type
184 | ):
185 | return
186 |
187 | self.database_operations[db_name_to_use].append(operation)
188 |
189 | def build_graph(self) -> None:
190 | """Build the graph representation from file operations, database operations, and imports."""
191 | # Process file operations
192 | for filename, operations in self.file_operations.items():
193 | file_node_id = self.add_node(
194 | filename,
195 | NodeType.DATA_FILE,
196 | {"status": self.file_statuses.get(filename, FileStatus(exists=False))},
197 | )
198 |
199 | for op in operations:
200 | script_name = os.path.basename(op.source_file)
201 |
202 | # Determine node type based on file extension
203 | if script_name.endswith(".qmd"):
204 | node_type = NodeType.QUARTO_DOCUMENT
205 | elif script_name.endswith(".ipynb"):
206 | node_type = NodeType.JUPYTER_NOTEBOOK
207 | else:
208 | node_type = NodeType.SCRIPT
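                # This extension-to-node-type chain recurs twice more below; a
                # small helper would keep the copies in sync (a refactoring
                # sketch, not part of the module):
                #     def _node_type_for(name: str) -> str:
                #         if name.endswith(".qmd"):
                #             return NodeType.QUARTO_DOCUMENT
                #         if name.endswith(".ipynb"):
                #             return NodeType.JUPYTER_NOTEBOOK
                #         return NodeType.SCRIPT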
209 | script_node_id = self.add_node(script_name, node_type)
210 |
211 | if op.is_read:
212 | self.add_edge(file_node_id, script_node_id, "read")
213 | if op.is_write:
214 | self.add_edge(script_node_id, file_node_id, "write")
215 |
216 | # Process database operations
217 | for db_name, operations in self.database_operations.items():
218 | db_node_id = self.add_node(
219 | db_name,
220 | NodeType.DATABASE,
221 | {"db_type": operations[0].db_type}, # Use type from first operation
222 | )
223 |
224 | for op in operations:
225 | script_name = os.path.basename(op.source_file)
226 | # Determine node type based on file extension
227 | if script_name.endswith(".qmd"):
228 | node_type = NodeType.QUARTO_DOCUMENT
229 | elif script_name.endswith(".ipynb"):
230 | node_type = NodeType.JUPYTER_NOTEBOOK
231 | else:
232 | node_type = NodeType.SCRIPT
233 | script_node_id = self.add_node(script_name, node_type)
234 |
235 | if op.is_read:
236 | self.add_edge(db_node_id, script_node_id, "read")
237 | if op.is_write:
238 | self.add_edge(script_node_id, db_node_id, "write")
239 |
240 | # Process imports - create more detailed nodes
241 | for source_file, imports in self.imports.items():
242 | script_name = os.path.basename(source_file)
243 | # Determine node type based on file extension
244 | if script_name.endswith(".qmd"):
245 | node_type = NodeType.QUARTO_DOCUMENT
246 | elif script_name.endswith(".ipynb"):
247 | node_type = NodeType.JUPYTER_NOTEBOOK
248 | else:
249 | node_type = NodeType.SCRIPT
250 | script_node_id = self.add_node(script_name, node_type)
251 |
252 | for imp in imports:
253 | # Skip external modules if internal_only is True
254 | if self.internal_only and not imp.is_internal:
255 | continue
256 |
257 | # Get base module name without path
258 | base_module_name = os.path.basename(imp.module_name.replace(".", "/"))
259 | module_display_name = base_module_name
260 |
261 | # Create separate nodes for each imported item if it's a from-import
262 | if imp.is_from_import and imp.imported_names:
263 | for imported_name in imp.imported_names:
264 | # Create detailed import name with module:function format
265 | detailed_name = f"{module_display_name}:{imported_name}"
266 | node_type = (
267 | NodeType.INTERNAL_MODULE
268 | if imp.is_internal
269 | else NodeType.EXTERNAL_MODULE
270 | )
271 |
272 | import_node_id = self.add_node(
273 | detailed_name,
274 | node_type,
275 | {
276 | "module": module_display_name,
277 | "imported_name": imported_name,
278 | "is_from_import": True,
279 | },
280 | )
281 | self.add_edge(import_node_id, script_node_id, "import")
282 | else:
283 | # For regular imports, just use the module name
284 | node_type = (
285 | NodeType.INTERNAL_MODULE
286 | if imp.is_internal
287 | else NodeType.EXTERNAL_MODULE
288 | )
289 | import_node_id = self.add_node(module_display_name, node_type)
290 | self.add_edge(import_node_id, script_node_id, "import")
291 |
292 |
293 | def get_file_status(filepath: Path) -> FileStatus:
294 | """Get file existence and modification time information."""
295 | if filepath.exists():
296 | mtime = datetime.fromtimestamp(filepath.stat().st_mtime)
297 | return FileStatus(exists=True, last_modified=mtime)
298 | return FileStatus(exists=False)
299 |
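
# Minimal end-to-end sketch of this module (illustrative; in practice
# smartrappy's analyse_project drives it from parsed source):
#     model = ProjectModel("/path/to/project")
#     model.add_file_operation(FileInfo("data.csv", True, False, "clean.py"))
#     model.build_graph()
#     print(len(model.nodes), len(model.edges))  # 2 nodes, 1 "read" edge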
--------------------------------------------------------------------------------
/tests/test_analyser.py:
--------------------------------------------------------------------------------
1 | """Comprehensive tests for analyser.py functions."""
2 |
3 | import ast
4 |
5 | from smartrappy.analyser import (
6 | DatabaseOperationFinder,
7 | FileOperationFinder,
8 | ModuleImportFinder,
9 | extract_string_from_node,
10 | get_direct_db_driver_info,
11 | get_matplotlib_file_info,
12 | get_mode_properties,
13 | get_open_file_info,
14 | get_pandas_file_info,
15 | get_pandas_sql_info,
16 | get_sqlalchemy_info,
17 | )
18 |
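# These tests drive pure AST visitors: each parses a snippet with ast.parse
# and inspects the returned FileInfo/DatabaseInfo records, so no filesystem
# or database fixtures are needed.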
19 |
20 | class TestGetModeProperties:
21 | """Test file mode parsing."""
22 |
23 | def test_default_read_mode(self):
24 | """Test default mode is read-only."""
25 | is_read, is_write = get_mode_properties(None)
26 | assert is_read is True
27 | assert is_write is False
28 |
29 | def test_read_mode(self):
30 | """Test 'r' mode is read-only."""
31 | is_read, is_write = get_mode_properties("r")
32 | assert is_read is True
33 | assert is_write is False
34 |
35 | def test_write_mode(self):
36 | """Test 'w' mode is write-only."""
37 | is_read, is_write = get_mode_properties("w")
38 | assert is_read is False
39 | assert is_write is True
40 |
41 | def test_append_mode(self):
42 | """Test 'a' mode is write-only."""
43 | is_read, is_write = get_mode_properties("a")
44 | assert is_read is False
45 | assert is_write is True
46 |
47 | def test_exclusive_creation_mode(self):
48 | """Test 'x' mode is write-only."""
49 | is_read, is_write = get_mode_properties("x")
50 | assert is_read is False
51 | assert is_write is True
52 |
53 | def test_read_write_mode(self):
54 | """Test 'r+' mode allows both read and write."""
55 | is_read, is_write = get_mode_properties("r+")
56 | assert is_read is True
57 | assert is_write is True
58 |
59 | def test_write_read_mode(self):
60 | """Test 'w+' mode allows both read and write."""
61 | is_read, is_write = get_mode_properties("w+")
62 | assert is_read is True
63 | assert is_write is True
64 |
65 | def test_append_read_mode(self):
66 | """Test 'a+' mode allows both read and write."""
67 | is_read, is_write = get_mode_properties("a+")
68 | assert is_read is True
69 | assert is_write is True
70 |
71 |
72 | class TestExtractStringFromNode:
73 | """Test string extraction from AST nodes."""
74 |
75 | def test_path_call_with_name(self):
76 | """Test extraction from Path() call."""
77 | code = 'Path("test.txt")'
78 | tree = ast.parse(code)
79 | node = tree.body[0].value
80 | result = extract_string_from_node(node)
81 | assert result == "test.txt"
82 |
83 | def test_path_call_with_attribute(self):
84 | """Test extraction from pathlib.Path() call."""
85 | code = 'pathlib.Path("test.txt")'
86 | tree = ast.parse(code)
87 | node = tree.body[0].value
88 | result = extract_string_from_node(node)
89 | assert result == "test.txt"
90 |
91 | def test_non_path_call(self):
92 | """Test that non-Path calls return None."""
93 | code = 'other_func("test.txt")'
94 | tree = ast.parse(code)
95 | node = tree.body[0].value
96 | result = extract_string_from_node(node)
97 | assert result is None
98 |
99 |
100 | class TestGetOpenFileInfo:
101 | """Test extraction of file info from open() calls."""
102 |
103 | def test_open_without_args(self):
104 | """Test open() without arguments returns None."""
105 | code = "open()"
106 | tree = ast.parse(code)
107 | node = tree.body[0].value
108 | result = get_open_file_info(node, "test.py")
109 | assert result is None
110 |
111 | def test_open_with_path_object(self):
112 | """Test open() with Path object."""
113 | code = 'open(Path("test.txt"))'
114 | tree = ast.parse(code)
115 | node = tree.body[0].value
116 | result = get_open_file_info(node, "test.py")
117 | assert result is not None
118 | assert result.filename == "test.txt"
119 | assert result.is_read is True
120 | assert result.is_write is False
121 |
122 | def test_open_with_keyword_mode(self):
123 | """Test open() with mode as keyword argument."""
124 | code = 'open("test.txt", mode="w")'
125 | tree = ast.parse(code)
126 | node = tree.body[0].value
127 | result = get_open_file_info(node, "test.py")
128 | assert result is not None
129 | assert result.filename == "test.txt"
130 | assert result.is_read is False
131 | assert result.is_write is True
132 |
133 | def test_open_with_append_mode(self):
134 | """Test open() with append mode."""
135 | code = 'open("test.txt", "a")'
136 | tree = ast.parse(code)
137 | node = tree.body[0].value
138 | result = get_open_file_info(node, "test.py")
139 | assert result is not None
140 | assert result.is_write is True
141 |
142 | def test_open_with_read_write_mode(self):
143 | """Test open() with r+ mode."""
144 | code = 'open("test.txt", "r+")'
145 | tree = ast.parse(code)
146 | node = tree.body[0].value
147 | result = get_open_file_info(node, "test.py")
148 | assert result is not None
149 | assert result.is_read is True
150 | assert result.is_write is True
151 |
152 |
153 | class TestGetPandasFileInfo:
154 | """Test extraction of file info from pandas operations."""
155 |
156 | def test_pandas_read_without_args(self):
157 | """Test pd.read_csv() without arguments returns None."""
158 | code = "pd.read_csv()"
159 | tree = ast.parse(code)
160 | node = tree.body[0].value
161 | result = get_pandas_file_info(node, "test.py")
162 | assert result is None
163 |
164 | def test_pandas_non_file_method(self):
165 | """Test pandas method that doesn't read/write files."""
166 | code = "pd.concat([df1, df2])"
167 | tree = ast.parse(code)
168 | node = tree.body[0].value
169 | result = get_pandas_file_info(node, "test.py")
170 | assert result is None
171 |
172 | def test_dataframe_to_csv_without_args(self):
173 | """Test df.to_csv() without arguments returns None."""
174 | code = "df.to_csv()"
175 | tree = ast.parse(code)
176 | node = tree.body[0].value
177 | result = get_pandas_file_info(node, "test.py")
178 | assert result is None
179 |
180 | def test_dataframe_to_sql(self):
181 | """Test df.to_sql() returns None (database, not file)."""
182 | code = 'df.to_sql("table", conn)'
183 | tree = ast.parse(code)
184 | node = tree.body[0].value
185 | result = get_pandas_file_info(node, "test.py")
186 | assert result is None
187 |
188 | def test_pandas_read_sql(self):
189 | """Test pd.read_sql() returns None (database, not file)."""
190 | code = 'pd.read_sql("SELECT * FROM table", conn)'
191 | tree = ast.parse(code)
192 | node = tree.body[0].value
193 | result = get_pandas_file_info(node, "test.py")
194 | assert result is None
195 |
196 |
197 | class TestGetMatplotlibFileInfo:
198 | """Test extraction of file info from matplotlib operations."""
199 |
200 | def test_savefig_with_path_object(self):
201 | """Test plt.savefig() with Path object."""
202 | code = 'plt.savefig(Path("plot.png"))'
203 | tree = ast.parse(code)
204 | node = tree.body[0].value
205 | result = get_matplotlib_file_info(node, "test.py")
206 | assert result is not None
207 | assert result.filename == "plot.png"
208 | assert result.is_write is True
209 |
210 |
211 | class TestGetSQLAlchemyInfo:
212 | """Test extraction of database info from SQLAlchemy operations."""
213 |
214 | def test_create_engine_with_string(self):
215 | """Test create_engine with connection string."""
216 | code = 'create_engine("sqlite:///test.db")'
217 | tree = ast.parse(code)
218 | node = tree.body[0].value
219 | result = get_sqlalchemy_info(node, "test.py")
220 | assert result is not None
221 | assert result.db_type == "sqlite"
222 | assert result.connection_string == "sqlite:///test.db"
223 |
224 | def test_create_engine_postgresql(self):
225 | """Test create_engine with PostgreSQL connection."""
226 | code = 'create_engine("postgresql://user:pass@localhost/mydb")'
227 | tree = ast.parse(code)
228 | node = tree.body[0].value
229 | result = get_sqlalchemy_info(node, "test.py")
230 | assert result is not None
231 | assert result.db_type == "postgresql"
232 | assert result.db_name == "mydb"
233 |
234 | def test_create_engine_mysql(self):
235 | """Test create_engine with MySQL connection."""
236 | code = 'create_engine("mysql://user:pass@localhost/mydb")'
237 | tree = ast.parse(code)
238 | node = tree.body[0].value
239 | result = get_sqlalchemy_info(node, "test.py")
240 | assert result is not None
241 | assert result.db_type == "mysql"
242 | assert result.db_name == "mydb"
243 |
244 | def test_create_engine_mssql(self):
245 | """Test create_engine with MSSQL connection."""
246 | code = 'create_engine("mssql://user:pass@localhost/mydb")'
247 | tree = ast.parse(code)
248 | node = tree.body[0].value
249 | result = get_sqlalchemy_info(node, "test.py")
250 | assert result is not None
251 | assert result.db_type == "mssql"
252 |
253 |
254 | class TestGetPandasSQLInfo:
255 | """Test extraction of database info from pandas SQL operations."""
256 |
257 | def test_read_sql_with_connection_string(self):
258 | """Test pd.read_sql with connection string."""
259 | code = 'pd.read_sql("SELECT * FROM table", con="sqlite:///test.db")'
260 | tree = ast.parse(code)
261 | node = tree.body[0].value
262 | result = get_pandas_sql_info(node, "test.py")
263 | assert result is not None
264 | assert result.db_type == "sqlite"
265 | assert result.is_read is True
266 |
267 | def test_read_sql_with_postgresql_connection(self):
268 | """Test pd.read_sql with PostgreSQL connection."""
269 | code = 'pd.read_sql("SELECT * FROM table", con="postgresql://localhost/mydb")'
270 | tree = ast.parse(code)
271 | node = tree.body[0].value
272 | result = get_pandas_sql_info(node, "test.py")
273 | assert result is not None
274 | assert result.db_type == "postgresql"
275 | assert result.db_name == "mydb"
276 |
277 | def test_read_sql_with_mysql_connection(self):
278 | """Test pd.read_sql with MySQL connection."""
279 | code = 'pd.read_sql("SELECT * FROM table", con="mysql://localhost/mydb")'
280 | tree = ast.parse(code)
281 | node = tree.body[0].value
282 | result = get_pandas_sql_info(node, "test.py")
283 | assert result is not None
284 | assert result.db_type == "mysql"
285 |
286 | def test_read_sql_with_mssql_connection(self):
287 | """Test pd.read_sql with MSSQL ODBC connection."""
288 | code = 'pd.read_sql("SELECT * FROM table", con="Driver={SQL Server};Server=localhost;Database=mydb")'
289 | tree = ast.parse(code)
290 | node = tree.body[0].value
291 | result = get_pandas_sql_info(node, "test.py")
292 | assert result is not None
293 | assert result.db_type == "mssql"
294 | assert result.db_name == "mydb"
295 |
296 | def test_read_sql_with_variable_connection(self):
297 | """Test pd.read_sql with variable connection."""
298 | code = 'pd.read_sql("SELECT * FROM table", conn_var)'
299 | tree = ast.parse(code)
300 | node = tree.body[0].value
301 | result = get_pandas_sql_info(node, "test.py")
302 | # Should still return a DatabaseInfo but without connection details
303 | assert result is not None
304 |
305 |
306 | class TestGetDirectDBDriverInfo:
307 | """Test extraction of database info from direct database drivers."""
308 |
309 | def test_sqlite3_connect(self):
310 | """Test sqlite3.connect() call."""
311 | code = 'sqlite3.connect("test.db")'
312 | tree = ast.parse(code)
313 | node = tree.body[0].value
314 | result = get_direct_db_driver_info(node, "test.py")
315 | assert result is not None
316 | assert result.db_type == "sqlite"
317 | assert result.db_name == "test.db"
318 |
319 | def test_psycopg2_connect(self):
320 | """Test psycopg2.connect() call."""
321 | code = 'psycopg2.connect("dbname=mydb user=postgres")'
322 | tree = ast.parse(code)
323 | node = tree.body[0].value
324 | result = get_direct_db_driver_info(node, "test.py")
325 | assert result is not None
326 | assert result.db_type == "postgresql"
327 |
328 | def test_pymysql_connect(self):
329 | """Test pymysql.connect() call."""
330 | code = 'pymysql.connect(host="localhost", database="mydb")'
331 | tree = ast.parse(code)
332 | node = tree.body[0].value
333 | result = get_direct_db_driver_info(node, "test.py")
334 | assert result is not None
335 | assert result.db_type == "mysql"
336 |
337 | def test_pyodbc_connect(self):
338 | """Test pyodbc.connect() call."""
339 | code = 'pyodbc.connect("Driver={SQL Server};Server=localhost;Database=mydb")'
340 | tree = ast.parse(code)
341 | node = tree.body[0].value
342 | result = get_direct_db_driver_info(node, "test.py")
343 | assert result is not None
344 | assert result.db_type == "mssql"
345 |
346 |
347 | class TestDatabaseOperationFinder:
348 | """Test the DatabaseOperationFinder AST visitor."""
349 |
350 | def test_sqlalchemy_engine_tracking(self):
351 | """Test that SQLAlchemy engines are tracked correctly."""
352 | code = """
353 | import sqlalchemy as sa
354 | engine = sa.create_engine("sqlite:///test.db")
355 | df = pd.read_sql("SELECT * FROM table", engine)
356 | """
357 | tree = ast.parse(code)
358 | finder = DatabaseOperationFinder("test.py")
359 | finder.visit(tree)
360 |
361 |         # Should find at least the engine creation; read_sql may add a second
362 | assert len(finder.database_operations) >= 1
363 |
364 | def test_direct_create_engine(self):
365 | """Test direct create_engine call tracking."""
366 | code = """
367 | from sqlalchemy import create_engine
368 | engine = create_engine("postgresql://localhost/mydb")
369 | """
370 | tree = ast.parse(code)
371 | finder = DatabaseOperationFinder("test.py")
372 | finder.visit(tree)
373 |
374 | assert len(finder.database_operations) >= 1
375 | # Check that engine was registered
376 | assert "engine" in finder.sqlalchemy_engines
377 |
378 | def test_connection_variable_tracking(self):
379 | """Test that database connections are tracked correctly."""
380 | code = """
381 | import sqlite3
382 | conn = sqlite3.connect("test.db")
383 | df = pd.read_sql("SELECT * FROM table", conn)
384 | """
385 | tree = ast.parse(code)
386 | finder = DatabaseOperationFinder("test.py")
387 | finder.visit(tree)
388 |
389 | # Should track the connection
390 | assert len(finder.database_operations) >= 1
391 |
392 | def test_to_sql_with_engine(self):
393 | """Test df.to_sql() with SQLAlchemy engine."""
394 | code = """
395 | import sqlalchemy as sa
396 | engine = sa.create_engine("sqlite:///test.db")
397 | df.to_sql("table_name", engine)
398 | """
399 | tree = ast.parse(code)
400 | finder = DatabaseOperationFinder("test.py")
401 | finder.visit(tree)
402 |
403 | # Should find both engine creation and to_sql operation
404 | assert len(finder.database_operations) >= 1
405 |
406 | def test_to_sql_with_connection(self):
407 | """Test df.to_sql() with database connection."""
408 | code = """
409 | import sqlite3
410 | conn = sqlite3.connect("test.db")
411 | df.to_sql("table_name", conn)
412 | """
413 | tree = ast.parse(code)
414 | finder = DatabaseOperationFinder("test.py")
415 | finder.visit(tree)
416 |
417 | assert len(finder.database_operations) >= 1
418 |
419 | def test_to_sql_without_tracked_connection(self):
420 | """Test df.to_sql() without a tracked connection variable."""
421 | code = """
422 | df.to_sql("table_name", "sqlite:///test.db")
423 | """
424 | tree = ast.parse(code)
425 | finder = DatabaseOperationFinder("test.py")
426 | finder.visit(tree)
427 |
428 |         # A plain string isn't a tracked connection variable; just smoke-test the visit
429 |         assert len(finder.database_operations) >= 0
430 |
431 |
432 | class TestFileOperationFinder:
433 | """Test the FileOperationFinder AST visitor."""
434 |
435 | def test_multiple_file_operations(self):
436 | """Test finding multiple file operations in one script."""
437 | code = """
438 | with open("input.txt", "r") as f:
439 | data = f.read()
440 |
441 | with open("output.txt", "w") as f:
442 | f.write(data)
443 |
444 | df = pd.read_csv("data.csv")
445 | df.to_excel("output.xlsx")
446 | """
447 | tree = ast.parse(code)
448 | finder = FileOperationFinder("test.py")
449 | finder.visit(tree)
450 |
451 | # Should find all 4 file operations
452 | assert len(finder.file_operations) >= 4
453 |
454 |
455 | class TestModuleImportFinder:
456 | """Test the ModuleImportFinder AST visitor."""
457 |
458 | def test_import_tracking(self):
459 | """Test that imports are tracked correctly."""
460 | code = """
461 | import pandas as pd
462 | from pathlib import Path
463 | import numpy
464 | """
465 | tree = ast.parse(code)
466 | # ModuleImportFinder requires project_modules parameter
467 | finder = ModuleImportFinder("test.py", project_modules=set())
468 | finder.visit(tree)
469 |
470 | # Should find all imports
471 | assert len(finder.imports) >= 3
472 |
--------------------------------------------------------------------------------
/src/smartrappy/reporters.py:
--------------------------------------------------------------------------------
1 | """Reporters for smartrappy analysis results."""
2 |
3 | import json
4 | import os
5 | from abc import ABC, abstractmethod
6 | from typing import Dict, Optional, Set
7 |
8 | from graphviz import Digraph
9 | from rich.console import Console
10 | from rich.text import Text
11 | from rich.tree import Tree
12 |
13 | from smartrappy.models import NodeType, ProjectModel
14 |
15 |
16 | class Reporter(ABC):
17 | """Base class for all reporters."""
18 |
19 | @abstractmethod
20 | def generate_report(
21 | self, model: ProjectModel, output_path: Optional[str] = None
22 | ) -> None:
23 | """Generate a report from the project model."""
24 | pass
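
    # A reporter only has to implement generate_report; a minimal illustrative
    # subclass (not part of the package):
    #     class CountReporter(Reporter):
    #         def generate_report(self, model, output_path=None):
    #             print(f"{len(model.nodes)} nodes, {len(model.edges)} edges")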
25 |
26 |
27 | class ConsoleReporter(Reporter):
28 | """Report analysis results to the console."""
29 |
30 | def generate_report(
31 | self, model: ProjectModel, output_path: Optional[str] = None
32 | ) -> None:
33 | """Generate a console report from the project model."""
34 | console = Console()
35 |
36 | # Print header
37 | console.print(
38 | "\n[bold cyan]File Operations, Database Operations, and Import Analysis[/bold cyan]"
39 | )
40 | console.print("=" * 80)
41 |
42 | # Print file operations
43 | for filename, file_ops in sorted(model.file_operations.items()):
44 | console.print(f"\n[bold]File:[/bold] {filename}")
45 | has_read = any(op.is_read for op in file_ops)
46 | has_write = any(op.is_write for op in file_ops)
47 | op_type = (
48 | "READ/WRITE"
49 | if has_read and has_write
50 | else ("READ" if has_read else "WRITE")
51 | )
52 | console.print(f"[bold]Operation:[/bold] {op_type}")
53 | console.print("[bold]Referenced in:[/bold]")
54 | sources = sorted(set(op.source_file for op in file_ops))
55 | for source in sources:
56 | console.print(f" - {source}")
57 |
58 | if model.database_operations:
59 | console.print("\n[bold purple]💽 Database Operations:[/bold purple]")
60 | for db_name, db_ops in sorted(model.database_operations.items()):
61 | console.print(f"\n[bold]Database:[/bold] {db_name}")
62 | db_type = db_ops[0].db_type # Get type from first operation
63 | console.print(f"[bold]Type:[/bold] {db_type}")
64 |
65 | has_read = any(op.is_read for op in db_ops)
66 | has_write = any(op.is_write for op in db_ops)
67 | op_type = (
68 | "READ/WRITE"
69 | if has_read and has_write
70 | else ("READ" if has_read else "WRITE")
71 | )
72 | console.print(f"[bold]Operation:[/bold] {op_type}")
73 |
74 | console.print("[bold]Referenced in:[/bold]")
75 | sources = sorted(set(op.source_file for op in db_ops))
76 | for source in sources:
77 | console.print(f" - {source}")
78 |
79 | # Print import analysis
80 | console.print("\n[bold]Module Imports:[/bold]")
81 | for script, script_imports in sorted(model.imports.items()):
82 | if script_imports:
83 | script_name = os.path.basename(script)
84 | console.print(f"\n[bold]Script:[/bold] {script_name}")
85 | for imp in script_imports:
86 |                 # Get the module's base name for display
87 |                 module_display = os.path.basename(imp.module_name.replace(".", "/"))
90 |
91 | import_type = "from" if imp.is_from_import else "import"
92 | module_type = (
93 | "[blue]internal[/blue]"
94 | if imp.is_internal
95 | else "[red]external[/red]"
96 | )
97 |
98 | # For 'from' imports, show as module:imported_names
99 | if imp.is_from_import:
100 | detailed_imports = [
101 | f"{module_display}:{name}" for name in imp.imported_names
102 | ]
103 | detailed_str = ", ".join(detailed_imports)
104 | console.print(
105 | f" - {import_type} {imp.module_name} → {detailed_str} [{module_type}]"
106 | )
107 | else:
108 | console.print(
109 | f" - {import_type} {module_display} [{module_type}]"
110 | )
111 |
112 | # Create and display terminal visualisation
113 | console.print("\n[bold cyan]Terminal Visualisation[/bold cyan]")
114 | tree = self._create_terminal_tree(model)
115 | console.print(tree)
116 |
117 | def _create_terminal_tree(self, model: ProjectModel) -> Tree:
118 | """Create a rich Tree visualisation of the dependency graph."""
119 | # Create the main tree
120 | tree = Tree("📦 Project Dependencies", guide_style="bold cyan")
121 |
122 | # Track all nodes and their dependencies
123 | dependencies: Dict[str, Set[str]] = {} # node_id -> set of dependency node_ids
124 |
125 | # Process edges to build dependency map
126 | for edge in model.edges:
127 | if edge.target not in dependencies:
128 | dependencies[edge.target] = set()
129 | dependencies[edge.target].add(edge.source)
130 |
131 |         # Find display roots: nodes that nothing else depends on (no outgoing edges)
132 | all_nodes = set(model.nodes.keys())
133 | dependency_targets = set()
134 | for deps in dependencies.values():
135 | dependency_targets.update(deps)
136 | root_nodes = all_nodes - dependency_targets
137 |
138 | # Helper function to get node style
139 | def get_node_style(node_type: str, name: str) -> Text:
140 | icons = {
141 | NodeType.SCRIPT: "📜",
142 | NodeType.EXTERNAL_MODULE: "📦",
143 | NodeType.INTERNAL_MODULE: "🔧",
144 | NodeType.DATA_FILE: "📄",
145 | NodeType.DATABASE: "💽",
146 | NodeType.QUARTO_DOCUMENT: "📰",
147 | NodeType.JUPYTER_NOTEBOOK: "📓",
148 | }
149 | colors = {
150 | NodeType.SCRIPT: "green",
151 | NodeType.EXTERNAL_MODULE: "red",
152 | NodeType.INTERNAL_MODULE: "blue",
153 | NodeType.DATA_FILE: "magenta",
154 | NodeType.DATABASE: "purple",
155 | NodeType.QUARTO_DOCUMENT: "cyan",
156 | NodeType.JUPYTER_NOTEBOOK: "yellow",
157 | }
158 | return Text(
159 | f"{icons.get(node_type, '❓')} {name}",
160 | style=colors.get(node_type, "white"),
161 | )
162 |
163 | # Helper function to recursively build tree
164 | def build_tree(node_id: str, seen: Set[str], parent_tree: Tree) -> None:
165 | if node_id in seen:
166 | return
167 |
168 | node = model.nodes[node_id]
169 | seen.add(node_id)
170 |
171 | # Add node to tree
172 | node_tree = parent_tree.add(get_node_style(node.type, node.name))
173 |
174 | # For database nodes, add type information
175 | if node.type == NodeType.DATABASE and "db_type" in node.metadata:
176 | node_tree.add(Text(f"Type: {node.metadata['db_type']}", "purple"))
177 |
178 | # Add dependencies
179 | for dep_id in sorted(dependencies.get(node_id, set())):
180 | if dep_id not in seen:
181 | build_tree(dep_id, seen.copy(), node_tree)
182 | else:
183 | # Show circular dependency
184 | dep_node = model.nodes[dep_id]
185 | node_tree.add(Text(f"↻ {dep_node.name} (circular)", "yellow"))
186 |
187 | # Build tree from each root node
188 | for root_id in sorted(root_nodes):
189 | build_tree(root_id, set(), tree)
190 |
191 | return tree
192 |
193 |
194 | class GraphvizReporter(Reporter):
195 | """Generate a Graphviz visualisation of the project model. Exports as PDF"""
196 |
197 | def generate_report(
198 | self, model: ProjectModel, output_path: Optional[str] = None
199 | ) -> None:
200 | """Generate a Graphviz visualisation from the project model."""
201 | if not output_path:
202 | output_path = "project_graph"
203 |
204 | # Create a new directed graph
205 | dot = Digraph(comment="Project Dependency Graph")
206 | dot.attr(rankdir="TB") # Top to bottom layout
207 |
208 | # Define node styles
209 | dot.attr("node", shape="box", style="filled")
210 |
211 | # Add nodes
212 | for node_id, node in model.nodes.items():
213 | if node.type == NodeType.SCRIPT:
214 | dot.node(
215 | node_id,
216 | node.name,
217 | fillcolor="#90EE90", # Light green
218 | color="#333333",
219 | penwidth="2.0",
220 | )
221 | elif node.type == NodeType.DATA_FILE:
222 | # Handle file status for data files
223 | if "status" in node.metadata:
224 | status = node.metadata["status"]
225 | if status.exists:
226 | mod_time = status.last_modified.strftime("%Y-%m-%d %H:%M:%S")
227 | label = f"{node.name}\nModified: {mod_time}"
228 | dot.node(
229 | node_id,
230 | label,
231 | fillcolor="#FFB6C1", # Light pink
232 | color="#333333",
233 | penwidth="2.0",
234 | )
235 | else:
236 | label = f"{node.name}\nFile does not exist"
237 | dot.node(
238 | node_id,
239 | label,
240 | fillcolor="#FFB6C1", # Light pink
241 | color="#FF0000", # Red border
242 | penwidth="3.0",
243 | style="filled,dashed",
244 | )
245 | else:
246 | dot.node(
247 | node_id,
248 | node.name,
249 | fillcolor="#FFB6C1", # Light pink
250 | color="#333333",
251 | penwidth="2.0",
252 | )
253 | elif node.type == NodeType.DATABASE:
254 | # Special styling for database nodes
255 | db_type = node.metadata.get("db_type", "unknown")
256 | label = f"{node.name}\nType: {db_type}" # Using node.name, not node_id
257 | dot.node(
258 | node_id,
259 | label,
260 | fillcolor="#B19CD9", # Light purple for databases
261 | color="#333333",
262 | penwidth="2.0",
263 | shape="cylinder", # Database shape
264 | )
265 | elif node.type == NodeType.INTERNAL_MODULE:
266 | # Handle imported item nodes with specific style
267 | if "imported_name" in node.metadata:
268 | dot.node(
269 | node_id,
270 | node.name,
271 | fillcolor="#ADD8E6", # Light blue for internal modules
272 | color="#333333",
273 | penwidth="2.0",
274 | shape="oval", # Use oval shape for imported items
275 | )
276 | else:
277 | dot.node(
278 | node_id,
279 | node.name,
280 | fillcolor="#ADD8E6", # Light blue for internal modules
281 | color="#333333",
282 | penwidth="2.0",
283 | )
284 | elif node.type == NodeType.EXTERNAL_MODULE:
285 | # Handle imported item nodes with specific style
286 | if "imported_name" in node.metadata:
287 | dot.node(
288 | node_id,
289 | node.name,
290 | fillcolor="#FFA07A", # Light salmon for external modules
291 | color="#333333",
292 | penwidth="2.0",
293 | shape="oval", # Use oval shape for imported items
294 | )
295 | else:
296 | dot.node(
297 | node_id,
298 | node.name,
299 | fillcolor="#FFA07A", # Light salmon for external modules
300 | color="#333333",
301 | penwidth="2.0",
302 | )
303 | elif node.type == NodeType.QUARTO_DOCUMENT:
304 | # Special styling for Quarto documents
305 | dot.node(
306 | node_id,
307 | node.name,
308 | fillcolor="#00CED1", # Dark turquoise for Quarto docs
309 | color="#333333",
310 | penwidth="2.0",
311 | )
312 | elif node.type == NodeType.JUPYTER_NOTEBOOK:
313 | # Special styling for Jupyter notebooks
314 | dot.node(
315 | node_id,
316 | node.name,
317 | fillcolor="#FFD700", # Gold for Jupyter notebooks
318 | color="#333333",
319 | penwidth="2.0",
320 | )
321 |
322 | # Add edges
323 | dot.attr("edge", color="#333333")
324 | for edge in model.edges:
325 | dot.edge(edge.source, edge.target)
326 |
327 | # Render the graph
328 | output_dir = os.path.dirname(output_path) or "."
329 | os.makedirs(output_dir, exist_ok=True)
330 |
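        # NB: render() shells out to the Graphviz `dot` executable, so Graphviz
        # itself must be installed and on PATH; the Python package alone is not
        # enough.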
331 | dot.render(output_path, view=False, format="pdf", cleanup=True)
332 | print(f"Graphviz visualisation saved as {output_path}.pdf")
333 |
334 |
335 | class MermaidReporter(Reporter):
336 | """Generate a Mermaid visualisation of the project model."""
337 |
338 | def generate_report(
339 | self, model: ProjectModel, output_path: Optional[str] = None
340 | ) -> None:
341 | """Generate a Mermaid diagram from the project model."""
342 | if not output_path:
343 | output_path = "project_diagram.md"
344 |
345 | # Generate Mermaid markup
346 | mermaid = [
347 | "graph TD",
348 | " %% Style definitions",
349 | " classDef scriptNode fill:#90EE90,stroke:#333,stroke-width:2px;",
350 | " classDef fileNode fill:#FFB6C1,stroke:#333,stroke-width:2px;",
351 | " classDef quartoNode fill:#00CED1,stroke:#333,stroke-width:2px;",
352 | " classDef notebookNode fill:#FFD700,stroke:#333,stroke-width:2px;",
353 | " classDef missingFile fill:#FFB6C1,stroke:#FF0000,stroke-width:3px,stroke-dasharray: 5 5;",
354 | " classDef internalModule fill:#ADD8E6,stroke:#333,stroke-width:2px;",
355 | " classDef externalModule fill:#FFA07A,stroke:#333,stroke-width:2px;",
356 | " classDef importedItem fill:#ADD8E6,stroke:#333,stroke-width:2px,shape:circle;",
357 | " classDef externalImportedItem fill:#FFA07A,stroke:#333,stroke-width:2px,shape:circle;",
358 | " classDef databaseNode fill:#B19CD9,stroke:#333,stroke-width:2px,shape:cylinder;",
359 | "",
360 | " %% Nodes",
361 | ]
362 |
363 | # Add nodes
364 | for node_id, node in model.nodes.items():
365 | if node.type == NodeType.SCRIPT:
366 | mermaid.append(f' {node_id}["{node.name}"]:::scriptNode')
367 | elif node.type == NodeType.DATA_FILE:
368 | # Handle file status for data files
369 | if "status" in node.metadata:
370 | status = node.metadata["status"]
371 | if status.exists:
372 | mod_time = status.last_modified.strftime("%Y-%m-%d %H:%M:%S")
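                    # Mermaid renders <br/> inside a quoted node label as a
                    # line break (likewise for the labels below).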
373 |                     label = f"{node.name}<br/>Modified: {mod_time}"
374 | mermaid.append(f' {node_id}["{label}"]:::fileNode')
375 | else:
376 |                     label = f"{node.name}<br/>File does not exist"
377 | mermaid.append(f' {node_id}["{label}"]:::missingFile')
378 | else:
379 | mermaid.append(f' {node_id}["{node.name}"]:::fileNode')
380 | elif node.type == NodeType.DATABASE:
381 | # Database nodes with specific styling
382 | db_type = node.metadata.get("db_type", "unknown")
383 |             label = f"{node.name}<br/>Type: {db_type}"
384 |             mermaid.append(f'    {node_id}[("{label}")]:::databaseNode')  # [("...")] renders as a cylinder
385 | elif node.type == NodeType.INTERNAL_MODULE:
386 | # Handle imported item nodes with specific style
387 | if "imported_name" in node.metadata:
388 | mermaid.append(f' {node_id}(("{node.name}")):::importedItem')
389 | else:
390 | mermaid.append(f' {node_id}["{node.name}"]:::internalModule')
391 | elif node.type == NodeType.EXTERNAL_MODULE:
392 | # Handle imported item nodes with specific style
393 | if "imported_name" in node.metadata:
394 | mermaid.append(
395 | f' {node_id}(("{node.name}")):::externalImportedItem'
396 | )
397 | else:
398 | mermaid.append(f' {node_id}["{node.name}"]:::externalModule')
399 | elif node.type == NodeType.QUARTO_DOCUMENT:
400 | mermaid.append(f' {node_id}["{node.name}"]:::quartoNode')
401 | elif node.type == NodeType.JUPYTER_NOTEBOOK:
402 | mermaid.append(f' {node_id}["{node.name}"]:::notebookNode')
403 |
404 | mermaid.append("")
405 | mermaid.append(" %% Relationships")
406 |
407 | # Add edges
408 | for edge in model.edges:
409 | mermaid.append(f" {edge.source} --> {edge.target}")
410 |
411 | # Create markdown file with mermaid diagram
412 | output_dir = os.path.dirname(output_path) or "."
413 | os.makedirs(output_dir, exist_ok=True)
414 |
415 | with open(output_path, "w") as f:
416 | f.write("# Project Dependency Diagram\n\n")
417 | f.write("```mermaid\n")
418 | f.write("\n".join(mermaid))
419 | f.write("\n```\n")
420 |
421 | print(f"Mermaid diagram saved as {output_path}")
422 |
423 |
424 | class JsonReporter(Reporter):
425 | """Generate a JSON representation of the project model."""
426 |
427 | def generate_report(
428 | self, model: ProjectModel, output_path: Optional[str] = None
429 | ) -> None:
430 | """Generate a JSON file from the project model or print to console if no path is given."""
431 | # Create a serializable representation of the model
432 | serializable = {"nodes": [], "edges": [], "file_operations": [], "imports": []}
433 |
434 | # Add nodes
435 | for node_id, node in model.nodes.items():
436 | # Skip external modules if internal_only is True
437 | if model.internal_only and node.type == NodeType.EXTERNAL_MODULE:
438 | continue
439 |
440 | node_data = {
441 | "id": node_id,
442 | "name": node.name,
443 | "type": node.type,
444 | "metadata": {},
445 | }
446 |
447 | # Handle file status for data files
448 | if node.type == NodeType.DATA_FILE and "status" in node.metadata:
449 | status = node.metadata["status"]
450 | node_data["metadata"]["exists"] = status.exists
451 | if status.last_modified:
452 | node_data["metadata"]["last_modified"] = (
453 | status.last_modified.isoformat()
454 | )
455 |
456 | serializable["nodes"].append(node_data)
457 |
458 | # Add edges
459 | for edge in model.edges:
460 | serializable["edges"].append(
461 | {"source": edge.source, "target": edge.target, "type": edge.type}
462 | )
463 |
464 | # Add file operations
465 | for filename, operations in model.file_operations.items():
466 | for op in operations:
467 | serializable["file_operations"].append(
468 | {
469 | "filename": op.filename,
470 | "is_read": op.is_read,
471 | "is_write": op.is_write,
472 | "source_file": op.source_file,
473 | }
474 | )
475 |
476 | # Add imports
477 | for source_file, imports in model.imports.items():
478 | for imp in imports:
479 | # Skip external modules if internal_only is True
480 | if model.internal_only and not imp.is_internal:
481 | continue
482 |
483 | serializable["imports"].append(
484 | {
485 | "module_name": imp.module_name,
486 | "source_file": imp.source_file,
487 | "is_from_import": imp.is_from_import,
488 | "imported_names": imp.imported_names,
489 | "is_internal": imp.is_internal,
490 | }
491 | )
492 |
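        # NB: database operations have no dedicated section in this JSON; once
        # build_graph() has run they still appear as DATABASE nodes plus their
        # read/write edges.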
493 | # If no output path specified, print to console with rich
494 | if output_path is None:
495 | console = Console()
496 | console.print("\n[bold cyan]JSON Representation[/bold cyan]")
497 | console.print("=" * 80)
498 | console.print_json(data=serializable, indent=2)
499 | else:
500 | # Write to file
501 | output_dir = os.path.dirname(output_path) or "."
502 | os.makedirs(output_dir, exist_ok=True)
503 |
504 | with open(output_path, "w") as f:
505 | json.dump(serializable, f, indent=2)
506 |
507 | print(f"JSON report saved as {output_path}")
508 |
509 |
510 | def get_reporter(format_type: str) -> Reporter:
511 | """
512 | Factory function to get the appropriate reporter.
513 |
514 | Args:
515 | format_type: The type of reporter to use ('console', 'graphviz', 'mermaid', or 'json')
516 |
517 | Returns:
518 | A Reporter instance
519 |
520 | Raises:
521 | ValueError: If the format type is not supported
522 | """
523 | reporters = {
524 | "console": ConsoleReporter(),
525 | "graphviz": GraphvizReporter(),
526 | "mermaid": MermaidReporter(),
527 | "json": JsonReporter(),
528 | }
529 |
530 | if format_type.lower() not in reporters:
531 | raise ValueError(
532 | f"Unsupported format: {format_type}. "
533 | f"Supported formats: {', '.join(reporters.keys())}"
534 | )
535 |
536 | return reporters[format_type.lower()]
537 |
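
# Usage sketch:
#     reporter = get_reporter("mermaid")
#     reporter.generate_report(model, "outputs/diagram.md")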
--------------------------------------------------------------------------------
/docs/output.svg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aeturrell/smartrappy/HEAD/docs/output.svg
--------------------------------------------------------------------------------