├── requirements.txt
├── pyproject.toml
├── README.md
├── etl_loader
    ├── __init__.py
    └── loader.py
├── tests
    └── test_loader.py
└── streamlit_app.py


/requirements.txt:
--------------------------------------------------------------------------------
1 | 
2 | pandas>=2.0
3 | pydantic>=2.5
4 | openpyxl>=3.1
5 | pytest>=8.0
6 | streamlit>=1.34
7 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | 
 2 | [project]
 3 | name = "etl-pydantic-pytest-demo"
 4 | version = "0.1.0"
 5 | requires-python = ">=3.10"
 6 | dependencies = [
 7 |     "pandas>=2.0",
 8 |     "pydantic>=2.5",
 9 |     "openpyxl>=3.1",
10 |     "pytest>=8.0",
11 | ]
12 | 
13 | [tool.pytest.ini_options]
14 | addopts = "-q"
15 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | 
 2 | # ETL Loader with Pydantic + Pytest (Demo)
 3 | 
 4 | This mini-project shows how to validate Excel-based financial transaction data with **pandas** + **pydantic**,
 5 | emit user-friendly errors, and test the whole pipeline with **pytest**.
 6 | 
 7 | ## Structure
 8 | - `etl_loader/loader.py` – loader and schema
 9 | - `tests/test_loader.py` – comprehensive unit tests
10 | 
11 | ## Quickstart
12 | ```bash
13 | pip install -U pandas pydantic openpyxl pytest
14 | pytest -q
15 | ```
16 | 
17 | Adjust the `EXPECTED` columns and the `TransactionRecord` model to match your real schema.
18 | 


--------------------------------------------------------------------------------
/etl_loader/__init__.py:
--------------------------------------------------------------------------------
 1 | 
 2 | from .loader import (
 3 |     ExcelTransactionLoader,
 4 |     TransactionRecord,
 5 |     ETLError,
 6 |     WrongFileTypeError,
 7 |     EmptyFileError,
 8 |     NoContentError,
 9 |     MultipleSheetsError,
10 |     MissingColumnsError,
11 |     DuplicateColumnsError,
12 |     CorruptFileError,
13 |     DataValidationError,
14 |     ErrorDetail,
15 |     configure_logging,
16 |     safe_load_transactions,
17 | )
18 | __all__ = [
19 |     "ExcelTransactionLoader",
20 |     "TransactionRecord",
21 |     "ETLError",
22 |     "WrongFileTypeError",
23 |     "EmptyFileError",
24 |     "NoContentError",
25 |     "MultipleSheetsError",
26 |     "MissingColumnsError",
27 |     "DuplicateColumnsError",
28 |     "CorruptFileError",
29 |     "DataValidationError",
30 |     "ErrorDetail",
31 |     "configure_logging",
32 |     "safe_load_transactions",
33 | ]
34 | 


--------------------------------------------------------------------------------
/tests/test_loader.py:
--------------------------------------------------------------------------------
  1 | 
  2 | import io
  3 | import os
  4 | from pathlib import Path
  5 | from decimal import Decimal
  6 | import pandas as pd
  7 | import pytest
  8 | 
  9 | from etl_loader import (
 10 |     ExcelTransactionLoader,
 11 |     TransactionRecord,
 12 |     WrongFileTypeError,
 13 |     EmptyFileError,
 14 |     NoContentError,
 15 |     MultipleSheetsError,
 16 |     MissingColumnsError,
 17 |     DuplicateColumnsError,
 18 |     CorruptFileError,
 19 |     DataValidationError,
 20 |     configure_logging,
 21 |     safe_load_transactions,
 22 | )
 23 | 
 24 | EXPECTED = ["date", "description", "amount", "currency", "account_id", "category"]
 25 | 
 26 | @pytest.fixture
 27 | def tmp_excel_dir(tmp_path: Path):
 28 |     log_dir = tmp_path / "logs"
 29 |     log_dir.mkdir(exist_ok=True, parents=True)
 30 |     configure_logging(log_dir)  # set up log files
 31 |     return tmp_path
 32 | 
 33 | def write_excel(path: Path, data: pd.DataFrame, sheet_name="Sheet1", extras=None):
 34 |     with pd.ExcelWriter(path) as writer:
 35 |         data.to_excel(writer, index=False, sheet_name=sheet_name)
 36 |         if extras:
 37 |             for name, df in extras.items():
 38 |                 df.to_excel(writer, index=False, sheet_name=name)
 39 | 
 40 | def valid_df(n=3):
 41 |     return pd.DataFrame({
 42 |         "date": pd.to_datetime(["2024-01-01","2024-01-02","2024-01-03"]).date,
 43 |         "description": ["Coffee","Lunch","Taxi"],
 44 |         "amount": [Decimal("3.50"), Decimal("12.00"), Decimal("25.00")],
 45 |         "currency": ["USD","USD","USD"],
 46 |         "account_id": ["A1","A1","A2"],
 47 |         "category": ["Food","Food","Travel"]
 48 |     })
 49 | 
 50 | def test_happy_path_single_sheet(tmp_excel_dir: Path):
 51 |     p = tmp_excel_dir / "ok.xlsx"
 52 |     write_excel(p, valid_df())
 53 |     loader = ExcelTransactionLoader(EXPECTED)
 54 |     df, errs = loader.load(p, fail_on_any_error=False)
 55 |     assert errs == []
 56 |     assert len(df) == 3
 57 |     assert set(df.columns) == set(EXPECTED)
 58 | 
 59 | def test_empty_file_0_bytes(tmp_excel_dir: Path):
 60 |     p = tmp_excel_dir / "empty.xlsx"
 61 |     p.write_bytes(b"")  # 0-byte file
 62 |     loader = ExcelTransactionLoader(EXPECTED)
 63 |     with pytest.raises(EmptyFileError):
 64 |         loader.load(p)
 65 | 
 66 | def test_headers_but_no_rows(tmp_excel_dir: Path):
 67 |     p = tmp_excel_dir / "headers_only.xlsx"
 68 |     df = pd.DataFrame(columns=EXPECTED)
 69 |     write_excel(p, df)
 70 |     loader = ExcelTransactionLoader(EXPECTED)
 71 |     with pytest.raises(NoContentError):
 72 |         loader.load(p)
 73 | 
 74 | def test_multiple_sheets_requires_explicit_sheet(tmp_excel_dir: Path):
 75 |     p = tmp_excel_dir / "multi.xlsx"
 76 |     write_excel(p, valid_df(), extras={"Other": valid_df()})
 77 |     loader = ExcelTransactionLoader(EXPECTED)
 78 |     with pytest.raises(MultipleSheetsError):
 79 |         loader.load(p)
 80 | 
 81 | def test_multiple_sheets_with_sheet_selected(tmp_excel_dir: Path):
 82 |     p = tmp_excel_dir / "multi_ok.xlsx"
 83 |     write_excel(p, valid_df(), extras={"Other": valid_df()})
 84 |     loader = ExcelTransactionLoader(EXPECTED)
 85 |     df, errs = loader.load(p, sheet="Sheet1", fail_on_any_error=False)
 86 |     assert len(df) == 3
 87 |     assert errs == []
 88 | 
 89 | def test_wrong_file_type(tmp_excel_dir: Path):
 90 |     p = tmp_excel_dir / "not_excel.csv"
 91 |     valid_df().to_csv(p, index=False)
 92 |     loader = ExcelTransactionLoader(EXPECTED)
 93 |     with pytest.raises(WrongFileTypeError):
 94 |         loader.load(p)
 95 | 
 96 | def test_corrupt_excel(tmp_excel_dir: Path):
 97 |     p = tmp_excel_dir / "corrupt.xlsx"
 98 |     p.write_bytes(os.urandom(256))  # random bytes
 99 |     loader = ExcelTransactionLoader(EXPECTED)
100 |     with pytest.raises(CorruptFileError):
101 |         loader.load(p, sheet=0)
102 | 
103 | def test_missing_required_columns(tmp_excel_dir: Path):
104 |     p = tmp_excel_dir / "missing_cols.xlsx"
105 |     df = valid_df().drop(columns=["account_id"])
106 |     write_excel(p, df)
107 |     loader = ExcelTransactionLoader(EXPECTED)
108 |     with pytest.raises(MissingColumnsError):
109 |         loader.load(p)
110 | 
111 | def test_duplicate_columns(tmp_excel_dir: Path):
112 |     p = tmp_excel_dir / "dup_cols.xlsx"
113 |     df = valid_df()
114 |     # create duplicate by renaming a column to existing name
115 |     df.columns = ["date","description","amount","currency","date","category"]
116 |     write_excel(p, df)
117 |     loader = ExcelTransactionLoader(EXPECTED)
118 |     with pytest.raises(DuplicateColumnsError):
119 |         loader.load(p)
120 | 
121 | def test_bad_data_rows_are_reported(tmp_excel_dir: Path):
122 |     p = tmp_excel_dir / "bad_rows.xlsx"
123 |     df = valid_df()
124 |     # introduce bad values
125 |     df.loc[1, "amount"] = "abc"       # invalid number
126 |     df.loc[2, "currency"] = "usd"     # lowercase not allowed
127 |     write_excel(p, df)
128 |     loader = ExcelTransactionLoader(EXPECTED)
129 |     with pytest.raises(DataValidationError) as exc:
130 |         loader.load(p)
131 |     err = exc.value
132 |     # Expect two bad rows (index 1 and 2)
133 |     assert len(err.errors) >= 2
134 |     fields = [e.field for e in err.errors]
135 |     assert "amount" in fields
136 |     assert "currency" in fields
137 | 
138 | def test_fail_on_any_error_false_returns_valid_and_errors(tmp_excel_dir: Path):
139 |     p = tmp_excel_dir / "bad_rows2.xlsx"
140 |     df = valid_df()
141 |     df.loc[1, "amount"] = "NaN"
142 |     write_excel(p, df)
143 |     loader = ExcelTransactionLoader(EXPECTED)
144 |     valid, errors = loader.load(p, fail_on_any_error=False)
145 |     # 2 valid rows, 1 invalid
146 |     assert len(valid) == 2
147 |     assert len(errors) == 1
148 | 
149 | def test_safe_loader_user_payload(tmp_excel_dir: Path):
150 |     p = tmp_excel_dir / "safe.xlsx"
151 |     df = valid_df()
152 |     df.loc[0, "currency"] = "usd"  # invalid
153 |     write_excel(p, df)
154 |     payload = safe_load_transactions(p, EXPECTED, log_dir=tmp_excel_dir, fail_on_any_error=True)
155 |     assert payload["ok"] is False
156 |     assert payload["type"] == "DataValidationError"
157 |     assert "errors" in payload and len(payload["errors"]) >= 1
158 | 


--------------------------------------------------------------------------------
/streamlit_app.py:
--------------------------------------------------------------------------------
  1 | 
  2 | import io
  3 | import os
  4 | from pathlib import Path
  5 | from datetime import datetime
  6 | from typing import List, Dict, Any
  7 | 
  8 | import streamlit as st
  9 | import pandas as pd
 10 | from pydantic import ValidationError
 11 | 
 12 | from etl_loader import (
 13 |     configure_logging,
 14 |     ExcelTransactionLoader,
 15 |     DataValidationError,
 16 |     ETLError,
 17 |     TransactionRecord,
 18 | )
 19 | 
 20 | st.set_page_config(page_title="ETL Validator (Pydantic + Pytest)", layout="wide")
 21 | 
 22 | # ------------------------------
 23 | # Setup & Session State
 24 | # ------------------------------
 25 | 
 26 | BASE_DIR = Path(__file__).parent
 27 | LOG_DIR = BASE_DIR / "logs"
 28 | SAMPLE_DIR = BASE_DIR / "sample_data"
 29 | LOG_DIR.mkdir(exist_ok=True, parents=True)
 30 | SAMPLE_DIR.mkdir(exist_ok=True, parents=True)
 31 | 
 32 | # configure logger once per session
 33 | if "logger_ready" not in st.session_state:
 34 |     configure_logging(LOG_DIR)
 35 |     st.session_state["logger_ready"] = True
 36 | 
 37 | DEFAULT_COLUMNS = "date, description, amount, currency, account_id, category"
 38 | 
 39 | def parse_columns(s: str) -> List[str]:
 40 |     return [c.strip() for c in s.split(",") if c.strip()]
 41 | 
 42 | def schema_markdown() -> str:
 43 |     schema = TransactionRecord.model_json_schema()
 44 |     lines = ["### Pydantic Schema (TransactionRecord)"]
 45 |     lines.append("```json")
 46 |     import json
 47 |     lines.append(json.dumps(schema, indent=2, default=str))
 48 |     lines.append("```")
 49 |     return "\n".join(lines)
 50 | 
 51 | def tail_text(path: Path, lines: int = 400) -> str:
 52 |     if not path.exists():
 53 |         return "(no log file yet)"
 54 |     with path.open("r", encoding="utf-8", errors="ignore") as f:
 55 |         buf = f.readlines()
 56 |     return "".join(buf[-lines:]) if buf else "(empty log)"
 57 | 
 58 | def run_pytest() -> Dict[str, Any]:
 59 |     """Run pytest programmatically and capture output."""
 60 |     import pytest, sys
 61 |     import contextlib
 62 |     stdout = io.StringIO()
 63 |     stderr = io.StringIO()
 64 |     with contextlib.redirect_stdout(stdout), contextlib.redirect_stderr(stderr):
 65 |         # -q for quiet, but still summary; add -k to filter if ever needed
 66 |         exit_code = pytest.main(["-q"])
 67 |     return {
 68 |         "exit_code": exit_code,
 69 |         "stdout": stdout.getvalue(),
 70 |         "stderr": stderr.getvalue(),
 71 |     }
 72 | 
 73 | def save_uploaded_file(uploaded, suffix: str) -> Path:
 74 |     ext = os.path.splitext(uploaded.name)[1] or suffix
 75 |     temp_name = f"upload_{datetime.utcnow().strftime('%Y%m%d_%H%M%S_%f')}{ext}"
 76 |     p = BASE_DIR / temp_name
 77 |     with open(p, "wb") as f:
 78 |         f.write(uploaded.getbuffer())
 79 |     return p
 80 | 
 81 | def generate_samples():
 82 |     import numpy as np
 83 |     from decimal import Decimal
 84 | 
 85 |     # valid
 86 |     df_valid = pd.DataFrame({
 87 |         "date": pd.to_datetime(["2024-01-01","2024-01-02","2024-01-03"]).date,
 88 |         "description": ["Coffee","Lunch","Taxi"],
 89 |         "amount": [Decimal("3.50"), Decimal("12.00"), Decimal("25.00")],
 90 |         "currency": ["USD","USD","USD"],
 91 |         "account_id": ["A1","A1","A2"],
 92 |         "category": ["Food","Food","Travel"]
 93 |     })
 94 |     (SAMPLE_DIR / "valid.xlsx").unlink(missing_ok=True)
 95 |     with pd.ExcelWriter(SAMPLE_DIR / "valid.xlsx") as w:
 96 |         df_valid.to_excel(w, sheet_name="Sheet1", index=False)
 97 | 
 98 |     # headers only
 99 |     df_headers = pd.DataFrame(columns=["date","description","amount","currency","account_id","category"])
100 |     (SAMPLE_DIR / "headers_only.xlsx").unlink(missing_ok=True)
101 |     with pd.ExcelWriter(SAMPLE_DIR / "headers_only.xlsx") as w:
102 |         df_headers.to_excel(w, sheet_name="Sheet1", index=False)
103 | 
104 |     # multiple sheets
105 |     (SAMPLE_DIR / "multi.xlsx").unlink(missing_ok=True)
106 |     with pd.ExcelWriter(SAMPLE_DIR / "multi.xlsx") as w:
107 |         df_valid.to_excel(w, sheet_name="Sheet1", index=False)
108 |         df_valid.to_excel(w, sheet_name="Other", index=False)
109 | 
110 |     # missing columns
111 |     df_missing = df_valid.drop(columns=["account_id"])
112 |     (SAMPLE_DIR / "missing_cols.xlsx").unlink(missing_ok=True)
113 |     with pd.ExcelWriter(SAMPLE_DIR / "missing_cols.xlsx") as w:
114 |         df_missing.to_excel(w, sheet_name="Sheet1", index=False)
115 | 
116 |     # duplicate columns
117 |     df_dup = df_valid.copy()
118 |     df_dup.columns = ["date","description","amount","currency","date","category"]
119 |     (SAMPLE_DIR / "dup_cols.xlsx").unlink(missing_ok=True)
120 |     with pd.ExcelWriter(SAMPLE_DIR / "dup_cols.xlsx") as w:
121 |         df_dup.to_excel(w, sheet_name="Sheet1", index=False)
122 | 
123 |     # bad rows
124 |     df_bad = df_valid.copy()
125 |     df_bad.loc[1, "amount"] = "abc"       # invalid number
126 |     df_bad.loc[2, "currency"] = "usd"     # lowercase invalid
127 |     (SAMPLE_DIR / "bad_rows.xlsx").unlink(missing_ok=True)
128 |     with pd.ExcelWriter(SAMPLE_DIR / "bad_rows.xlsx") as w:
129 |         df_bad.to_excel(w, sheet_name="Sheet1", index=False)
130 | 
131 |     # wrong type (csv)
132 |     df_valid.to_csv(SAMPLE_DIR / "not_excel.csv", index=False)
133 | 
134 |     # corrupt file
135 |     (SAMPLE_DIR / "corrupt.xlsx").write_bytes(os.urandom(256))
136 | 
137 |     # empty (0 bytes)
138 |     (SAMPLE_DIR / "empty.xlsx").write_bytes(b"")
139 | 
140 | # ------------------------------
141 | # UI
142 | # ------------------------------
143 | 
144 | st.title("ETL Validator • Pydantic + Pandas + Pytest")
145 | st.caption("Validate Excel-based financial transaction data. View logs. Run tests.")
146 | 
147 | tabs = st.tabs(["🔍 Upload & Validate", "🧪 Run Test Suite", "📜 Logs", "⚙️ Settings & Schema", "📦 Sample Files"])
148 | 
149 | # ---- Upload & Validate
150 | with tabs[0]:
151 |     st.subheader("Validate an uploaded file")
152 |     cols_input = st.text_input("Expected columns (comma-separated):", value=DEFAULT_COLUMNS)
153 |     expected_cols = parse_columns(cols_input)
154 | 
155 |     fail_any = st.checkbox("Fail on any invalid row (raise error)", value=True)
156 | 
157 |     uploaded = st.file_uploader(
158 |         "Upload an Excel file (.xlsx, .xls, .xlsm). You can also upload a .csv to trigger the 'wrong file type' path.",
159 |         type=["xlsx", "xls", "xlsm", "csv"],
160 |     )
161 | 
162 |     sheet_name = None
163 |     if uploaded and uploaded.name.lower().endswith((".xlsx",".xls",".xlsm")):
164 |         # Try to peek sheets for selection
165 |         try:
166 |             tmp = save_uploaded_file(uploaded, suffix=".xlsx")
167 |             xl = pd.ExcelFile(tmp)
168 |             sheets = xl.sheet_names
169 |             if len(sheets) > 1:
170 |                 sheet_name = st.selectbox("Select sheet", sheets, index=0, key="sheet_select")
171 |             else:
172 |                 sheet_name = sheets[0] if sheets else None
173 |         except Exception as e:
174 |             st.info("Couldn't read sheet names (maybe corrupt or wrong type). You can still try validating to see proper errors.")
175 |             tmp = save_uploaded_file(uploaded, suffix=".xlsx")
176 |     elif uploaded:
177 |         tmp = save_uploaded_file(uploaded, suffix=".xlsx")
178 |     else:
179 |         tmp = None
180 | 
181 |     if st.button("Validate file", disabled=(tmp is None)):
182 |         loader = ExcelTransactionLoader(expected_cols)
183 |         try:
184 |             valid_df, errs = loader.load(tmp, sheet=sheet_name, fail_on_any_error=fail_any)
185 |             st.success(f"Validation OK ✅ | Valid rows: {len(valid_df)} | Invalid rows: {len(errs)}")
186 |             if len(valid_df) > 0:
187 |                 st.dataframe(valid_df.head(50))
188 |             if errs:
189 |                 st.subheader("Row-level validation messages")
190 |                 st.dataframe(pd.DataFrame([e.to_dict() for e in errs]))
191 |         except DataValidationError as dve:
192 |             st.error(f"{dve.__class__.__name__}: {dve.message}")
193 |             err_df = pd.DataFrame([e.to_dict() for e in dve.errors])
194 |             st.dataframe(err_df)
195 |         except ETLError as ee:
196 |             st.error(f"{ee.__class__.__name__}: {ee.message}")
197 |             st.json(ee.to_dict())
198 |         except Exception as ex:
199 |             st.exception(ex)
200 | 
201 | # ---- Run Test Suite
202 | with tabs[1]:
203 |     st.subheader("Pytest: run the built-in test suite")
204 |     st.write("Runs the unit tests shipped with this repo (happy path + all edge cases).")
205 |     if st.button("Run tests now"):
206 |         result = run_pytest()
207 |         ok = (result["exit_code"] == 0)
208 |         st.write("Exit code:", result["exit_code"])
209 |         if ok:
210 |             st.success("All tests passed ✔️")
211 |         else:
212 |             st.error("Some tests failed ❌")
213 |         st.subheader("stdout")
214 |         st.code(result["stdout"] or "(empty)")
215 |         st.subheader("stderr")
216 |         st.code(result["stderr"] or "(empty)")
217 | 
218 | # ---- Logs
219 | with tabs[2]:
220 |     st.subheader("Logs (auto-generated by the loader)")
221 |     info_log = LOG_DIR / "etl.log"
222 |     err_log = LOG_DIR / "etl_errors.log"
223 | 
224 |     cols = st.columns(2)
225 |     with cols[0]:
226 |         st.markdown("**etl.log (INFO+)**")
227 |         st.download_button("Download etl.log", data=info_log.read_bytes() if info_log.exists() else b"", file_name="etl.log")
228 |         st.text_area("Tail of etl.log", tail_text(info_log), height=300)
229 | 
230 |     with cols[1]:
231 |         st.markdown("**etl_errors.log (ERROR+)**")
232 |         st.download_button("Download etl_errors.log", data=err_log.read_bytes() if err_log.exists() else b"", file_name="etl_errors.log")
233 |         st.text_area("Tail of etl_errors.log", tail_text(err_log), height=300)
234 | 
235 |     if st.button("Clear logs"):
236 |         for p in [info_log, err_log]:
237 |             p.write_text("")
238 |         st.success("Logs cleared.")
239 | 
240 | # ---- Settings & Schema
241 | with tabs[3]:
242 |     st.subheader("Pydantic Schema")
243 |     st.markdown(schema_markdown())
244 | 
245 |     st.subheader("Advanced")
246 |     st.write("- Log directory:", str(LOG_DIR))
247 |     st.write("- Base directory:", str(BASE_DIR))
248 | 
249 | # ---- Sample Files
250 | with tabs[4]:
251 |     st.subheader("Generate sample files for manual testing")
252 |     st.write("This will create a variety of sample files under `sample_data/` which you can download below.")
253 |     if st.button("Generate sample files"):
254 |         generate_samples()
255 |         st.success("Sample files generated.")
256 | 
257 |     if SAMPLE_DIR.exists():
258 |         for p in sorted(SAMPLE_DIR.glob("*")):
259 |             st.download_button(
260 |                 label=f"Download {p.name}",
261 |                 data=p.read_bytes(),
262 |                 file_name=p.name,
263 |             )
264 | 


--------------------------------------------------------------------------------
/etl_loader/loader.py:
--------------------------------------------------------------------------------
  1 | 
  2 | from __future__ import annotations
  3 | 
  4 | import logging
  5 | from logging.handlers import RotatingFileHandler
  6 | from dataclasses import dataclass, asdict
  7 | from decimal import Decimal, InvalidOperation
  8 | from datetime import date
  9 | from pathlib import Path
 10 | from typing import List, Optional, Tuple, Union, Iterable, Dict, Any
 11 | 
 12 | import pandas as pd
 13 | from pydantic import BaseModel, Field, ValidationError, field_validator, ConfigDict
 14 | 
 15 | # ------------------------------
 16 | # Logging
 17 | # ------------------------------
 18 | 
 19 | def configure_logging(log_dir: Union[str, Path] = ".", base_name: str = "etl") -> logging.Logger:
 20 |     """
 21 |     Configure a logger with two rotating file handlers:
 22 |       - {base_name}.log (INFO and above): high-level operational log.
 23 |       - {base_name}_errors.log (ERROR and above): errors-only log for quick triage.
 24 |     Returns a logger named 'etl'.
 25 |     """
 26 |     log_dir = Path(log_dir)
 27 |     log_dir.mkdir(parents=True, exist_ok=True)
 28 |     logger = logging.getLogger("etl")
 29 |     logger.setLevel(logging.DEBUG)
 30 |     logger.propagate = False  # avoid double logging in notebooks/apps
 31 | 
 32 |     # Clear old handlers if reconfiguring
 33 |     if logger.handlers:
 34 |         for h in list(logger.handlers):
 35 |             logger.removeHandler(h)
 36 |             h.close()
 37 | 
 38 |     info_path = log_dir / f"{base_name}.log"
 39 |     errors_path = log_dir / f"{base_name}_errors.log"
 40 | 
 41 |     info_handler = RotatingFileHandler(info_path, maxBytes=1_000_000, backupCount=3)
 42 |     info_handler.setLevel(logging.INFO)
 43 |     info_handler.setFormatter(logging.Formatter(
 44 |         "%(asctime)s | %(levelname)s | %(name)s | %(message)s"
 45 |     ))
 46 | 
 47 |     error_handler = RotatingFileHandler(errors_path, maxBytes=1_000_000, backupCount=5)
 48 |     error_handler.setLevel(logging.ERROR)
 49 |     error_handler.setFormatter(logging.Formatter(
 50 |         "%(asctime)s | %(levelname)s | %(name)s | %(message)s"
 51 |     ))
 52 | 
 53 |     logger.addHandler(info_handler)
 54 |     logger.addHandler(error_handler)
 55 |     logger.info("Logger initialized. info_log=%s errors_log=%s", info_path, errors_path)
 56 |     return logger
 57 | 
 58 | # ------------------------------
 59 | # Error Types
 60 | # ------------------------------
 61 | 
 62 | class ETLError(Exception):
 63 |     """Base class for ETL-related errors with context for user-facing messages."""
 64 |     def __init__(self, message: str, **context: Any) -> None:
 65 |         super().__init__(message)
 66 |         self.message = message
 67 |         self.context = context
 68 | 
 69 |     def to_dict(self) -> Dict[str, Any]:
 70 |         return {"message": self.message, "context": self.context}
 71 | 
 72 | class WrongFileTypeError(ETLError): ...
 73 | class EmptyFileError(ETLError): ...
 74 | class NoContentError(ETLError): ...
 75 | class MultipleSheetsError(ETLError): ...
 76 | class MissingColumnsError(ETLError): ...
 77 | class DuplicateColumnsError(ETLError): ...
 78 | class CorruptFileError(ETLError): ...
 79 | 
 80 | @dataclass(frozen=True)
 81 | class ErrorDetail:
 82 |     row_index: int
 83 |     field: str
 84 |     value: Any
 85 |     error: str
 86 | 
 87 |     def to_dict(self) -> Dict[str, Any]:
 88 |         return asdict(self)
 89 | 
 90 | class DataValidationError(ETLError):
 91 |     def __init__(self, message: str, errors: List[ErrorDetail], **context: Any) -> None:
 92 |         super().__init__(message, **context)
 93 |         self.errors = errors
 94 | 
 95 |     def to_dict(self) -> Dict[str, Any]:
 96 |         base = super().to_dict()
 97 |         base["errors"] = [e.to_dict() for e in self.errors]
 98 |         return base
 99 | 
100 | # ------------------------------
101 | # Pydantic Model
102 | # ------------------------------
103 | 
104 | class TransactionRecord(BaseModel):
105 |     """
106 |     Strict schema for a single financial transaction record.
107 |     """
108 |     model_config = ConfigDict(extra="forbid", frozen=False)
109 | 
110 |     date: date
111 |     description: str = Field(min_length=1, max_length=200)
112 |     amount: Decimal
113 |     currency: str = Field(pattern=r"^[A-Z]{3}$", description="ISO 4217 currency code (e.g., USD)")
114 |     account_id: str = Field(min_length=1, max_length=50)
115 |     category: Optional[str] = Field(default=None, max_length=100)
116 | 
117 |     @field_validator("amount")
118 |     @classmethod
119 |     def valid_amount(cls, v: Any) -> Decimal:
120 |         """
121 |         Ensure amount parses as Decimal and is finite.
122 |         """
123 |         if isinstance(v, float):
124 |             # Convert float to string first to avoid binary float artifacts
125 |             v = str(v)
126 |         try:
127 |             d = Decimal(v)
128 |         except (InvalidOperation, ValueError) as e:
129 |             raise ValueError(f"amount must be a valid number; got {v!r}") from e
130 |         if d.is_nan():
131 |             raise ValueError("amount may not be NaN")
132 |         if d == Decimal("0"):
133 |             # Allow zero? Many pipelines disallow zero transactions. Adjust if needed.
134 |             return d
135 |         return d
136 | 
137 |     @field_validator("description")
138 |     @classmethod
139 |     def description_trimmed(cls, v: str) -> str:
140 |         v = v.strip()
141 |         if not v:
142 |             raise ValueError("description cannot be empty/whitespace")
143 |         return v
144 | 
145 | # ------------------------------
146 | # Loader
147 | # ------------------------------
148 | 
149 | class ExcelTransactionLoader:
150 |     def __init__(
151 |         self,
152 |         expected_columns: Iterable[str],
153 |         logger: Optional[logging.Logger] = None,
154 |     ) -> None:
155 |         self.expected_columns = [c.strip() for c in expected_columns]
156 |         self.logger = logger or logging.getLogger("etl")
157 | 
158 |     @staticmethod
159 |     def _check_extension(path: Path) -> None:
160 |         if path.suffix.lower() not in {".xlsx", ".xls", ".xlsm"}:
161 |             raise WrongFileTypeError(
162 |                 f"Unsupported file type {path.suffix!r}; expected an Excel file (.xlsx, .xls, .xlsm)",
163 |                 path=str(path),
164 |             )
165 | 
166 |     def _excel_sheets(self, path: Path) -> List[str]:
167 |         try:
168 |             xl = pd.ExcelFile(path)  # type: ignore[no-untyped-call]
169 |             return list(xl.sheet_names)
170 |         except Exception as e:
171 |             self.logger.error("Failed to open Excel file (possibly corrupt): %s", path, exc_info=True)
172 |             raise CorruptFileError("Unable to open Excel file; the file may be corrupt or unreadable", path=str(path)) from e
173 | 
174 |     def _read_sheet(self, path: Path, sheet: Optional[Union[int, str]]) -> pd.DataFrame:
175 |         try:
176 |             return pd.read_excel(path, sheet_name=sheet)  # type: ignore[no-untyped-call]
177 |         except ValueError as ve:
178 |             # pandas raises ValueError for bad sheet names/indices
179 |             raise MultipleSheetsError(str(ve), path=str(path), sheet=sheet) from ve
180 |         except Exception as e:
181 |             self.logger.error("Failed to read Excel sheet", exc_info=True)
182 |             raise CorruptFileError("Failed to read Excel sheet; the file or sheet may be corrupt", path=str(path), sheet=sheet) from e
183 | 
184 |     def _normalize_columns(self, cols: Iterable[str]) -> List[str]:
185 |         return [str(c).strip() for c in cols]
186 | 
187 |     def _validate_columns(self, df: pd.DataFrame, path: Path) -> None:
188 |         cols = self._normalize_columns(df.columns.tolist())
189 |         duplicates = [c for c in cols if cols.count(c) > 1]
190 |         if duplicates:
191 |             raise DuplicateColumnsError(f"Duplicate header columns found: {sorted(set(duplicates))}", path=str(path))
192 |         missing = [c for c in self.expected_columns if c not in cols]
193 |         if missing:
194 |             raise MissingColumnsError(f"Missing required columns: {missing}", path=str(path), expected=self.expected_columns, found=cols)
195 | 
196 |     def validate_rows(self, df: pd.DataFrame) -> Tuple[pd.DataFrame, List[ErrorDetail]]:
197 |         """
198 |         Validate all rows via Pydantic. Returns a dataframe of valid rows and a list of ErrorDetail for invalid rows.
199 |         """
200 |         errors: List[ErrorDetail] = []
201 |         valid_records: List[Dict[str, Any]] = []
202 | 
203 |         # Align columns used by the model; ignore extras
204 |         projection = {c: c for c in self.expected_columns if c in df.columns}
205 | 
206 |         for idx, row in df.iterrows():
207 |             raw = {k: row[v] for k, v in projection.items()}
208 |             try:
209 |                 record = TransactionRecord(**raw)
210 |                 valid_records.append(record.model_dump())
211 |             except ValidationError as ve:
212 |                 for e in ve.errors():
213 |                     fld = e["loc"][0] if e.get("loc") else "<unknown>"
214 |                     msg = e.get("msg", "validation error")
215 |                     val = raw.get(fld, None)
216 |                     errors.append(ErrorDetail(row_index=int(idx), field=str(fld), value=val, error=msg))
217 | 
218 |         valid_df = pd.DataFrame(valid_records) if valid_records else pd.DataFrame(columns=self.expected_columns)
219 |         return valid_df, errors
220 | 
221 |     def load(
222 |         self,
223 |         path: Union[str, Path],
224 |         *,
225 |         sheet: Optional[Union[int, str]] = None,
226 |         fail_on_any_error: bool = True,
227 |     ) -> Tuple[pd.DataFrame, List[ErrorDetail]]:
228 |         """
229 |         Load and validate an Excel file.
230 |         - fail_on_any_error=True: raise DataValidationError if any row fails validation.
231 |         Returns (valid_df, errors) where errors is a list of ErrorDetail for rows that failed.
232 |         """
233 |         p = Path(path)
234 |         self.logger.info("Starting load | path=%s sheet=%s", p, sheet)
235 | 
236 |         # Basic checks
237 |         if not p.exists():
238 |             raise EmptyFileError("File not found", path=str(p))
239 |         if p.stat().st_size == 0:
240 |             raise EmptyFileError("File is empty (0 bytes)", path=str(p))
241 | 
242 |         self._check_extension(p)
243 | 
244 |         # Sheet handling
245 |         sheets = self._excel_sheets(p)
246 |         if len(sheets) > 1 and sheet is None:
247 |             raise MultipleSheetsError(
248 |                 f"Multiple sheets present ({sheets}); specify a sheet by name or index.",
249 |                 path=str(p),
250 |                 sheets=sheets,
251 |             )
252 | 
253 |         # Read data
254 |         df = self._read_sheet(p, sheet if sheet is not None else sheets[0] if sheets else 0)
255 | 
256 |         if df.empty:
257 |             raise NoContentError("The selected sheet has headers but no rows.", path=str(p), sheet=sheet)
258 | 
259 |         self._validate_columns(df, p)
260 | 
261 |         valid_df, errors = self.validate_rows(df)
262 | 
263 |         if errors and fail_on_any_error:
264 |             self.logger.error("Validation failed | invalid_rows=%d", len(errors))
265 |             raise DataValidationError(
266 |                 f"Validation failed for {len(errors)} row(s)",
267 |                 errors=errors,
268 |                 path=str(p),
269 |                 sheet=sheet,
270 |             )
271 | 
272 |         self.logger.info("Load complete | total_rows=%d valid_rows=%d invalid_rows=%d",
273 |                          len(df), len(valid_df), len(errors))
274 |         return valid_df, errors
275 | 
276 | # ------------------------------
277 | # Convenience Wrapper
278 | # ------------------------------
279 | 
280 | def safe_load_transactions(
281 |     path: Union[str, Path],
282 |     expected_columns: Iterable[str],
283 |     *,
284 |     sheet: Optional[Union[int, str]] = None,
285 |     log_dir: Union[str, Path] = ".",
286 |     fail_on_any_error: bool = True,
287 | ) -> Dict[str, Any]:
288 |     """
289 |     Convenience one-call function that configures logging, loads, and captures exceptions
290 |     into a user-friendly dict suitable for APIs/UI layers.
291 |     """
292 |     logger = configure_logging(log_dir)
293 |     loader = ExcelTransactionLoader(expected_columns=expected_columns, logger=logger)
294 |     try:
295 |         df, errors = loader.load(path, sheet=sheet, fail_on_any_error=fail_on_any_error)
296 |         return {
297 |             "ok": True,
298 |             "valid_row_count": len(df),
299 |             "invalid_row_count": len(errors),
300 |             "errors": [e.to_dict() for e in errors],
301 |             # Avoid returning the full DF to keep payloads light
302 |         }
303 |     except ETLError as e:
304 |         logger.exception("ETL load failed with ETLError")
305 |         payload = {"ok": False, "type": e.__class__.__name__, **e.to_dict()}
306 |         if isinstance(e, DataValidationError):
307 |             payload["errors"] = [er.to_dict() for er in e.errors]
308 |         return payload
309 |     except Exception as e:  # pragma: no cover
310 |         logger.exception("Unexpected error during ETL load")
311 |         return {"ok": False, "type": "UnexpectedError", "message": str(e)}
312 | 


--------------------------------------------------------------------------------