├── requirements.txt ├── pyproject.toml ├── README.md ├── etl_loader ├── __init__.py └── loader.py ├── tests └── test_loader.py └── streamlit_app.py /requirements.txt: -------------------------------------------------------------------------------- 1 | 2 | pandas>=2.0 3 | pydantic>=2.5 4 | openpyxl>=3.1 5 | pytest>=8.0 6 | streamlit>=1.34 7 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | 2 | [project] 3 | name = "etl-pydantic-pytest-demo" 4 | version = "0.1.0" 5 | requires-python = ">=3.10" 6 | dependencies = [ 7 | "pandas>=2.0", 8 | "pydantic>=2.5", 9 | "openpyxl>=3.1", 10 | "pytest>=8.0", 11 | ] 12 | 13 | [tool.pytest.ini_options] 14 | addopts = "-q" 15 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # ETL Loader with Pydantic + Pytest (Demo) 3 | 4 | This mini-project shows how to validate Excel-based financial transaction data with **pandas** + **pydantic**, 5 | emit user-friendly errors, and test the whole pipeline with **pytest**. 6 | 7 | ## Structure 8 | - `etl_loader/loader.py` – loader and schema 9 | - `tests/test_loader.py` – comprehensive unit tests 10 | 11 | ## Quickstart 12 | ```bash 13 | pip install -U pandas pydantic openpyxl pytest 14 | pytest -q 15 | ``` 16 | 17 | Adjust the `EXPECTED` columns and the `TransactionRecord` model to match your real schema. 18 | -------------------------------------------------------------------------------- /etl_loader/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from .loader import ( 3 | ExcelTransactionLoader, 4 | TransactionRecord, 5 | ETLError, 6 | WrongFileTypeError, 7 | EmptyFileError, 8 | NoContentError, 9 | MultipleSheetsError, 10 | MissingColumnsError, 11 | DuplicateColumnsError, 12 | CorruptFileError, 13 | DataValidationError, 14 | ErrorDetail, 15 | configure_logging, 16 | safe_load_transactions, 17 | ) 18 | __all__ = [ 19 | "ExcelTransactionLoader", 20 | "TransactionRecord", 21 | "ETLError", 22 | "WrongFileTypeError", 23 | "EmptyFileError", 24 | "NoContentError", 25 | "MultipleSheetsError", 26 | "MissingColumnsError", 27 | "DuplicateColumnsError", 28 | "CorruptFileError", 29 | "DataValidationError", 30 | "ErrorDetail", 31 | "configure_logging", 32 | "safe_load_transactions", 33 | ] 34 | -------------------------------------------------------------------------------- /tests/test_loader.py: -------------------------------------------------------------------------------- 1 | 2 | import io 3 | import os 4 | from pathlib import Path 5 | from decimal import Decimal 6 | import pandas as pd 7 | import pytest 8 | 9 | from etl_loader import ( 10 | ExcelTransactionLoader, 11 | TransactionRecord, 12 | WrongFileTypeError, 13 | EmptyFileError, 14 | NoContentError, 15 | MultipleSheetsError, 16 | MissingColumnsError, 17 | DuplicateColumnsError, 18 | CorruptFileError, 19 | DataValidationError, 20 | configure_logging, 21 | safe_load_transactions, 22 | ) 23 | 24 | EXPECTED = ["date", "description", "amount", "currency", "account_id", "category"] 25 | 26 | @pytest.fixture 27 | def tmp_excel_dir(tmp_path: Path): 28 | log_dir = tmp_path / "logs" 29 | log_dir.mkdir(exist_ok=True, parents=True) 30 | configure_logging(log_dir) # set up log files 31 | return tmp_path 32 | 33 | def write_excel(path: Path, data: pd.DataFrame, sheet_name="Sheet1", extras=None): 34 | with pd.ExcelWriter(path) as writer: 35 | data.to_excel(writer, index=False, sheet_name=sheet_name) 36 | if extras: 37 | for name, df in extras.items(): 38 | df.to_excel(writer, index=False, sheet_name=name) 39 | 40 | def valid_df(n=3): 41 | return pd.DataFrame({ 42 | "date": pd.to_datetime(["2024-01-01","2024-01-02","2024-01-03"]).date, 43 | "description": ["Coffee","Lunch","Taxi"], 44 | "amount": [Decimal("3.50"), Decimal("12.00"), Decimal("25.00")], 45 | "currency": ["USD","USD","USD"], 46 | "account_id": ["A1","A1","A2"], 47 | "category": ["Food","Food","Travel"] 48 | }) 49 | 50 | def test_happy_path_single_sheet(tmp_excel_dir: Path): 51 | p = tmp_excel_dir / "ok.xlsx" 52 | write_excel(p, valid_df()) 53 | loader = ExcelTransactionLoader(EXPECTED) 54 | df, errs = loader.load(p, fail_on_any_error=False) 55 | assert errs == [] 56 | assert len(df) == 3 57 | assert set(df.columns) == set(EXPECTED) 58 | 59 | def test_empty_file_0_bytes(tmp_excel_dir: Path): 60 | p = tmp_excel_dir / "empty.xlsx" 61 | p.write_bytes(b"") # 0-byte file 62 | loader = ExcelTransactionLoader(EXPECTED) 63 | with pytest.raises(EmptyFileError): 64 | loader.load(p) 65 | 66 | def test_headers_but_no_rows(tmp_excel_dir: Path): 67 | p = tmp_excel_dir / "headers_only.xlsx" 68 | df = pd.DataFrame(columns=EXPECTED) 69 | write_excel(p, df) 70 | loader = ExcelTransactionLoader(EXPECTED) 71 | with pytest.raises(NoContentError): 72 | loader.load(p) 73 | 74 | def test_multiple_sheets_requires_explicit_sheet(tmp_excel_dir: Path): 75 | p = tmp_excel_dir / "multi.xlsx" 76 | write_excel(p, valid_df(), extras={"Other": valid_df()}) 77 | loader = ExcelTransactionLoader(EXPECTED) 78 | with pytest.raises(MultipleSheetsError): 79 | loader.load(p) 80 | 81 | def test_multiple_sheets_with_sheet_selected(tmp_excel_dir: Path): 82 | p = tmp_excel_dir / "multi_ok.xlsx" 83 | write_excel(p, valid_df(), extras={"Other": valid_df()}) 84 | loader = ExcelTransactionLoader(EXPECTED) 85 | df, errs = loader.load(p, sheet="Sheet1", fail_on_any_error=False) 86 | assert len(df) == 3 87 | assert errs == [] 88 | 89 | def test_wrong_file_type(tmp_excel_dir: Path): 90 | p = tmp_excel_dir / "not_excel.csv" 91 | valid_df().to_csv(p, index=False) 92 | loader = ExcelTransactionLoader(EXPECTED) 93 | with pytest.raises(WrongFileTypeError): 94 | loader.load(p) 95 | 96 | def test_corrupt_excel(tmp_excel_dir: Path): 97 | p = tmp_excel_dir / "corrupt.xlsx" 98 | p.write_bytes(os.urandom(256)) # random bytes 99 | loader = ExcelTransactionLoader(EXPECTED) 100 | with pytest.raises(CorruptFileError): 101 | loader.load(p, sheet=0) 102 | 103 | def test_missing_required_columns(tmp_excel_dir: Path): 104 | p = tmp_excel_dir / "missing_cols.xlsx" 105 | df = valid_df().drop(columns=["account_id"]) 106 | write_excel(p, df) 107 | loader = ExcelTransactionLoader(EXPECTED) 108 | with pytest.raises(MissingColumnsError): 109 | loader.load(p) 110 | 111 | def test_duplicate_columns(tmp_excel_dir: Path): 112 | p = tmp_excel_dir / "dup_cols.xlsx" 113 | df = valid_df() 114 | # create duplicate by renaming a column to existing name 115 | df.columns = ["date","description","amount","currency","date","category"] 116 | write_excel(p, df) 117 | loader = ExcelTransactionLoader(EXPECTED) 118 | with pytest.raises(DuplicateColumnsError): 119 | loader.load(p) 120 | 121 | def test_bad_data_rows_are_reported(tmp_excel_dir: Path): 122 | p = tmp_excel_dir / "bad_rows.xlsx" 123 | df = valid_df() 124 | # introduce bad values 125 | df.loc[1, "amount"] = "abc" # invalid number 126 | df.loc[2, "currency"] = "usd" # lowercase not allowed 127 | write_excel(p, df) 128 | loader = ExcelTransactionLoader(EXPECTED) 129 | with pytest.raises(DataValidationError) as exc: 130 | loader.load(p) 131 | err = exc.value 132 | # Expect two bad rows (index 1 and 2) 133 | assert len(err.errors) >= 2 134 | fields = [e.field for e in err.errors] 135 | assert "amount" in fields 136 | assert "currency" in fields 137 | 138 | def test_fail_on_any_error_false_returns_valid_and_errors(tmp_excel_dir: Path): 139 | p = tmp_excel_dir / "bad_rows2.xlsx" 140 | df = valid_df() 141 | df.loc[1, "amount"] = "NaN" 142 | write_excel(p, df) 143 | loader = ExcelTransactionLoader(EXPECTED) 144 | valid, errors = loader.load(p, fail_on_any_error=False) 145 | # 2 valid rows, 1 invalid 146 | assert len(valid) == 2 147 | assert len(errors) == 1 148 | 149 | def test_safe_loader_user_payload(tmp_excel_dir: Path): 150 | p = tmp_excel_dir / "safe.xlsx" 151 | df = valid_df() 152 | df.loc[0, "currency"] = "usd" # invalid 153 | write_excel(p, df) 154 | payload = safe_load_transactions(p, EXPECTED, log_dir=tmp_excel_dir, fail_on_any_error=True) 155 | assert payload["ok"] is False 156 | assert payload["type"] == "DataValidationError" 157 | assert "errors" in payload and len(payload["errors"]) >= 1 158 | -------------------------------------------------------------------------------- /streamlit_app.py: -------------------------------------------------------------------------------- 1 | 2 | import io 3 | import os 4 | from pathlib import Path 5 | from datetime import datetime 6 | from typing import List, Dict, Any 7 | 8 | import streamlit as st 9 | import pandas as pd 10 | from pydantic import ValidationError 11 | 12 | from etl_loader import ( 13 | configure_logging, 14 | ExcelTransactionLoader, 15 | DataValidationError, 16 | ETLError, 17 | TransactionRecord, 18 | ) 19 | 20 | st.set_page_config(page_title="ETL Validator (Pydantic + Pytest)", layout="wide") 21 | 22 | # ------------------------------ 23 | # Setup & Session State 24 | # ------------------------------ 25 | 26 | BASE_DIR = Path(__file__).parent 27 | LOG_DIR = BASE_DIR / "logs" 28 | SAMPLE_DIR = BASE_DIR / "sample_data" 29 | LOG_DIR.mkdir(exist_ok=True, parents=True) 30 | SAMPLE_DIR.mkdir(exist_ok=True, parents=True) 31 | 32 | # configure logger once per session 33 | if "logger_ready" not in st.session_state: 34 | configure_logging(LOG_DIR) 35 | st.session_state["logger_ready"] = True 36 | 37 | DEFAULT_COLUMNS = "date, description, amount, currency, account_id, category" 38 | 39 | def parse_columns(s: str) -> List[str]: 40 | return [c.strip() for c in s.split(",") if c.strip()] 41 | 42 | def schema_markdown() -> str: 43 | schema = TransactionRecord.model_json_schema() 44 | lines = ["### Pydantic Schema (TransactionRecord)"] 45 | lines.append("```json") 46 | import json 47 | lines.append(json.dumps(schema, indent=2, default=str)) 48 | lines.append("```") 49 | return "\n".join(lines) 50 | 51 | def tail_text(path: Path, lines: int = 400) -> str: 52 | if not path.exists(): 53 | return "(no log file yet)" 54 | with path.open("r", encoding="utf-8", errors="ignore") as f: 55 | buf = f.readlines() 56 | return "".join(buf[-lines:]) if buf else "(empty log)" 57 | 58 | def run_pytest() -> Dict[str, Any]: 59 | """Run pytest programmatically and capture output.""" 60 | import pytest, sys 61 | import contextlib 62 | stdout = io.StringIO() 63 | stderr = io.StringIO() 64 | with contextlib.redirect_stdout(stdout), contextlib.redirect_stderr(stderr): 65 | # -q for quiet, but still summary; add -k to filter if ever needed 66 | exit_code = pytest.main(["-q"]) 67 | return { 68 | "exit_code": exit_code, 69 | "stdout": stdout.getvalue(), 70 | "stderr": stderr.getvalue(), 71 | } 72 | 73 | def save_uploaded_file(uploaded, suffix: str) -> Path: 74 | ext = os.path.splitext(uploaded.name)[1] or suffix 75 | temp_name = f"upload_{datetime.utcnow().strftime('%Y%m%d_%H%M%S_%f')}{ext}" 76 | p = BASE_DIR / temp_name 77 | with open(p, "wb") as f: 78 | f.write(uploaded.getbuffer()) 79 | return p 80 | 81 | def generate_samples(): 82 | import numpy as np 83 | from decimal import Decimal 84 | 85 | # valid 86 | df_valid = pd.DataFrame({ 87 | "date": pd.to_datetime(["2024-01-01","2024-01-02","2024-01-03"]).date, 88 | "description": ["Coffee","Lunch","Taxi"], 89 | "amount": [Decimal("3.50"), Decimal("12.00"), Decimal("25.00")], 90 | "currency": ["USD","USD","USD"], 91 | "account_id": ["A1","A1","A2"], 92 | "category": ["Food","Food","Travel"] 93 | }) 94 | (SAMPLE_DIR / "valid.xlsx").unlink(missing_ok=True) 95 | with pd.ExcelWriter(SAMPLE_DIR / "valid.xlsx") as w: 96 | df_valid.to_excel(w, sheet_name="Sheet1", index=False) 97 | 98 | # headers only 99 | df_headers = pd.DataFrame(columns=["date","description","amount","currency","account_id","category"]) 100 | (SAMPLE_DIR / "headers_only.xlsx").unlink(missing_ok=True) 101 | with pd.ExcelWriter(SAMPLE_DIR / "headers_only.xlsx") as w: 102 | df_headers.to_excel(w, sheet_name="Sheet1", index=False) 103 | 104 | # multiple sheets 105 | (SAMPLE_DIR / "multi.xlsx").unlink(missing_ok=True) 106 | with pd.ExcelWriter(SAMPLE_DIR / "multi.xlsx") as w: 107 | df_valid.to_excel(w, sheet_name="Sheet1", index=False) 108 | df_valid.to_excel(w, sheet_name="Other", index=False) 109 | 110 | # missing columns 111 | df_missing = df_valid.drop(columns=["account_id"]) 112 | (SAMPLE_DIR / "missing_cols.xlsx").unlink(missing_ok=True) 113 | with pd.ExcelWriter(SAMPLE_DIR / "missing_cols.xlsx") as w: 114 | df_missing.to_excel(w, sheet_name="Sheet1", index=False) 115 | 116 | # duplicate columns 117 | df_dup = df_valid.copy() 118 | df_dup.columns = ["date","description","amount","currency","date","category"] 119 | (SAMPLE_DIR / "dup_cols.xlsx").unlink(missing_ok=True) 120 | with pd.ExcelWriter(SAMPLE_DIR / "dup_cols.xlsx") as w: 121 | df_dup.to_excel(w, sheet_name="Sheet1", index=False) 122 | 123 | # bad rows 124 | df_bad = df_valid.copy() 125 | df_bad.loc[1, "amount"] = "abc" # invalid number 126 | df_bad.loc[2, "currency"] = "usd" # lowercase invalid 127 | (SAMPLE_DIR / "bad_rows.xlsx").unlink(missing_ok=True) 128 | with pd.ExcelWriter(SAMPLE_DIR / "bad_rows.xlsx") as w: 129 | df_bad.to_excel(w, sheet_name="Sheet1", index=False) 130 | 131 | # wrong type (csv) 132 | df_valid.to_csv(SAMPLE_DIR / "not_excel.csv", index=False) 133 | 134 | # corrupt file 135 | (SAMPLE_DIR / "corrupt.xlsx").write_bytes(os.urandom(256)) 136 | 137 | # empty (0 bytes) 138 | (SAMPLE_DIR / "empty.xlsx").write_bytes(b"") 139 | 140 | # ------------------------------ 141 | # UI 142 | # ------------------------------ 143 | 144 | st.title("ETL Validator • Pydantic + Pandas + Pytest") 145 | st.caption("Validate Excel-based financial transaction data. View logs. Run tests.") 146 | 147 | tabs = st.tabs(["🔍 Upload & Validate", "🧪 Run Test Suite", "📜 Logs", "⚙️ Settings & Schema", "📦 Sample Files"]) 148 | 149 | # ---- Upload & Validate 150 | with tabs[0]: 151 | st.subheader("Validate an uploaded file") 152 | cols_input = st.text_input("Expected columns (comma-separated):", value=DEFAULT_COLUMNS) 153 | expected_cols = parse_columns(cols_input) 154 | 155 | fail_any = st.checkbox("Fail on any invalid row (raise error)", value=True) 156 | 157 | uploaded = st.file_uploader( 158 | "Upload an Excel file (.xlsx, .xls, .xlsm). You can also upload a .csv to trigger the 'wrong file type' path.", 159 | type=["xlsx", "xls", "xlsm", "csv"], 160 | ) 161 | 162 | sheet_name = None 163 | if uploaded and uploaded.name.lower().endswith((".xlsx",".xls",".xlsm")): 164 | # Try to peek sheets for selection 165 | try: 166 | tmp = save_uploaded_file(uploaded, suffix=".xlsx") 167 | xl = pd.ExcelFile(tmp) 168 | sheets = xl.sheet_names 169 | if len(sheets) > 1: 170 | sheet_name = st.selectbox("Select sheet", sheets, index=0, key="sheet_select") 171 | else: 172 | sheet_name = sheets[0] if sheets else None 173 | except Exception as e: 174 | st.info("Couldn't read sheet names (maybe corrupt or wrong type). You can still try validating to see proper errors.") 175 | tmp = save_uploaded_file(uploaded, suffix=".xlsx") 176 | elif uploaded: 177 | tmp = save_uploaded_file(uploaded, suffix=".xlsx") 178 | else: 179 | tmp = None 180 | 181 | if st.button("Validate file", disabled=(tmp is None)): 182 | loader = ExcelTransactionLoader(expected_cols) 183 | try: 184 | valid_df, errs = loader.load(tmp, sheet=sheet_name, fail_on_any_error=fail_any) 185 | st.success(f"Validation OK ✅ | Valid rows: {len(valid_df)} | Invalid rows: {len(errs)}") 186 | if len(valid_df) > 0: 187 | st.dataframe(valid_df.head(50)) 188 | if errs: 189 | st.subheader("Row-level validation messages") 190 | st.dataframe(pd.DataFrame([e.to_dict() for e in errs])) 191 | except DataValidationError as dve: 192 | st.error(f"{dve.__class__.__name__}: {dve.message}") 193 | err_df = pd.DataFrame([e.to_dict() for e in dve.errors]) 194 | st.dataframe(err_df) 195 | except ETLError as ee: 196 | st.error(f"{ee.__class__.__name__}: {ee.message}") 197 | st.json(ee.to_dict()) 198 | except Exception as ex: 199 | st.exception(ex) 200 | 201 | # ---- Run Test Suite 202 | with tabs[1]: 203 | st.subheader("Pytest: run the built-in test suite") 204 | st.write("Runs the unit tests shipped with this repo (happy path + all edge cases).") 205 | if st.button("Run tests now"): 206 | result = run_pytest() 207 | ok = (result["exit_code"] == 0) 208 | st.write("Exit code:", result["exit_code"]) 209 | if ok: 210 | st.success("All tests passed ✔️") 211 | else: 212 | st.error("Some tests failed ❌") 213 | st.subheader("stdout") 214 | st.code(result["stdout"] or "(empty)") 215 | st.subheader("stderr") 216 | st.code(result["stderr"] or "(empty)") 217 | 218 | # ---- Logs 219 | with tabs[2]: 220 | st.subheader("Logs (auto-generated by the loader)") 221 | info_log = LOG_DIR / "etl.log" 222 | err_log = LOG_DIR / "etl_errors.log" 223 | 224 | cols = st.columns(2) 225 | with cols[0]: 226 | st.markdown("**etl.log (INFO+)**") 227 | st.download_button("Download etl.log", data=info_log.read_bytes() if info_log.exists() else b"", file_name="etl.log") 228 | st.text_area("Tail of etl.log", tail_text(info_log), height=300) 229 | 230 | with cols[1]: 231 | st.markdown("**etl_errors.log (ERROR+)**") 232 | st.download_button("Download etl_errors.log", data=err_log.read_bytes() if err_log.exists() else b"", file_name="etl_errors.log") 233 | st.text_area("Tail of etl_errors.log", tail_text(err_log), height=300) 234 | 235 | if st.button("Clear logs"): 236 | for p in [info_log, err_log]: 237 | p.write_text("") 238 | st.success("Logs cleared.") 239 | 240 | # ---- Settings & Schema 241 | with tabs[3]: 242 | st.subheader("Pydantic Schema") 243 | st.markdown(schema_markdown()) 244 | 245 | st.subheader("Advanced") 246 | st.write("- Log directory:", str(LOG_DIR)) 247 | st.write("- Base directory:", str(BASE_DIR)) 248 | 249 | # ---- Sample Files 250 | with tabs[4]: 251 | st.subheader("Generate sample files for manual testing") 252 | st.write("This will create a variety of sample files under `sample_data/` which you can download below.") 253 | if st.button("Generate sample files"): 254 | generate_samples() 255 | st.success("Sample files generated.") 256 | 257 | if SAMPLE_DIR.exists(): 258 | for p in sorted(SAMPLE_DIR.glob("*")): 259 | st.download_button( 260 | label=f"Download {p.name}", 261 | data=p.read_bytes(), 262 | file_name=p.name, 263 | ) 264 | -------------------------------------------------------------------------------- /etl_loader/loader.py: -------------------------------------------------------------------------------- 1 | 2 | from __future__ import annotations 3 | 4 | import logging 5 | from logging.handlers import RotatingFileHandler 6 | from dataclasses import dataclass, asdict 7 | from decimal import Decimal, InvalidOperation 8 | from datetime import date 9 | from pathlib import Path 10 | from typing import List, Optional, Tuple, Union, Iterable, Dict, Any 11 | 12 | import pandas as pd 13 | from pydantic import BaseModel, Field, ValidationError, field_validator, ConfigDict 14 | 15 | # ------------------------------ 16 | # Logging 17 | # ------------------------------ 18 | 19 | def configure_logging(log_dir: Union[str, Path] = ".", base_name: str = "etl") -> logging.Logger: 20 | """ 21 | Configure a logger with two rotating file handlers: 22 | - {base_name}.log (INFO and above): high-level operational log. 23 | - {base_name}_errors.log (ERROR and above): errors-only log for quick triage. 24 | Returns a logger named 'etl'. 25 | """ 26 | log_dir = Path(log_dir) 27 | log_dir.mkdir(parents=True, exist_ok=True) 28 | logger = logging.getLogger("etl") 29 | logger.setLevel(logging.DEBUG) 30 | logger.propagate = False # avoid double logging in notebooks/apps 31 | 32 | # Clear old handlers if reconfiguring 33 | if logger.handlers: 34 | for h in list(logger.handlers): 35 | logger.removeHandler(h) 36 | h.close() 37 | 38 | info_path = log_dir / f"{base_name}.log" 39 | errors_path = log_dir / f"{base_name}_errors.log" 40 | 41 | info_handler = RotatingFileHandler(info_path, maxBytes=1_000_000, backupCount=3) 42 | info_handler.setLevel(logging.INFO) 43 | info_handler.setFormatter(logging.Formatter( 44 | "%(asctime)s | %(levelname)s | %(name)s | %(message)s" 45 | )) 46 | 47 | error_handler = RotatingFileHandler(errors_path, maxBytes=1_000_000, backupCount=5) 48 | error_handler.setLevel(logging.ERROR) 49 | error_handler.setFormatter(logging.Formatter( 50 | "%(asctime)s | %(levelname)s | %(name)s | %(message)s" 51 | )) 52 | 53 | logger.addHandler(info_handler) 54 | logger.addHandler(error_handler) 55 | logger.info("Logger initialized. info_log=%s errors_log=%s", info_path, errors_path) 56 | return logger 57 | 58 | # ------------------------------ 59 | # Error Types 60 | # ------------------------------ 61 | 62 | class ETLError(Exception): 63 | """Base class for ETL-related errors with context for user-facing messages.""" 64 | def __init__(self, message: str, **context: Any) -> None: 65 | super().__init__(message) 66 | self.message = message 67 | self.context = context 68 | 69 | def to_dict(self) -> Dict[str, Any]: 70 | return {"message": self.message, "context": self.context} 71 | 72 | class WrongFileTypeError(ETLError): ... 73 | class EmptyFileError(ETLError): ... 74 | class NoContentError(ETLError): ... 75 | class MultipleSheetsError(ETLError): ... 76 | class MissingColumnsError(ETLError): ... 77 | class DuplicateColumnsError(ETLError): ... 78 | class CorruptFileError(ETLError): ... 79 | 80 | @dataclass(frozen=True) 81 | class ErrorDetail: 82 | row_index: int 83 | field: str 84 | value: Any 85 | error: str 86 | 87 | def to_dict(self) -> Dict[str, Any]: 88 | return asdict(self) 89 | 90 | class DataValidationError(ETLError): 91 | def __init__(self, message: str, errors: List[ErrorDetail], **context: Any) -> None: 92 | super().__init__(message, **context) 93 | self.errors = errors 94 | 95 | def to_dict(self) -> Dict[str, Any]: 96 | base = super().to_dict() 97 | base["errors"] = [e.to_dict() for e in self.errors] 98 | return base 99 | 100 | # ------------------------------ 101 | # Pydantic Model 102 | # ------------------------------ 103 | 104 | class TransactionRecord(BaseModel): 105 | """ 106 | Strict schema for a single financial transaction record. 107 | """ 108 | model_config = ConfigDict(extra="forbid", frozen=False) 109 | 110 | date: date 111 | description: str = Field(min_length=1, max_length=200) 112 | amount: Decimal 113 | currency: str = Field(pattern=r"^[A-Z]{3}$", description="ISO 4217 currency code (e.g., USD)") 114 | account_id: str = Field(min_length=1, max_length=50) 115 | category: Optional[str] = Field(default=None, max_length=100) 116 | 117 | @field_validator("amount") 118 | @classmethod 119 | def valid_amount(cls, v: Any) -> Decimal: 120 | """ 121 | Ensure amount parses as Decimal and is finite. 122 | """ 123 | if isinstance(v, float): 124 | # Convert float to string first to avoid binary float artifacts 125 | v = str(v) 126 | try: 127 | d = Decimal(v) 128 | except (InvalidOperation, ValueError) as e: 129 | raise ValueError(f"amount must be a valid number; got {v!r}") from e 130 | if d.is_nan(): 131 | raise ValueError("amount may not be NaN") 132 | if d == Decimal("0"): 133 | # Allow zero? Many pipelines disallow zero transactions. Adjust if needed. 134 | return d 135 | return d 136 | 137 | @field_validator("description") 138 | @classmethod 139 | def description_trimmed(cls, v: str) -> str: 140 | v = v.strip() 141 | if not v: 142 | raise ValueError("description cannot be empty/whitespace") 143 | return v 144 | 145 | # ------------------------------ 146 | # Loader 147 | # ------------------------------ 148 | 149 | class ExcelTransactionLoader: 150 | def __init__( 151 | self, 152 | expected_columns: Iterable[str], 153 | logger: Optional[logging.Logger] = None, 154 | ) -> None: 155 | self.expected_columns = [c.strip() for c in expected_columns] 156 | self.logger = logger or logging.getLogger("etl") 157 | 158 | @staticmethod 159 | def _check_extension(path: Path) -> None: 160 | if path.suffix.lower() not in {".xlsx", ".xls", ".xlsm"}: 161 | raise WrongFileTypeError( 162 | f"Unsupported file type {path.suffix!r}; expected an Excel file (.xlsx, .xls, .xlsm)", 163 | path=str(path), 164 | ) 165 | 166 | def _excel_sheets(self, path: Path) -> List[str]: 167 | try: 168 | xl = pd.ExcelFile(path) # type: ignore[no-untyped-call] 169 | return list(xl.sheet_names) 170 | except Exception as e: 171 | self.logger.error("Failed to open Excel file (possibly corrupt): %s", path, exc_info=True) 172 | raise CorruptFileError("Unable to open Excel file; the file may be corrupt or unreadable", path=str(path)) from e 173 | 174 | def _read_sheet(self, path: Path, sheet: Optional[Union[int, str]]) -> pd.DataFrame: 175 | try: 176 | return pd.read_excel(path, sheet_name=sheet) # type: ignore[no-untyped-call] 177 | except ValueError as ve: 178 | # pandas raises ValueError for bad sheet names/indices 179 | raise MultipleSheetsError(str(ve), path=str(path), sheet=sheet) from ve 180 | except Exception as e: 181 | self.logger.error("Failed to read Excel sheet", exc_info=True) 182 | raise CorruptFileError("Failed to read Excel sheet; the file or sheet may be corrupt", path=str(path), sheet=sheet) from e 183 | 184 | def _normalize_columns(self, cols: Iterable[str]) -> List[str]: 185 | return [str(c).strip() for c in cols] 186 | 187 | def _validate_columns(self, df: pd.DataFrame, path: Path) -> None: 188 | cols = self._normalize_columns(df.columns.tolist()) 189 | duplicates = [c for c in cols if cols.count(c) > 1] 190 | if duplicates: 191 | raise DuplicateColumnsError(f"Duplicate header columns found: {sorted(set(duplicates))}", path=str(path)) 192 | missing = [c for c in self.expected_columns if c not in cols] 193 | if missing: 194 | raise MissingColumnsError(f"Missing required columns: {missing}", path=str(path), expected=self.expected_columns, found=cols) 195 | 196 | def validate_rows(self, df: pd.DataFrame) -> Tuple[pd.DataFrame, List[ErrorDetail]]: 197 | """ 198 | Validate all rows via Pydantic. Returns a dataframe of valid rows and a list of ErrorDetail for invalid rows. 199 | """ 200 | errors: List[ErrorDetail] = [] 201 | valid_records: List[Dict[str, Any]] = [] 202 | 203 | # Align columns used by the model; ignore extras 204 | projection = {c: c for c in self.expected_columns if c in df.columns} 205 | 206 | for idx, row in df.iterrows(): 207 | raw = {k: row[v] for k, v in projection.items()} 208 | try: 209 | record = TransactionRecord(**raw) 210 | valid_records.append(record.model_dump()) 211 | except ValidationError as ve: 212 | for e in ve.errors(): 213 | fld = e["loc"][0] if e.get("loc") else "" 214 | msg = e.get("msg", "validation error") 215 | val = raw.get(fld, None) 216 | errors.append(ErrorDetail(row_index=int(idx), field=str(fld), value=val, error=msg)) 217 | 218 | valid_df = pd.DataFrame(valid_records) if valid_records else pd.DataFrame(columns=self.expected_columns) 219 | return valid_df, errors 220 | 221 | def load( 222 | self, 223 | path: Union[str, Path], 224 | *, 225 | sheet: Optional[Union[int, str]] = None, 226 | fail_on_any_error: bool = True, 227 | ) -> Tuple[pd.DataFrame, List[ErrorDetail]]: 228 | """ 229 | Load and validate an Excel file. 230 | - fail_on_any_error=True: raise DataValidationError if any row fails validation. 231 | Returns (valid_df, errors) where errors is a list of ErrorDetail for rows that failed. 232 | """ 233 | p = Path(path) 234 | self.logger.info("Starting load | path=%s sheet=%s", p, sheet) 235 | 236 | # Basic checks 237 | if not p.exists(): 238 | raise EmptyFileError("File not found", path=str(p)) 239 | if p.stat().st_size == 0: 240 | raise EmptyFileError("File is empty (0 bytes)", path=str(p)) 241 | 242 | self._check_extension(p) 243 | 244 | # Sheet handling 245 | sheets = self._excel_sheets(p) 246 | if len(sheets) > 1 and sheet is None: 247 | raise MultipleSheetsError( 248 | f"Multiple sheets present ({sheets}); specify a sheet by name or index.", 249 | path=str(p), 250 | sheets=sheets, 251 | ) 252 | 253 | # Read data 254 | df = self._read_sheet(p, sheet if sheet is not None else sheets[0] if sheets else 0) 255 | 256 | if df.empty: 257 | raise NoContentError("The selected sheet has headers but no rows.", path=str(p), sheet=sheet) 258 | 259 | self._validate_columns(df, p) 260 | 261 | valid_df, errors = self.validate_rows(df) 262 | 263 | if errors and fail_on_any_error: 264 | self.logger.error("Validation failed | invalid_rows=%d", len(errors)) 265 | raise DataValidationError( 266 | f"Validation failed for {len(errors)} row(s)", 267 | errors=errors, 268 | path=str(p), 269 | sheet=sheet, 270 | ) 271 | 272 | self.logger.info("Load complete | total_rows=%d valid_rows=%d invalid_rows=%d", 273 | len(df), len(valid_df), len(errors)) 274 | return valid_df, errors 275 | 276 | # ------------------------------ 277 | # Convenience Wrapper 278 | # ------------------------------ 279 | 280 | def safe_load_transactions( 281 | path: Union[str, Path], 282 | expected_columns: Iterable[str], 283 | *, 284 | sheet: Optional[Union[int, str]] = None, 285 | log_dir: Union[str, Path] = ".", 286 | fail_on_any_error: bool = True, 287 | ) -> Dict[str, Any]: 288 | """ 289 | Convenience one-call function that configures logging, loads, and captures exceptions 290 | into a user-friendly dict suitable for APIs/UI layers. 291 | """ 292 | logger = configure_logging(log_dir) 293 | loader = ExcelTransactionLoader(expected_columns=expected_columns, logger=logger) 294 | try: 295 | df, errors = loader.load(path, sheet=sheet, fail_on_any_error=fail_on_any_error) 296 | return { 297 | "ok": True, 298 | "valid_row_count": len(df), 299 | "invalid_row_count": len(errors), 300 | "errors": [e.to_dict() for e in errors], 301 | # Avoid returning the full DF to keep payloads light 302 | } 303 | except ETLError as e: 304 | logger.exception("ETL load failed with ETLError") 305 | payload = {"ok": False, "type": e.__class__.__name__, **e.to_dict()} 306 | if isinstance(e, DataValidationError): 307 | payload["errors"] = [er.to_dict() for er in e.errors] 308 | return payload 309 | except Exception as e: # pragma: no cover 310 | logger.exception("Unexpected error during ETL load") 311 | return {"ok": False, "type": "UnexpectedError", "message": str(e)} 312 | --------------------------------------------------------------------------------