├── src ├── __init__.py ├── sample │ └── parquet │ │ └── titanic.parquet └── parqv │ ├── views │ ├── components │ │ ├── __init__.py │ │ ├── loading_display.py │ │ ├── error_display.py │ │ └── enhanced_data_table.py │ ├── utils │ │ ├── __init__.py │ │ ├── data_formatters.py │ │ ├── visualization.py │ │ └── stats_formatters.py │ ├── __init__.py │ ├── metadata_view.py │ ├── base.py │ ├── data_view.py │ └── schema_view.py │ ├── data_sources │ ├── formats │ │ ├── __init__.py │ │ ├── csv.py │ │ ├── json.py │ │ └── parquet.py │ ├── base │ │ ├── __init__.py │ │ ├── exceptions.py │ │ └── handler.py │ └── __init__.py │ ├── core │ ├── config.py │ ├── __init__.py │ ├── logging.py │ ├── file_utils.py │ └── handler_factory.py │ ├── __init__.py │ ├── parqv.css │ ├── cli.py │ └── app.py ├── assets └── parqv.gif ├── .gitignore ├── pyproject.toml ├── README.md └── LICENSE /src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /assets/parqv.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sanspareilsmyn/parqv/HEAD/assets/parqv.gif -------------------------------------------------------------------------------- /src/sample/parquet/titanic.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sanspareilsmyn/parqv/HEAD/src/sample/parquet/titanic.parquet -------------------------------------------------------------------------------- /src/parqv/views/components/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Reusable UI components for parqv views. 3 | """ 4 | 5 | from .error_display import ErrorDisplay 6 | from .loading_display import LoadingDisplay 7 | from .enhanced_data_table import EnhancedDataTable 8 | 9 | __all__ = [ 10 | "ErrorDisplay", 11 | "LoadingDisplay", 12 | "EnhancedDataTable", 13 | ] -------------------------------------------------------------------------------- /src/parqv/data_sources/formats/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Format-specific data handlers for parqv. 3 | """ 4 | 5 | from .parquet import ParquetHandler, ParquetHandlerError 6 | from .json import JsonHandler, JsonHandlerError 7 | from .csv import CsvHandler, CsvHandlerError 8 | 9 | __all__ = [ 10 | # Parquet format 11 | "ParquetHandler", 12 | "ParquetHandlerError", 13 | 14 | # JSON format 15 | "JsonHandler", 16 | "JsonHandlerError", 17 | 18 | # CSV format 19 | "CsvHandler", 20 | "CsvHandlerError", 21 | ] -------------------------------------------------------------------------------- /src/parqv/views/utils/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utility functions for parqv views. 
3 | """ 4 | 5 | from .data_formatters import format_metadata_for_display, format_value_for_display 6 | from .stats_formatters import format_stats_for_display, format_column_info 7 | from .visualization import create_text_histogram, should_show_histogram 8 | 9 | __all__ = [ 10 | # Data formatting 11 | "format_metadata_for_display", 12 | "format_value_for_display", 13 | "format_stats_for_display", 14 | "format_column_info", 15 | 16 | # Visualization 17 | "create_text_histogram", 18 | "should_show_histogram", 19 | ] 20 | -------------------------------------------------------------------------------- /src/parqv/data_sources/base/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Base classes and interfaces for data sources. 3 | """ 4 | 5 | from .handler import DataHandler 6 | from .exceptions import ( 7 | DataSourceError, 8 | DataHandlerError, 9 | FileValidationError, 10 | UnsupportedFormatError, 11 | DataReadError, 12 | SchemaError, 13 | MetadataError, 14 | ) 15 | 16 | __all__ = [ 17 | # Base handler interface 18 | "DataHandler", 19 | 20 | # Exception classes 21 | "DataSourceError", 22 | "DataHandlerError", 23 | "FileValidationError", 24 | "UnsupportedFormatError", 25 | "DataReadError", 26 | "SchemaError", 27 | "MetadataError", 28 | ] -------------------------------------------------------------------------------- /src/parqv/core/config.py: -------------------------------------------------------------------------------- 1 | """ 2 | Configuration constants and settings for parqv application. 3 | """ 4 | 5 | from typing import Dict, Type, List 6 | from pathlib import Path 7 | 8 | # File extensions and their corresponding handler types 9 | SUPPORTED_EXTENSIONS: Dict[str, str] = { 10 | ".parquet": "parquet", 11 | ".json": "json", 12 | ".ndjson": "json", 13 | ".csv": "csv" 14 | } 15 | 16 | # Application constants 17 | LOG_FILENAME = "parqv.log" 18 | LOG_MAX_BYTES = 1024 * 1024 * 5 # 5MB 19 | LOG_BACKUP_COUNT = 3 20 | LOG_ENCODING = "utf-8" 21 | 22 | # UI Constants 23 | DEFAULT_PREVIEW_ROWS = 50 24 | 25 | # CSS Path (relative to the app module) 26 | CSS_PATH = "parqv.css" -------------------------------------------------------------------------------- /src/parqv/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | parqv - A Textual application for visualizing Parquet and JSON files. 
3 | """ 4 | 5 | from .app import ParqV 6 | from .cli import run_app 7 | from .core import ( 8 | SUPPORTED_EXTENSIONS, 9 | DEFAULT_PREVIEW_ROWS, 10 | FileValidationError, 11 | validate_and_detect_file, 12 | HandlerFactory, 13 | HandlerCreationError, 14 | setup_logging, 15 | get_logger 16 | ) 17 | 18 | __version__ = "1.0.0" 19 | 20 | __all__ = [ 21 | "ParqV", 22 | "run_app", 23 | "SUPPORTED_EXTENSIONS", 24 | "DEFAULT_PREVIEW_ROWS", 25 | "FileValidationError", 26 | "validate_and_detect_file", 27 | "HandlerFactory", 28 | "HandlerCreationError", 29 | "setup_logging", 30 | "get_logger", 31 | ] 32 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | *.spec 30 | 31 | # Installer logs 32 | pip-log.txt 33 | pip-delete-this-directory.txt 34 | 35 | # Unit test / coverage reports 36 | htmlcov/ 37 | .tox/ 38 | .nox/ 39 | .coverage 40 | .coverage.* 41 | .cache 42 | nosetests.xml 43 | coverage.xml 44 | *.cover 45 | *.py,cover 46 | .hypothesis/ 47 | .pytest_cache/ 48 | 49 | # Environments 50 | .env 51 | .venv 52 | env/ 53 | venv/ 54 | ENV/ 55 | env.bak/ 56 | venv.bak/ -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "parqv" 7 | version = "0.3.0" 8 | description = "An interactive Python TUI for visualizing, exploring, and analyzing files directly in your terminal." 9 | readme = "README.md" 10 | requires-python = ">=3.10" 11 | license = "Apache-2.0" 12 | authors = [{ name = "Sangmin Yoon", email = "sanspareilsmyn@gmail.com" }] 13 | 14 | dependencies = [ 15 | "textual>=1.0.0", 16 | "pyarrow>=16.0.0", 17 | "pandas>=2.0.0", 18 | "numpy>=1.20.0", 19 | "duckdb>=1.2.0" 20 | ] 21 | 22 | [project.scripts] 23 | parqv = "parqv.app:run_app" 24 | 25 | [tool.setuptools] 26 | package-dir = {"" = "src"} 27 | 28 | [tool.setuptools.packages.find] 29 | where = ["src"] 30 | 31 | [tool.setuptools.package-data] 32 | "parqv" = ["*.css"] -------------------------------------------------------------------------------- /src/parqv/core/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Core modules for parqv application. 3 | 4 | This package contains fundamental configuration, utilities, and factory classes. 
5 | """ 6 | 7 | from .config import SUPPORTED_EXTENSIONS, DEFAULT_PREVIEW_ROWS, CSS_PATH 8 | from .logging import setup_logging, get_logger 9 | from .file_utils import FileValidationError, validate_and_detect_file, validate_file_path, detect_file_type 10 | from .handler_factory import HandlerFactory, HandlerCreationError 11 | 12 | __all__ = [ 13 | # Configuration 14 | "SUPPORTED_EXTENSIONS", 15 | "DEFAULT_PREVIEW_ROWS", 16 | "CSS_PATH", 17 | 18 | # Logging 19 | "setup_logging", 20 | "get_logger", 21 | 22 | # File utilities 23 | "FileValidationError", 24 | "validate_and_detect_file", 25 | "validate_file_path", 26 | "detect_file_type", 27 | 28 | # Factory 29 | "HandlerFactory", 30 | "HandlerCreationError", 31 | ] -------------------------------------------------------------------------------- /src/parqv/data_sources/base/exceptions.py: -------------------------------------------------------------------------------- 1 | """ 2 | Exception classes for data sources. 3 | """ 4 | 5 | 6 | class DataSourceError(Exception): 7 | """Base exception for all data source errors.""" 8 | pass 9 | 10 | 11 | class DataHandlerError(DataSourceError): 12 | """Base exception for all data handler errors.""" 13 | pass 14 | 15 | 16 | class FileValidationError(DataSourceError): 17 | """Exception raised when file validation fails.""" 18 | pass 19 | 20 | 21 | class UnsupportedFormatError(DataSourceError): 22 | """Exception raised when an unsupported file format is encountered.""" 23 | pass 24 | 25 | 26 | class DataReadError(DataSourceError): 27 | """Exception raised when data reading fails.""" 28 | pass 29 | 30 | 31 | class SchemaError(DataSourceError): 32 | """Exception raised when schema operations fail.""" 33 | pass 34 | 35 | 36 | class MetadataError(DataSourceError): 37 | """Exception raised when metadata operations fail.""" 38 | pass -------------------------------------------------------------------------------- /src/parqv/views/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Views package for parqv application. 3 | 4 | This package contains all UI views and their supporting components and utilities. 5 | """ 6 | 7 | # Main views 8 | from .metadata_view import MetadataView 9 | from .data_view import DataView 10 | from .schema_view import SchemaView 11 | 12 | # Base classes 13 | from .base import BaseView 14 | 15 | # Components (optional, for advanced usage) 16 | from .components import ErrorDisplay, LoadingDisplay, EnhancedDataTable 17 | 18 | # Utilities (optional, for advanced usage) 19 | from .utils import format_metadata_for_display, format_stats_for_display 20 | 21 | __all__ = [ 22 | # Main views - these are the primary exports 23 | "MetadataView", 24 | "DataView", 25 | "SchemaView", 26 | 27 | # Base class - for extending functionality 28 | "BaseView", 29 | 30 | # Components - for custom view development 31 | "ErrorDisplay", 32 | "LoadingDisplay", 33 | "EnhancedDataTable", 34 | 35 | # Utilities - for data formatting 36 | "format_metadata_for_display", 37 | "format_stats_for_display", 38 | ] 39 | -------------------------------------------------------------------------------- /src/parqv/data_sources/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Data sources package for parqv application. 3 | 4 | This package provides adapters for various data file formats, 5 | offering a unified interface for data access. 
6 | """ 7 | 8 | # Base classes and exceptions 9 | from .base import ( 10 | DataHandler, 11 | DataHandlerError, 12 | DataSourceError, 13 | FileValidationError, 14 | UnsupportedFormatError, 15 | DataReadError, 16 | SchemaError, 17 | MetadataError, 18 | ) 19 | 20 | # Format-specific handlers 21 | from .formats import ( 22 | ParquetHandler, 23 | ParquetHandlerError, 24 | JsonHandler, 25 | JsonHandlerError, 26 | CsvHandler, 27 | CsvHandlerError, 28 | ) 29 | 30 | __all__ = [ 31 | # Base interface and exceptions 32 | "DataHandler", 33 | "DataHandlerError", 34 | "DataSourceError", 35 | "FileValidationError", 36 | "UnsupportedFormatError", 37 | "DataReadError", 38 | "SchemaError", 39 | "MetadataError", 40 | 41 | # Format handlers 42 | "ParquetHandler", 43 | "ParquetHandlerError", 44 | "JsonHandler", 45 | "JsonHandlerError", 46 | "CsvHandler", 47 | "CsvHandlerError", 48 | ] -------------------------------------------------------------------------------- /src/parqv/core/logging.py: -------------------------------------------------------------------------------- 1 | """ 2 | Logging configuration for parqv application. 3 | """ 4 | 5 | import logging 6 | import sys 7 | from logging.handlers import RotatingFileHandler 8 | 9 | from .config import LOG_FILENAME, LOG_MAX_BYTES, LOG_BACKUP_COUNT, LOG_ENCODING 10 | 11 | 12 | def setup_logging() -> logging.Logger: 13 | """ 14 | Sets up logging configuration for the parqv application. 15 | 16 | Returns: 17 | The root logger instance configured for parqv. 18 | """ 19 | file_handler = RotatingFileHandler( 20 | LOG_FILENAME, 21 | maxBytes=LOG_MAX_BYTES, 22 | backupCount=LOG_BACKUP_COUNT, 23 | encoding=LOG_ENCODING 24 | ) 25 | 26 | logging.basicConfig( 27 | level=logging.INFO, 28 | format="%(asctime)s [%(levelname)-5.5s] %(name)s (%(filename)s:%(lineno)d) - %(message)s", 29 | handlers=[file_handler, logging.StreamHandler(sys.stdout)], 30 | force=True # Override any existing configuration 31 | ) 32 | 33 | return logging.getLogger(__name__) 34 | 35 | 36 | def get_logger(name: str) -> logging.Logger: 37 | """ 38 | Gets a logger instance for the given name. 39 | 40 | Args: 41 | name: The name for the logger (typically __name__) 42 | 43 | Returns: 44 | A logger instance. 45 | """ 46 | return logging.getLogger(name) -------------------------------------------------------------------------------- /src/parqv/views/components/loading_display.py: -------------------------------------------------------------------------------- 1 | """ 2 | Loading display component for parqv views. 3 | """ 4 | 5 | from textual.containers import Center, Middle 6 | from textual.widgets import LoadingIndicator, Label 7 | 8 | 9 | class LoadingDisplay(Center): 10 | """ 11 | A reusable component for displaying loading states in a consistent format. 12 | """ 13 | 14 | def __init__(self, message: str = "Loading...", **kwargs): 15 | """ 16 | Initialize the loading display. 
17 | 18 | Args: 19 | message: Loading message to display 20 | **kwargs: Additional arguments for Center container 21 | """ 22 | super().__init__(**kwargs) 23 | self.message = message 24 | 25 | def compose(self): 26 | """Compose the loading display layout.""" 27 | with Middle(): 28 | yield LoadingIndicator() 29 | yield Label(self.message, classes="loading-message") 30 | 31 | @classmethod 32 | def data_loading(cls, **kwargs) -> 'LoadingDisplay': 33 | """Create a loading display for data loading operations.""" 34 | return cls(message="Loading data...", **kwargs) 35 | 36 | @classmethod 37 | def metadata_loading(cls, **kwargs) -> 'LoadingDisplay': 38 | """Create a loading display for metadata loading operations.""" 39 | return cls(message="Loading metadata...", **kwargs) 40 | 41 | @classmethod 42 | def schema_loading(cls, **kwargs) -> 'LoadingDisplay': 43 | """Create a loading display for schema loading operations.""" 44 | return cls(message="Loading schema...", **kwargs) -------------------------------------------------------------------------------- /src/parqv/views/metadata_view.py: -------------------------------------------------------------------------------- 1 | """ 2 | Metadata view for displaying file metadata information. 3 | """ 4 | 5 | from textual.containers import VerticalScroll 6 | from textual.widgets import Pretty 7 | 8 | from .base import BaseView 9 | from .components import ErrorDisplay 10 | from .utils import format_metadata_for_display 11 | 12 | 13 | class MetadataView(BaseView): 14 | """ 15 | View for displaying metadata information about the loaded file. 16 | 17 | Shows file statistics, format information, and other metadata 18 | in a formatted display. 19 | """ 20 | 21 | def load_content(self) -> None: 22 | """Load and display metadata content.""" 23 | if not self.check_handler_available(): 24 | return 25 | 26 | try: 27 | # Get raw metadata from handler 28 | raw_metadata = self.handler.get_metadata_summary() 29 | 30 | # Format metadata for display 31 | formatted_metadata = format_metadata_for_display(raw_metadata) 32 | 33 | # Check if there's an error in the formatted data 34 | if "Error" in formatted_metadata and len(formatted_metadata) == 1: 35 | self.show_error(formatted_metadata["Error"]) 36 | return 37 | 38 | # Display the formatted metadata 39 | self._display_metadata(formatted_metadata) 40 | 41 | self.logger.info("Metadata loaded successfully") 42 | 43 | except Exception as e: 44 | self.show_error("Failed to load metadata", e) 45 | 46 | def _display_metadata(self, metadata: dict) -> None: 47 | """ 48 | Display the formatted metadata using Pretty widget. 49 | 50 | Args: 51 | metadata: Formatted metadata dictionary 52 | """ 53 | try: 54 | pretty_widget = Pretty(metadata, id="metadata-pretty") 55 | self.mount(pretty_widget) 56 | except Exception as e: 57 | self.logger.error(f"Failed to create Pretty widget: {e}") 58 | self.show_error("Failed to display metadata") 59 | 60 | def refresh_metadata(self) -> None: 61 | """Refresh the metadata display.""" 62 | self.clear_content() 63 | self.load_content() -------------------------------------------------------------------------------- /src/parqv/core/file_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | File utilities for parqv application. 
3 | """ 4 | 5 | from pathlib import Path 6 | from typing import Optional, Tuple 7 | 8 | from .config import SUPPORTED_EXTENSIONS 9 | from .logging import get_logger 10 | 11 | log = get_logger(__name__) 12 | 13 | 14 | class FileValidationError(Exception): 15 | """Exception raised when file validation fails.""" 16 | pass 17 | 18 | 19 | def validate_file_path(file_path_str: Optional[str]) -> Path: 20 | """ 21 | Validates and resolves the file path. 22 | 23 | Args: 24 | file_path_str: String representation of the file path 25 | 26 | Returns: 27 | Resolved Path object 28 | 29 | Raises: 30 | FileValidationError: If file path is invalid or file doesn't exist 31 | """ 32 | if not file_path_str: 33 | raise FileValidationError("No file path provided.") 34 | 35 | file_path = Path(file_path_str) 36 | log.debug(f"Validating file path: {file_path}") 37 | 38 | if not file_path.is_file(): 39 | raise FileValidationError(f"File not found or is not a regular file: {file_path}") 40 | 41 | return file_path 42 | 43 | 44 | def detect_file_type(file_path: Path) -> str: 45 | """ 46 | Detects the file type based on its extension. 47 | 48 | Args: 49 | file_path: Path object representing the file 50 | 51 | Returns: 52 | String representing the detected file type ('parquet' or 'json') 53 | 54 | Raises: 55 | FileValidationError: If file extension is not supported 56 | """ 57 | file_suffix = file_path.suffix.lower() 58 | 59 | if file_suffix not in SUPPORTED_EXTENSIONS: 60 | supported_exts = ", ".join(SUPPORTED_EXTENSIONS.keys()) 61 | raise FileValidationError( 62 | f"Unsupported file extension: '{file_suffix}'. " 63 | f"Only {supported_exts} are supported." 64 | ) 65 | 66 | detected_type = SUPPORTED_EXTENSIONS[file_suffix] 67 | log.info(f"Detected '{file_suffix}' extension, type: {detected_type}") 68 | 69 | return detected_type 70 | 71 | 72 | def validate_and_detect_file(file_path_str: Optional[str]) -> Tuple[Path, str]: 73 | """ 74 | Convenience function that validates file path and detects file type. 75 | 76 | Args: 77 | file_path_str: String representation of the file path 78 | 79 | Returns: 80 | Tuple of (validated_path, detected_type) 81 | 82 | Raises: 83 | FileValidationError: If validation or type detection fails 84 | """ 85 | file_path = validate_file_path(file_path_str) 86 | file_type = detect_file_type(file_path) 87 | 88 | return file_path, file_type -------------------------------------------------------------------------------- /src/parqv/views/components/error_display.py: -------------------------------------------------------------------------------- 1 | """ 2 | Error display component for parqv views. 3 | """ 4 | 5 | from typing import Optional 6 | 7 | from textual.containers import VerticalScroll 8 | from textual.widgets import Static, Label 9 | 10 | 11 | class ErrorDisplay(VerticalScroll): 12 | """ 13 | A reusable component for displaying error messages in a consistent format. 14 | """ 15 | 16 | def __init__(self, 17 | title: str = "Error", 18 | message: str = "An error occurred", 19 | details: Optional[str] = None, 20 | **kwargs): 21 | """ 22 | Initialize the error display. 
23 | 24 | Args: 25 | title: Error title/category 26 | message: Main error message 27 | details: Optional detailed error information 28 | **kwargs: Additional arguments for VerticalScroll 29 | """ 30 | super().__init__(**kwargs) 31 | self.title = title 32 | self.message = message 33 | self.details = details 34 | 35 | def compose(self): 36 | """Compose the error display layout.""" 37 | yield Label(self.title, classes="error-title") 38 | yield Static(f"[red]{self.message}[/red]", classes="error-content") 39 | 40 | if self.details: 41 | yield Static("Details:", classes="error-details-label") 42 | yield Static(f"[dim]{self.details}[/dim]", classes="error-details") 43 | 44 | @classmethod 45 | def file_not_found(cls, file_path: str, **kwargs) -> 'ErrorDisplay': 46 | """Create an error display for file not found errors.""" 47 | return cls( 48 | title="File Not Found", 49 | message=f"Could not find file: {file_path}", 50 | details="Please check that the file path is correct and the file exists.", 51 | **kwargs 52 | ) 53 | 54 | @classmethod 55 | def handler_not_available(cls, **kwargs) -> 'ErrorDisplay': 56 | """Create an error display for missing data handler.""" 57 | return cls( 58 | title="Data Handler Not Available", 59 | message="No data handler is currently loaded", 60 | details="This usually means the file could not be processed or loaded.", 61 | **kwargs 62 | ) 63 | 64 | @classmethod 65 | def data_loading_error(cls, error_msg: str, **kwargs) -> 'ErrorDisplay': 66 | """Create an error display for data loading errors.""" 67 | return cls( 68 | title="Data Loading Error", 69 | message="Failed to load data from the file", 70 | details=f"Technical details: {error_msg}", 71 | **kwargs 72 | ) -------------------------------------------------------------------------------- /src/parqv/views/base.py: -------------------------------------------------------------------------------- 1 | """ 2 | Base classes for parqv views. 3 | """ 4 | 5 | from typing import Optional 6 | 7 | from textual.containers import Container 8 | from textual.widgets import Static 9 | 10 | from ..core import get_logger 11 | from ..data_sources import DataHandler 12 | 13 | 14 | class BaseView(Container): 15 | """ 16 | Base class for all parqv views. 17 | 18 | Provides common functionality for data loading, error handling, 19 | and handler access. 20 | """ 21 | 22 | def __init__(self, **kwargs): 23 | super().__init__(**kwargs) 24 | self._is_mounted = False 25 | 26 | @property 27 | def logger(self): 28 | """Get a logger for this view.""" 29 | return get_logger(f"{self.__class__.__module__}.{self.__class__.__name__}") 30 | 31 | @property 32 | def handler(self) -> Optional[DataHandler]: 33 | """Get the data handler from the app.""" 34 | if hasattr(self.app, 'handler'): 35 | return self.app.handler 36 | return None 37 | 38 | def on_mount(self) -> None: 39 | """Called when the view is mounted.""" 40 | self._is_mounted = True 41 | self.load_content() 42 | 43 | def load_content(self) -> None: 44 | """ 45 | Load the main content for this view. Must be implemented by subclasses. 
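A typical implementation calls check_handler_available(), pulls data from self.handler, and mounts widgets (see MetadataView.load_content for a concrete example).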
46 | 47 | Raises: 48 | NotImplementedError: If not implemented by subclass 49 | """ 50 | raise NotImplementedError("Subclasses must implement load_content()") 51 | 52 | def clear_content(self) -> None: 53 | """Clear all content from the view.""" 54 | try: 55 | self.query("*").remove() 56 | except Exception as e: 57 | self.logger.error(f"Error clearing content: {e}") 58 | 59 | def show_error(self, message: str, exception: Optional[Exception] = None) -> None: 60 | """ 61 | Display an error message in the view. 62 | 63 | Args: 64 | message: Error message to display 65 | exception: Optional exception that caused the error 66 | """ 67 | if exception: 68 | self.logger.exception(f"Error in {self.__class__.__name__}: {message}") 69 | else: 70 | self.logger.error(f"Error in {self.__class__.__name__}: {message}") 71 | 72 | self.clear_content() 73 | error_widget = Static(f"[red]Error: {message}[/red]", classes="error-content") 74 | self.mount(error_widget) 75 | 76 | def show_info(self, message: str) -> None: 77 | """ 78 | Display an informational message in the view. 79 | 80 | Args: 81 | message: Info message to display 82 | """ 83 | self.logger.info(f"Info in {self.__class__.__name__}: {message}") 84 | self.clear_content() 85 | info_widget = Static(f"[blue]Info: {message}[/blue]", classes="info-content") 86 | self.mount(info_widget) 87 | 88 | def check_handler_available(self) -> bool: 89 | """ 90 | Check if handler is available and show error if not. 91 | 92 | Returns: 93 | True if handler is available, False otherwise 94 | """ 95 | if not self.handler: 96 | self.show_error("Data handler not available") 97 | return False 98 | return True 99 | -------------------------------------------------------------------------------- /src/parqv/core/handler_factory.py: -------------------------------------------------------------------------------- 1 | """ 2 | Handler factory for creating appropriate data handlers based on file type. 3 | """ 4 | 5 | from pathlib import Path 6 | from typing import Optional 7 | 8 | from ..data_sources import DataHandler, DataHandlerError, ParquetHandler, JsonHandler, CsvHandler 9 | from .logging import get_logger 10 | 11 | log = get_logger(__name__) 12 | 13 | 14 | class HandlerCreationError(Exception): 15 | """Exception raised when handler creation fails.""" 16 | pass 17 | 18 | 19 | class HandlerFactory: 20 | """Factory class for creating data handlers.""" 21 | 22 | # Registry of handler types to handler classes 23 | _HANDLER_REGISTRY = { 24 | "parquet": ParquetHandler, 25 | "json": JsonHandler, 26 | "csv": CsvHandler, 27 | } 28 | 29 | @classmethod 30 | def create_handler(cls, file_path: Path, handler_type: str) -> DataHandler: 31 | """ 32 | Creates an appropriate handler for the given file type. 33 | 34 | Args: 35 | file_path: Path to the data file 36 | handler_type: Type of handler to create ('parquet' or 'json') 37 | 38 | Returns: 39 | An instance of the appropriate DataHandler subclass 40 | 41 | Raises: 42 | HandlerCreationError: If handler creation fails 43 | """ 44 | if handler_type not in cls._HANDLER_REGISTRY: 45 | available_types = ", ".join(cls._HANDLER_REGISTRY.keys()) 46 | raise HandlerCreationError( 47 | f"Unknown handler type: '{handler_type}'. 
" 48 | f"Available types: {available_types}" 49 | ) 50 | 51 | handler_class = cls._HANDLER_REGISTRY[handler_type] 52 | 53 | log.info(f"Creating {handler_type.capitalize()} handler for: {file_path}") 54 | 55 | try: 56 | handler = handler_class(file_path) 57 | log.info(f"{handler_type.capitalize()} handler created successfully.") 58 | return handler 59 | 60 | except DataHandlerError as e: 61 | log.error(f"Failed to create {handler_type} handler: {e}") 62 | raise HandlerCreationError(f"Failed to initialize {handler_type} handler: {e}") from e 63 | 64 | except Exception as e: 65 | log.exception(f"Unexpected error creating {handler_type} handler") 66 | raise HandlerCreationError( 67 | f"Unexpected error during {handler_type} handler creation: {e}" 68 | ) from e 69 | 70 | @classmethod 71 | def get_supported_types(cls) -> list[str]: 72 | """ 73 | Returns a list of supported handler types. 74 | 75 | Returns: 76 | List of supported handler type strings 77 | """ 78 | return list(cls._HANDLER_REGISTRY.keys()) 79 | 80 | @classmethod 81 | def register_handler(cls, handler_type: str, handler_class: type[DataHandler]) -> None: 82 | """ 83 | Registers a new handler type (for extensibility). 84 | 85 | Args: 86 | handler_type: String identifier for the handler type 87 | handler_class: Class that implements DataHandler interface 88 | """ 89 | log.info(f"Registering handler type '{handler_type}' with class {handler_class.__name__}") 90 | cls._HANDLER_REGISTRY[handler_type] = handler_class -------------------------------------------------------------------------------- /src/parqv/parqv.css: -------------------------------------------------------------------------------- 1 | /* --- Base Screen Styles --- */ 2 | Screen { 3 | background: $surface; 4 | color: $text; 5 | } 6 | 7 | /* --- Header & Footer Styles --- */ 8 | Header { 9 | background: $primary; 10 | } 11 | Footer { 12 | background: $primary-darken-1; 13 | } 14 | Footer > .footer--key { 15 | color: $text-muted; 16 | } 17 | Footer > .footer--highlight-key { 18 | background: $accent-darken-1; 19 | color: $text; 20 | text-style: bold; 21 | } 22 | 23 | /* --- Tabbed Interface Styles --- */ 24 | TabbedContent { 25 | height: 100%; 26 | } 27 | TabbedContent > Tabs { 28 | background: $primary-darken-1; 29 | color: $text-muted; 30 | } 31 | TabbedContent > Tabs > Tab { 32 | padding: 1 2; 33 | } 34 | TabbedContent > Tabs > Tab:hover { 35 | background: $primary; 36 | } 37 | TabbedContent > Tabs > .--current { 38 | background: $accent; 39 | color: $text; 40 | text-style: bold; 41 | } 42 | TabbedContent > Content { 43 | padding: 1 2; 44 | height: 1fr; 45 | width: 100%; 46 | overflow: hidden; 47 | } 48 | TabbedContent > Content > * { 49 | height: 100%; 50 | width: 100%; 51 | } 52 | 53 | /* --- Schema Tab (#schema-view - VerticalScroll) --- */ 54 | #schema-view { 55 | padding: 0; 56 | } 57 | #schema-view > ListView#column-list-view { 58 | border: round $accent-lighten-2; 59 | margin-bottom: 1; 60 | background: $primary-background; 61 | overflow: auto; 62 | } 63 | #schema-view > ListView#column-list-view > ListItem { 64 | padding: 0 1; 65 | height: auto; 66 | } 67 | #schema-view > ListView#column-list-view > ListItem.--highlight { 68 | background: $accent; 69 | color: $text; 70 | } 71 | #schema-view > ListView#column-list-view > ListItem.--highlight Label { 72 | color: $text; 73 | } 74 | #schema-view > LoadingIndicator#schema-loading-indicator { 75 | margin: 1 0; 76 | width: 100%; 77 | text-align: center; 78 | } 79 | #schema-view > Container#schema-stats-content { 80 | 
padding: 1; 81 | overflow: auto; 82 | } 83 | #schema-view .stats-line { 84 | margin-bottom: 0; 85 | height: auto; 86 | width: 100%; 87 | } 88 | #schema-view .stats-code { 89 | background: $panel-darken-1; 90 | border: solid $accent-lighten-1; 91 | padding: 0 1; 92 | margin: 1 0; 93 | width: 100%; 94 | height: auto; 95 | overflow: auto; 96 | } 97 | #schema-view .stats-error { 98 | color: $error; 99 | } 100 | 101 | /* --- Metadata Tab (#metadata-view - VerticalScroll) --- */ 102 | #metadata-view { 103 | overflow-y: auto; 104 | } 105 | #metadata-view > Pretty { 106 | width: 100%; 107 | } 108 | 109 | /* --- Data Preview Tab (#data-view - Container) --- */ 110 | #data-view { 111 | } 112 | #data-view > DataTable { 113 | height: 100%; 114 | width: 100%; 115 | } 116 | 117 | /* --- Row Groups Tab (#rowgroup-view - VerticalScroll) --- */ 118 | #rowgroup-view { 119 | overflow-y: auto; 120 | } 121 | #rowgroup-view > DataTable { 122 | width: 100%; 123 | } 124 | 125 | /* --- General Widget Styles --- */ 126 | DataTable { 127 | margin-top: 1; 128 | } 129 | DataTable > Header { 130 | background: $secondary; 131 | color: $text; 132 | text-style: bold; 133 | } 134 | DataTable > Body > Row.--cursor { 135 | background: $accent; 136 | color: $text; 137 | } 138 | DataTable > Body > Row:hover { 139 | background: $secondary-darken-2; 140 | } 141 | 142 | /* --- Error Message Styles (Used in App Level Error) --- */ 143 | .error-title { 144 | color: $error; 145 | text-style: bold; 146 | margin-bottom: 1; 147 | } 148 | .error-content { 149 | color: $error; 150 | } -------------------------------------------------------------------------------- /src/parqv/cli.py: -------------------------------------------------------------------------------- 1 | """ 2 | Command Line Interface for parqv application. 3 | """ 4 | 5 | import sys 6 | 7 | from .app import ParqV 8 | from .core import SUPPORTED_EXTENSIONS, FileValidationError, validate_and_detect_file, setup_logging, get_logger 9 | 10 | 11 | def _print_user_message(message: str, log_level: str = "info") -> None: 12 | """ 13 | Show a message to the user and log it. 14 | 15 | Args: 16 | message: message to display and log 17 | log_level: log level ('info', 'error', 'warning') 18 | """ 19 | log = get_logger(__name__) 20 | 21 | print(message, file=sys.stderr) 22 | 23 | if log_level == "error": 24 | log.error(message) 25 | elif log_level == "warning": 26 | log.warning(message) 27 | else: 28 | log.info(message) 29 | 30 | 31 | def validate_cli_arguments() -> str: 32 | """ 33 | Validates command line arguments. 34 | 35 | Returns: 36 | The file path string from command line arguments 37 | 38 | Raises: 39 | SystemExit: If arguments are invalid 40 | """ 41 | log = get_logger(__name__) 42 | 43 | if len(sys.argv) < 2: 44 | usage_message = "Usage: parqv " 45 | supported_message = f"Supported file types: {', '.join(SUPPORTED_EXTENSIONS.keys())}" 46 | 47 | _print_user_message(usage_message, "error") 48 | _print_user_message(supported_message, "info") 49 | 50 | log.error("No file path provided via CLI arguments") 51 | sys.exit(1) 52 | 53 | file_path_str = sys.argv[1] 54 | log.debug(f"File path received from CLI: {file_path_str}") 55 | return file_path_str 56 | 57 | 58 | def run_app() -> None: 59 | """ 60 | Main entry point for the parqv CLI application. 61 | 62 | This function: 63 | 1. Sets up logging 64 | 2. Validates command line arguments 65 | 3. Validates the file path and type 66 | 4. 
Creates and runs the Textual app 67 | """ 68 | # Setup logging first 69 | log = setup_logging() 70 | log.info("--- parqv CLI started ---") 71 | 72 | try: 73 | # Get and validate CLI arguments 74 | file_path_str = validate_cli_arguments() 75 | 76 | # Validate file path and detect type (for early validation) 77 | file_path, file_type = validate_and_detect_file(file_path_str) 78 | log.info(f"File validated successfully: {file_path} (type: {file_type})") 79 | 80 | # Create and run the app 81 | log.info("Starting parqv application...") 82 | app = ParqV(file_path_str=file_path_str) 83 | app.run() 84 | 85 | log.info("parqv application finished successfully") 86 | 87 | except FileValidationError as e: 88 | log.error(f"File validation failed: {e}") 89 | 90 | error_message = f"Error: {e}" 91 | help_message = f"Please provide a file with one of these extensions: {', '.join(SUPPORTED_EXTENSIONS.keys())}" 92 | 93 | _print_user_message(error_message, "error") 94 | _print_user_message(help_message, "info") 95 | 96 | log.error("Exiting due to file validation error") 97 | sys.exit(1) 98 | 99 | except KeyboardInterrupt: 100 | log.info("Application interrupted by user (Ctrl+C)") 101 | _print_user_message("\nApplication interrupted by user.", "info") 102 | sys.exit(0) 103 | 104 | except Exception as e: 105 | log.exception(f"Unexpected error in CLI: {e}") 106 | _print_user_message(f"An unexpected error occurred: {e}", "error") 107 | _print_user_message("Check the log file for more details.", "info") 108 | sys.exit(1) 109 | 110 | 111 | if __name__ == "__main__": 112 | run_app() 113 | -------------------------------------------------------------------------------- /src/parqv/views/data_view.py: -------------------------------------------------------------------------------- 1 | """ 2 | Data view for displaying tabular data preview. 3 | """ 4 | 5 | from typing import Optional 6 | 7 | import pandas as pd 8 | from textual.app import ComposeResult 9 | 10 | from .base import BaseView 11 | from .components import EnhancedDataTable 12 | from ..core import DEFAULT_PREVIEW_ROWS 13 | 14 | 15 | class DataView(BaseView): 16 | """ 17 | View for displaying a preview of the data in tabular format. 18 | 19 | Shows the first N rows of data in an interactive table format 20 | with proper error handling and loading states. 21 | """ 22 | 23 | def __init__(self, preview_rows: int = DEFAULT_PREVIEW_ROWS, **kwargs): 24 | """ 25 | Initialize the data view. 
26 | 27 | Args: 28 | preview_rows: Number of rows to show in preview 29 | **kwargs: Additional arguments for BaseView 30 | """ 31 | super().__init__(**kwargs) 32 | self.preview_rows = preview_rows 33 | self._data_table: Optional[EnhancedDataTable] = None 34 | 35 | def compose(self) -> ComposeResult: 36 | """Compose the data view layout.""" 37 | self._data_table = EnhancedDataTable(id="data-preview-table") 38 | yield self._data_table 39 | 40 | def load_content(self) -> None: 41 | """Load and display data content.""" 42 | if not self.check_handler_available(): 43 | return 44 | 45 | if not self._data_table: 46 | self.show_error("Data table component not initialized") 47 | return 48 | 49 | try: 50 | # Get data preview from handler 51 | self.logger.info(f"Loading data preview ({self.preview_rows} rows)") 52 | df = self.handler.get_data_preview(num_rows=self.preview_rows) 53 | 54 | # Validate DataFrame 55 | if df is None: 56 | self.show_error("Could not load data preview - handler returned None") 57 | return 58 | 59 | # Handle error DataFrame (some handlers return error as DataFrame) 60 | if self._is_error_dataframe(df): 61 | error_msg = self._extract_error_from_dataframe(df) 62 | self.show_error(error_msg) 63 | return 64 | 65 | # Load DataFrame into table 66 | success = self._data_table.load_dataframe(df, max_rows=self.preview_rows) 67 | 68 | if success: 69 | self.logger.info(f"Data preview loaded successfully: {len(df)} rows") 70 | else: 71 | self.show_error("Failed to load data into table component") 72 | 73 | except Exception as e: 74 | self.show_error("Failed to load data preview", e) 75 | 76 | def _is_error_dataframe(self, df: pd.DataFrame) -> bool: 77 | """ 78 | Check if the DataFrame represents an error condition. 79 | 80 | Args: 81 | df: DataFrame to check 82 | 83 | Returns: 84 | True if the DataFrame contains error information 85 | """ 86 | return ( 87 | not df.empty and 88 | "error" in df.columns and 89 | len(df.columns) == 1 90 | ) 91 | 92 | def _extract_error_from_dataframe(self, df: pd.DataFrame) -> str: 93 | """ 94 | Extract error message from an error DataFrame. 95 | 96 | Args: 97 | df: Error DataFrame 98 | 99 | Returns: 100 | Error message string 101 | """ 102 | try: 103 | if not df.empty and "error" in df.columns: 104 | return str(df["error"].iloc[0]) 105 | except Exception: 106 | pass 107 | return "Unknown error in data loading" 108 | 109 | def refresh_data(self) -> None: 110 | """Refresh the data display.""" 111 | self.clear_content() 112 | self.load_content() 113 | 114 | def set_preview_rows(self, new_rows: int) -> None: 115 | """ 116 | Update the number of preview rows and refresh display. 117 | 118 | Args: 119 | new_rows: New number of rows to preview 120 | """ 121 | if new_rows > 0: 122 | self.preview_rows = new_rows 123 | self.refresh_data() 124 | else: 125 | self.logger.warning(f"Invalid preview_rows value: {new_rows}") 126 | 127 | def get_current_data(self) -> Optional[pd.DataFrame]: 128 | """ 129 | Get the currently displayed data if available. 
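Note: this re-queries the handler for a fresh preview rather than returning a cached copy of what is currently on screen.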
130 | 131 | Returns: 132 | Currently loaded DataFrame or None 133 | """ 134 | if not self.handler: 135 | return None 136 | 137 | try: 138 | return self.handler.get_data_preview(num_rows=self.preview_rows) 139 | except Exception as e: 140 | self.logger.error(f"Failed to get current data: {e}") 141 | return None 142 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # parqv 2 | 3 | [![Python Version](https://img.shields.io/badge/Python-3.10+-blue.svg)](https://www.python.org/) 4 | [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](LICENSE) 5 | [![PyPI version](https://badge.fury.io/py/parqv.svg)](https://badge.fury.io/py/parqv) 6 | [![Built with Textual](https://img.shields.io/badge/Built%20with-Textual-blueviolet.svg)](https://textual.textualize.io/) 7 | 8 | --- 9 | 10 | **Supported File Formats:** ✅ **Parquet** | ✅ **JSON** / **JSON Lines (ndjson)** | ✅ **CSV / TSV** | *(More planned!)* 11 | 12 | --- 13 | 14 | **`parqv` is a Python-based interactive TUI (Text User Interface) tool designed to explore, analyze, and understand various data file formats directly within your terminal.** `parqv` aims to provide a unified, visual experience for quick data inspection without leaving your console. 15 | 16 | ## 💻 Demo 17 | ![parqv.gif](assets/parqv.gif) 18 | *(Demo shows Parquet features; UI adapts for other formats)* 19 | 20 | ## 🤔 Why `parqv`? 21 | 1. **Unified Interface:** Launch `parqv ` to access **metadata, schema, data preview, and column statistics** all within a single, navigable terminal window. No more juggling different commands for different file types. 22 | 2. **Interactive Exploration:** 23 | * **🖱️ Keyboard & Mouse Driven:** Navigate using familiar keys (arrows, `hjkl`, Tab) or even your mouse (thanks to `Textual`). 24 | * **📜 Scrollable Views:** Easily scroll through large schemas, data tables, or column lists. 25 | * **🌲 Clear Schema View:** Understand column names, data types, and nullability at a glance. (Complex nested structures visualization might vary by format). 26 | * **📊 Dynamic Stats:** Select a column and instantly see its detailed statistics (counts, nulls, min/max, mean, distinct values, etc.). 27 | 3. **Cross-Format Consistency:** 28 | * **🎨 Rich Display:** Leverages `rich` and `Textual` for colorful, readable tables and text across supported formats. 29 | * **📈 Quick Stats:** Get key statistical insights consistently, regardless of the underlying file type. 30 | * **🔌 Extensible:** Designed with a handler interface to easily add support for more file formats in the future (like CSV, Arrow IPC, etc.). 31 | 32 | ## ✨ Features (TUI Mode) 33 | * **Multi-Format Support:** Now supports **Parquet** (`.parquet`), **JSON/JSON Lines** (`.json`, `.ndjson`), and **CSV/TSV** (`.csv`, `.tsv`). Run `parqv `. 34 | * **Metadata Panel:** Displays key file information (path, format, size, total rows, column count, etc.). *Fields may vary slightly depending on the file format.* 35 | * **Schema Explorer:** 36 | * Interactive list view of columns. 37 | * Clearly shows column names, data types, and nullability. 38 | * **Data Table Viewer:** 39 | * Scrollable table preview of the file's data. 40 | * Attempts to preserve data types for better representation. 41 | * **Column Statistics Viewer:** 42 | * Select a column in the Schema tab to view detailed statistics. 
43 | * Shows counts (total, valid, null), percentages, and type-specific stats (min/max, mean, stddev, distinct counts, length stats, boolean value counts where applicable). 44 | * **Row Group Inspector (Parquet Specific):** 45 | * *This panel only appears when viewing Parquet files.* 46 | * Lists row groups with stats (row count, compressed/uncompressed size). 47 | * (Planned) Select a row group for more details. 48 | 49 | ## 🚀 Getting Started 50 | 51 | **1. Prerequisites:** 52 | * **Python:** Version 3.10 or higher. 53 | * **pip:** The Python package installer. 54 | 55 | **2. Install `parqv`:** 56 | * Open your terminal and run: 57 | ```bash 58 | pip install parqv 59 | ``` 60 | *(This will also install dependencies like `textual`, `pyarrow`, `pandas`, and `duckdb`)* 61 | * **Updating `parqv`:** 62 | ```bash 63 | pip install --upgrade parqv 64 | ``` 65 | 66 | **3. Run `parqv`:** 67 | * Point `parqv` to your data file: 68 | ```bash 69 | #parquet 70 | parqv /path/to/your/data.parquet 71 | 72 | # json 73 | parqv /path/to/your/data.json 74 | * The interactive TUI will launch. Use your keyboard (and mouse, if supported by your terminal) to navigate: 75 | * **Arrow Keys / `j`,`k` (in lists):** Move selection up/down. 76 | * **`Tab` / `Shift+Tab`:** Cycle focus between the main tab content and potentially other areas. (Focus handling might evolve). 77 | * **`Enter` (in column list):** Select a column to view statistics. 78 | * **View Switching:** Use `Ctrl+N` (Next Tab) and `Ctrl+P` (Previous Tab) or click on the tabs (Metadata, Schema, Data Preview). 79 | * **Scrolling:** Use `PageUp` / `PageDown` / `Home` / `End` or arrow keys/mouse wheel within scrollable areas (like Schema stats or Data Preview). 80 | * **`q` / `Ctrl+C`:** Quit `parqv`. 81 | * *(Help Screen `?` is planned)* 82 | 83 | --- 84 | 85 | ## 📄 License 86 | 87 | Licensed under the Apache License, Version 2.0. See [LICENSE](LICENSE) for the full license text. 88 | -------------------------------------------------------------------------------- /src/parqv/data_sources/base/handler.py: -------------------------------------------------------------------------------- 1 | """ 2 | Base data handler interface for parqv data sources. 3 | """ 4 | 5 | from abc import ABC, abstractmethod 6 | from pathlib import Path 7 | from typing import Any, Dict, List, Optional 8 | 9 | import pandas as pd 10 | 11 | from ...core import get_logger 12 | 13 | 14 | class DataHandler(ABC): 15 | """ 16 | Abstract Base Class for data handlers. 17 | 18 | Defines the common interface required by the ParqV application 19 | to interact with different data file formats. 20 | """ 21 | 22 | def __init__(self, file_path: Path): 23 | """ 24 | Initialize the handler with the file path. 25 | 26 | Subclasses should open the file or set up necessary resources here. 27 | 28 | Args: 29 | file_path: Path to the data file. 30 | 31 | Raises: 32 | DataHandlerError: If initialization fails (e.g., file not found, format error). 33 | """ 34 | self.file_path = file_path 35 | self.logger = get_logger(f"{self.__class__.__module__}.{self.__class__.__name__}") 36 | 37 | @abstractmethod 38 | def close(self) -> None: 39 | """ 40 | Close any open resources (files, connections, etc.). 41 | 42 | Must be implemented by subclasses. 43 | """ 44 | pass 45 | 46 | @abstractmethod 47 | def get_metadata_summary(self) -> Dict[str, Any]: 48 | """ 49 | Get a dictionary containing summary metadata about the data source. 50 | 51 | Keys should be human-readable strings. Values can be of various types. 
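For example (illustrative values), a Parquet handler might return {"File Path": "/data/example.parquet", "Format": "Parquet", "Total Rows": 891, "Size": "59.8 KB"}.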
52 | Should include an 'error' key if metadata retrieval fails. 53 | 54 | Returns: 55 | A dictionary with metadata summary or an error dictionary. 56 | """ 57 | pass 58 | 59 | @abstractmethod 60 | def get_schema_data(self) -> Optional[List[Dict[str, Any]]]: 61 | """ 62 | Get the schema as a list of dictionaries. 63 | 64 | Each dictionary should represent a column and ideally contain keys: 65 | - 'name' (str): Column name. 66 | - 'type' (str): Formatted data type string. 67 | - 'nullable' (Any): Indicator of nullability (e.g., bool, str "YES"/"NO"). 68 | 69 | Returns: 70 | A list of schema dictionaries, an empty list if no columns, 71 | or None if schema retrieval failed. 72 | """ 73 | pass 74 | 75 | @abstractmethod 76 | def get_data_preview(self, num_rows: int = 50) -> Optional[pd.DataFrame]: 77 | """ 78 | Fetch a preview of the data. 79 | 80 | Args: 81 | num_rows: The maximum number of rows to fetch. 82 | 83 | Returns: 84 | A pandas DataFrame with preview data, an empty DataFrame if no data, 85 | a DataFrame with an 'error' column on failure, or None on critical failure. 86 | """ 87 | pass 88 | 89 | @abstractmethod 90 | def get_column_stats(self, column_name: str) -> Dict[str, Any]: 91 | """ 92 | Calculate and return statistics for a specific column. 93 | 94 | The returned dictionary should ideally contain keys like: 95 | - 'column' (str): Column name. 96 | - 'type' (str): Formatted data type string. 97 | - 'nullable' (Any): Nullability indicator. 98 | - 'calculated' (Dict[str, Any]): Dictionary of computed statistics. 99 | - 'error' (Optional[str]): Error message if calculation failed. 100 | - 'message' (Optional[str]): Informational message. 101 | 102 | Args: 103 | column_name: The name of the column. 104 | 105 | Returns: 106 | A dictionary containing column statistics or error information. 107 | """ 108 | pass 109 | 110 | def format_size(self, num_bytes: int) -> str: 111 | """ 112 | Format bytes into a human-readable string. 113 | 114 | Args: 115 | num_bytes: Number of bytes to format 116 | 117 | Returns: 118 | Human-readable size string 119 | """ 120 | if num_bytes < 1024: 121 | return f"{num_bytes} bytes" 122 | elif num_bytes < 1024 ** 2: 123 | return f"{num_bytes / 1024:.1f} KB" 124 | elif num_bytes < 1024 ** 3: 125 | return f"{num_bytes / 1024 ** 2:.1f} MB" 126 | else: 127 | return f"{num_bytes / 1024 ** 3:.1f} GB" 128 | 129 | def __enter__(self): 130 | """Enter the runtime context related to this object.""" 131 | return self 132 | 133 | def __exit__(self, exc_type, exc_val, exc_tb): 134 | """Exit the runtime context related to this object, ensuring cleanup.""" 135 | self.close() 136 | 137 | def __del__(self): 138 | """Attempt to close the handler when the object is garbage collected (best effort).""" 139 | try: 140 | self.close() 141 | except Exception: 142 | # Ignore exceptions during garbage collection 143 | pass 144 | -------------------------------------------------------------------------------- /src/parqv/views/components/enhanced_data_table.py: -------------------------------------------------------------------------------- 1 | """ 2 | Enhanced data table component for parqv views. 
3 | """ 4 | 5 | from typing import Optional, List, Tuple, Any 6 | 7 | import pandas as pd 8 | from textual.containers import Container 9 | from textual.widgets import DataTable, Static 10 | 11 | from ...core import get_logger 12 | 13 | log = get_logger(__name__) 14 | 15 | 16 | class EnhancedDataTable(Container): 17 | """ 18 | An enhanced data table component that handles DataFrame display with better error handling. 19 | """ 20 | 21 | def __init__(self, **kwargs): 22 | super().__init__(**kwargs) 23 | self._table: Optional[DataTable] = None 24 | 25 | def compose(self): 26 | """Compose the data table layout.""" 27 | self._table = DataTable(id="enhanced-data-table") 28 | self._table.cursor_type = "row" 29 | yield self._table 30 | 31 | def clear_table(self) -> bool: 32 | """ 33 | Clear the table contents safely. 34 | 35 | Returns: 36 | True if cleared successfully, False if recreation was needed 37 | """ 38 | if not self._table: 39 | return False 40 | 41 | try: 42 | self._table.clear(columns=True) 43 | return True 44 | except Exception as e: 45 | log.warning(f"Failed to clear table, recreating: {e}") 46 | return self._recreate_table() 47 | 48 | def _recreate_table(self) -> bool: 49 | """ 50 | Recreate the table if clearing failed. 51 | 52 | Returns: 53 | True if recreation was successful, False otherwise 54 | """ 55 | try: 56 | if self._table: 57 | self._table.remove() 58 | 59 | self._table = DataTable(id="enhanced-data-table") 60 | self._table.cursor_type = "row" 61 | self.mount(self._table) 62 | return True 63 | except Exception as e: 64 | log.error(f"Failed to recreate table: {e}") 65 | return False 66 | 67 | def load_dataframe(self, df: pd.DataFrame, max_rows: Optional[int] = None) -> bool: 68 | """ 69 | Load a pandas DataFrame into the table. 70 | 71 | Args: 72 | df: The DataFrame to load 73 | max_rows: Optional maximum number of rows to display 74 | 75 | Returns: 76 | True if loaded successfully, False otherwise 77 | """ 78 | if not self._table: 79 | log.error("Table not initialized") 80 | return False 81 | 82 | try: 83 | # Clear existing content 84 | if not self.clear_table(): 85 | return False 86 | 87 | # Handle empty DataFrame 88 | if df.empty: 89 | self._show_empty_message() 90 | return True 91 | 92 | # Limit rows if specified 93 | display_df = df.head(max_rows) if max_rows else df 94 | 95 | # Add columns 96 | columns = [str(col) for col in display_df.columns] 97 | self._table.add_columns(*columns) 98 | 99 | # Add rows 100 | rows_data = self._prepare_rows_data(display_df) 101 | self._table.add_rows(rows_data) 102 | 103 | log.info(f"Loaded {len(display_df)} rows and {len(columns)} columns into table") 104 | return True 105 | 106 | except Exception as e: 107 | log.exception(f"Error loading DataFrame into table: {e}") 108 | self._show_error_message(f"Failed to load data: {e}") 109 | return False 110 | 111 | def _prepare_rows_data(self, df: pd.DataFrame) -> List[Tuple[str, ...]]: 112 | """ 113 | Prepare DataFrame rows for the DataTable. 
114 | 115 | Args: 116 | df: The DataFrame to process 117 | 118 | Returns: 119 | List of tuples representing table rows 120 | """ 121 | rows_data = [] 122 | for row in df.itertuples(index=False, name=None): 123 | # Convert each item to string, handling NaN values 124 | row_strings = tuple( 125 | str(item) if pd.notna(item) else "" 126 | for item in row 127 | ) 128 | rows_data.append(row_strings) 129 | return rows_data 130 | 131 | def _show_empty_message(self) -> None: 132 | """Show a message when the DataFrame is empty.""" 133 | try: 134 | self.query("Static").remove() # Remove any existing messages 135 | empty_msg = Static("No data available in the selected range or file is empty.", 136 | classes="info-content") 137 | self.mount(empty_msg) 138 | except Exception as e: 139 | log.error(f"Failed to show empty message: {e}") 140 | 141 | def _show_error_message(self, message: str) -> None: 142 | """Show an error message in the table area.""" 143 | try: 144 | self.query("DataTable, Static").remove() # Remove table and any messages 145 | error_msg = Static(f"[red]{message}[/red]", classes="error-content") 146 | self.mount(error_msg) 147 | except Exception as e: 148 | log.error(f"Failed to show error message: {e}") 149 | 150 | def get_table(self) -> Optional[DataTable]: 151 | """Get the underlying DataTable widget.""" 152 | return self._table -------------------------------------------------------------------------------- /src/parqv/views/utils/data_formatters.py: -------------------------------------------------------------------------------- 1 | """ 2 | Data formatting utilities for parqv views. 3 | """ 4 | 5 | from typing import Any, Dict, Union 6 | from rich.text import Text 7 | 8 | 9 | def format_metadata_for_display(metadata: Dict[str, Any]) -> Dict[str, Any]: 10 | """ 11 | Format metadata dictionary for consistent display. 
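For example (illustrative), {"Total Rows": 1234567, "Format": "parquet"} is rendered as {"Total Rows": "1,234,567", "Format": "PARQUET"}.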
12 | 13 | Args: 14 | metadata: Raw metadata dictionary from handler 15 | 16 | Returns: 17 | Formatted metadata dictionary ready for display 18 | """ 19 | if not metadata: 20 | return {"Error": "No metadata available"} 21 | 22 | # Check for error in metadata 23 | if "error" in metadata: 24 | return {"Error": metadata["error"]} 25 | 26 | formatted = {} 27 | 28 | # Format specific known fields with better presentation 29 | field_formatters = { 30 | "File Path": lambda x: str(x), 31 | "Path": lambda x: str(x), 32 | "Format": lambda x: str(x).upper(), 33 | "Total Rows": lambda x: _format_number(x), 34 | "Total Columns": lambda x: _format_number(x), 35 | "Columns": lambda x: _format_number(x), 36 | "Size": lambda x: _format_size_if_bytes(x), 37 | "Memory Usage": lambda x: _format_size_if_bytes(x), 38 | "DuckDB View": lambda x: f"`{x}`" if x else "N/A", 39 | } 40 | 41 | for key, value in metadata.items(): 42 | if isinstance(value, dict): 43 | # Handle nested dictionaries (like grouped metadata) 44 | formatted[key] = _format_nested_metadata(value, field_formatters) 45 | elif key in field_formatters: 46 | formatted[key] = field_formatters[key](value) 47 | else: 48 | formatted[key] = format_value_for_display(value) 49 | 50 | return formatted 51 | 52 | 53 | def _format_nested_metadata(nested_dict: Dict[str, Any], field_formatters: Dict) -> Dict[str, Any]: 54 | """Format nested metadata dictionaries.""" 55 | formatted_nested = {} 56 | 57 | for key, value in nested_dict.items(): 58 | if isinstance(value, dict): 59 | # Handle further nesting if needed 60 | formatted_nested[key] = _format_nested_metadata(value, field_formatters) 61 | elif key in field_formatters: 62 | formatted_nested[key] = field_formatters[key](value) 63 | else: 64 | formatted_nested[key] = format_value_for_display(value) 65 | 66 | return formatted_nested 67 | 68 | 69 | def format_value_for_display(value: Any) -> str: 70 | """ 71 | Format a single value for display in the UI. 72 | 73 | Args: 74 | value: The value to format 75 | 76 | Returns: 77 | String representation suitable for display 78 | """ 79 | if value is None: 80 | return "N/A" 81 | 82 | if isinstance(value, (int, float)): 83 | return _format_number(value) 84 | 85 | if isinstance(value, bool): 86 | return "Yes" if value else "No" 87 | 88 | if isinstance(value, str): 89 | # Handle empty strings 90 | if not value.strip(): 91 | return "N/A" 92 | return value 93 | 94 | # For other types, convert to string 95 | return str(value) 96 | 97 | 98 | def _format_number(value: Union[str, int, float]) -> str: 99 | """ 100 | Format numbers with thousand separators. 101 | 102 | Args: 103 | value: Numeric value or string representation 104 | 105 | Returns: 106 | Formatted number string 107 | """ 108 | if isinstance(value, str): 109 | # Try to extract number from string like "1,234" or "1234" 110 | try: 111 | # Remove existing commas and convert 112 | clean_str = value.replace(",", "").strip() 113 | if clean_str.isdigit(): 114 | return f"{int(clean_str):,}" 115 | elif "." in clean_str: 116 | return f"{float(clean_str):,.2f}" 117 | else: 118 | return value # Return as-is if not numeric 119 | except (ValueError, AttributeError): 120 | return value 121 | 122 | if isinstance(value, int): 123 | return f"{value:,}" 124 | 125 | if isinstance(value, float): 126 | return f"{value:,.2f}" 127 | 128 | return str(value) 129 | 130 | 131 | def _format_size_if_bytes(value: Union[str, int]) -> str: 132 | """ 133 | Format size values, detecting if they represent bytes. 
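For example (illustrative), the integer 52428800 is rendered as "50.0 MB", while a string that already contains units, such as "1.5 GB", is returned unchanged.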
134 | 135 | Args: 136 | value: Size value that might be in bytes 137 | 138 | Returns: 139 | Formatted size string 140 | """ 141 | if isinstance(value, str): 142 | # If it already contains size units, return as-is 143 | if any(unit in value.lower() for unit in ["kb", "mb", "gb", "tb", "bytes"]): 144 | return value 145 | 146 | # Try to parse as number and format as bytes 147 | try: 148 | clean_str = value.replace(",", "").strip() 149 | if "bytes" in value.lower(): 150 | num_bytes = int(clean_str.split()[0]) 151 | return _format_bytes(num_bytes) 152 | else: 153 | return value 154 | except (ValueError, IndexError): 155 | return value 156 | 157 | if isinstance(value, int): 158 | # Assume it's bytes if it's a large integer 159 | if value > 1024: 160 | return _format_bytes(value) 161 | else: 162 | return f"{value:,}" 163 | 164 | return str(value) 165 | 166 | 167 | def _format_bytes(num_bytes: int) -> str: 168 | """ 169 | Format bytes into human-readable format. 170 | 171 | Args: 172 | num_bytes: Number of bytes 173 | 174 | Returns: 175 | Human-readable size string 176 | """ 177 | if num_bytes < 1024: 178 | return f"{num_bytes:,} bytes" 179 | elif num_bytes < 1024 ** 2: 180 | return f"{num_bytes / 1024:.1f} KB" 181 | elif num_bytes < 1024 ** 3: 182 | return f"{num_bytes / 1024 ** 2:.1f} MB" 183 | else: 184 | return f"{num_bytes / 1024 ** 3:.1f} GB" -------------------------------------------------------------------------------- /src/parqv/app.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import Optional 3 | 4 | from textual.app import App, ComposeResult, Binding 5 | from textual.containers import Container 6 | from textual.widgets import Header, Footer, Static, Label, TabbedContent, TabPane 7 | 8 | from .core import CSS_PATH, FileValidationError, validate_and_detect_file, HandlerFactory, HandlerCreationError, get_logger 9 | from .data_sources import DataHandler 10 | from .views.data_view import DataView 11 | from .views.metadata_view import MetadataView 12 | from .views.schema_view import SchemaView 13 | 14 | log = get_logger(__name__) 15 | 16 | 17 | class ParqV(App[None]): 18 | """A Textual app to visualize Parquet or JSON files.""" 19 | 20 | CSS_PATH = CSS_PATH 21 | BINDINGS = [ 22 | Binding("q", "quit", "Quit", priority=True), 23 | ] 24 | 25 | def __init__(self, file_path_str: Optional[str] = None, *args, **kwargs): 26 | """ 27 | Initialize the ParqV application. 28 | 29 | Args: 30 | file_path_str: Path to the file to visualize 31 | *args, **kwargs: Additional arguments for the Textual App 32 | """ 33 | super().__init__(*args, **kwargs) 34 | 35 | # Application state 36 | self.file_path: Optional[Path] = None 37 | self.handler: Optional[DataHandler] = None 38 | self.handler_type: Optional[str] = None 39 | self.error_message: Optional[str] = None 40 | 41 | # Initialize with file if provided 42 | if file_path_str: 43 | self._initialize_file_handler(file_path_str) 44 | 45 | def _initialize_file_handler(self, file_path_str: str) -> None: 46 | """ 47 | Initialize the file handler for the given file path. 
48 | 49 | Args: 50 | file_path_str: Path to the file to process 51 | """ 52 | try: 53 | # Validate file and detect type 54 | self.file_path, self.handler_type = validate_and_detect_file(file_path_str) 55 | 56 | # Create appropriate handler 57 | self.handler = HandlerFactory.create_handler(self.file_path, self.handler_type) 58 | 59 | log.info(f"Successfully initialized {self.handler_type} handler for: {self.file_path.name}") 60 | 61 | except (FileValidationError, HandlerCreationError) as e: 62 | self.error_message = str(e) 63 | log.error(f"Failed to initialize handler: {e}") 64 | 65 | except Exception as e: 66 | self.error_message = f"An unexpected error occurred: {e}" 67 | log.exception("Unexpected error during handler initialization") 68 | 69 | def compose(self) -> ComposeResult: 70 | """Compose the UI layout.""" 71 | yield Header() 72 | 73 | if self.error_message: 74 | log.debug(f"Displaying error message: {self.error_message}") 75 | yield Container( 76 | Label("Error Loading File:", classes="error-title"), 77 | Static(self.error_message, classes="error-content"), 78 | id="error-container" 79 | ) 80 | elif self.handler: 81 | log.debug(f"Composing main layout with TabbedContent for {self.handler_type} handler.") 82 | with TabbedContent(id="main-tabs"): 83 | yield TabPane("Metadata", MetadataView(id="metadata-view"), id="tab-metadata") 84 | yield TabPane("Schema", SchemaView(id="schema-view"), id="tab-schema") 85 | yield TabPane("Data Preview", DataView(id="data-view"), id="tab-data") 86 | else: 87 | log.warning("No handler available and no error message set") 88 | yield Container( 89 | Label("No file loaded.", classes="error-title"), 90 | Static("Please provide a valid file path.", classes="error-content"), 91 | id="no-file-container" 92 | ) 93 | 94 | yield Footer() 95 | 96 | def on_mount(self) -> None: 97 | """Handle app mount event - set up header information.""" 98 | log.debug("App mounted.") 99 | self._update_header() 100 | 101 | def _update_header(self) -> None: 102 | """Update the header with file and format information.""" 103 | try: 104 | header = self.query_one(Header) 105 | 106 | if self.handler and self.file_path and self.handler_type: 107 | display_name = self.file_path.name 108 | format_name = self.handler_type.capitalize() 109 | header.title = f"parqv - {display_name}" 110 | header.sub_title = f"Format: {format_name}" 111 | elif self.error_message: 112 | header.title = "parqv - Error" 113 | header.sub_title = "Failed to load file" 114 | else: 115 | header.title = "parqv" 116 | header.sub_title = "File Viewer" 117 | 118 | except Exception as e: 119 | log.error(f"Failed to update header: {e}") 120 | 121 | def action_quit(self) -> None: 122 | """Handle quit action - cleanup and exit.""" 123 | log.info("Quit action triggered.") 124 | self._cleanup() 125 | self.exit() 126 | 127 | def _cleanup(self) -> None: 128 | """Clean up resources before exit.""" 129 | if self.handler: 130 | try: 131 | self.handler.close() 132 | log.info("Handler closed successfully.") 133 | except Exception as e: 134 | log.error(f"Error during handler cleanup: {e}") 135 | 136 | 137 | # For backward compatibility, keep the old CLI entry point 138 | def run_app(): 139 | """ 140 | Legacy CLI entry point for backward compatibility. 141 | 142 | Note: New code should use parqv.cli.run_app() instead. 143 | """ 144 | from .cli import run_app as new_run_app 145 | log.warning("Using legacy run_app(). 
Consider importing from parqv.cli instead.") 146 | new_run_app() 147 | 148 | 149 | if __name__ == "__main__": 150 | run_app() 151 | -------------------------------------------------------------------------------- /src/parqv/views/utils/visualization.py: -------------------------------------------------------------------------------- 1 | """ 2 | Visualization utilities for parqv views. 3 | 4 | Provides text-based data visualization functions like ASCII histograms. 5 | """ 6 | import math 7 | from typing import List, Union, Optional 8 | 9 | TICK_CHARS = [' ', '▂', '▃', '▄', '▅', '▆', '▇', '█'] 10 | 11 | 12 | def create_text_histogram( 13 | data: List[Union[int, float]], 14 | bins: int = 15, 15 | width: int = 60, 16 | height: int = 8, 17 | title: Optional[str] = None 18 | ) -> List[str]: 19 | """ 20 | Create a professional, text-based histogram from numerical data. 21 | 22 | Args: 23 | data: List of numerical values. 24 | bins: The number of bins for the histogram. 25 | width: The total character width of the output histogram. 26 | height: The maximum height of the histogram bars in lines. 27 | title: An optional title for the histogram. 28 | 29 | Returns: 30 | A list of strings representing the histogram, ready for printing. 31 | """ 32 | if not data: 33 | return ["(No data available for histogram)"] 34 | 35 | # 1. Sanitize the input data 36 | clean_data = [float(val) for val in data if isinstance(val, (int, float)) and math.isfinite(val)] 37 | 38 | if not clean_data: 39 | return ["(No valid numerical data to plot)"] 40 | 41 | min_val, max_val = min(clean_data), max(clean_data) 42 | 43 | if min_val == max_val: 44 | return [f"(All values are identical: {_format_number(min_val)})"] 45 | 46 | # 2. Create bins and count frequencies 47 | # Add a small epsilon to the range to ensure max_val falls into the last bin 48 | epsilon = (max_val - min_val) / 1e9 49 | value_range = (max_val - min_val) + epsilon 50 | bin_width = value_range / bins 51 | 52 | bin_counts = [0] * bins 53 | for value in clean_data: 54 | bin_index = int((value - min_val) / bin_width) 55 | bin_counts[bin_index] += 1 56 | 57 | # 3. Render the histogram 58 | return _render_histogram( 59 | bin_counts=bin_counts, 60 | min_val=min_val, 61 | max_val=max_val, 62 | width=width, 63 | height=height, 64 | title=title 65 | ) 66 | 67 | 68 | def _render_histogram( 69 | bin_counts: List[int], 70 | min_val: float, 71 | max_val: float, 72 | width: int, 73 | height: int, 74 | title: Optional[str] 75 | ) -> List[str]: 76 | """ 77 | Internal function to render the histogram components into ASCII art. 78 | """ 79 | lines = [] 80 | if title: 81 | lines.append(title.center(width)) 82 | 83 | max_count = max(bin_counts) if bin_counts else 0 84 | if max_count == 0: 85 | return lines + ["(No data falls within histogram bins)"] 86 | 87 | # --- Layout Calculations --- 88 | y_axis_width = len(str(max_count)) 89 | plot_width = width - y_axis_width - 3 # Reserve space for "| " and axis 90 | if plot_width <= 0: 91 | return ["(Terminal width too narrow to draw histogram)"] 92 | 93 | # Resample the data bins to fit the available plot_width. 94 | # This stretches or shrinks the histogram to match the screen space. 
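    # Worked example of the mapping used below (illustrative numbers, not from the
    # source): with num_data_bins = 15 and plot_width = 45, screen column i = 30
    # reads data bin int(30 * 15 / 45) = 10; if plot_width were only 10, column 9
    # would read bin int(9 * 15 / 10) = 13, so some data bins are skipped when shrinking.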
95 | display_bins = [] 96 | num_data_bins = len(bin_counts) 97 | for i in range(plot_width): 98 | # Find the corresponding data bin for this screen column 99 | data_bin_index = int(i * num_data_bins / plot_width) 100 | display_bins.append(bin_counts[data_bin_index]) 101 | 102 | # --- Y-Axis and Bars (Top to Bottom) --- 103 | for row in range(height, -1, -1): 104 | line = "" 105 | # Y-axis labels 106 | if row == height: 107 | line += f"{max_count:<{y_axis_width}} | " 108 | elif row == 0: 109 | line += f"{0:<{y_axis_width}} +-" 110 | else: 111 | line += " " * y_axis_width + " | " 112 | 113 | # Bars - now iterate over the resampled display_bins 114 | for count in display_bins: 115 | # Scale current count to the available height 116 | scaled_height = (count / max_count) * height 117 | 118 | # Determine character based on height relative to current row 119 | if scaled_height >= row: 120 | line += TICK_CHARS[-1] # Full block for the solid part of the bar 121 | elif scaled_height > row - 1: 122 | # This is the top of the bar, use a partial character 123 | partial_index = int((scaled_height - row + 1) * (len(TICK_CHARS) - 1)) 124 | line += TICK_CHARS[max(0, partial_index)] 125 | elif row == 0: 126 | line += "-" # X-axis line 127 | else: 128 | line += " " # Empty space above the bar 129 | 130 | lines.append(line) 131 | 132 | # --- X-Axis Labels --- 133 | x_axis_labels = _create_x_axis_labels(min_val, max_val, plot_width) 134 | label_line = " " * (y_axis_width + 3) + x_axis_labels 135 | lines.append(label_line) 136 | 137 | return lines 138 | 139 | 140 | def _create_x_axis_labels(min_val: float, max_val: float, plot_width: int) -> str: 141 | """Create a formatted string for the X-axis labels.""" 142 | min_label = _format_number(min_val) 143 | max_label = _format_number(max_val) 144 | 145 | available_width = plot_width - len(min_label) - len(max_label) 146 | 147 | if available_width < 4: 148 | return f"{min_label}{' ' * (plot_width - len(min_label) - len(max_label))}{max_label}" 149 | 150 | mid_val = (min_val + max_val) / 2 151 | mid_label = _format_number(mid_val) 152 | 153 | spacing1 = (plot_width // 2) - len(min_label) - (len(mid_label) // 2) 154 | spacing2 = (plot_width - (plot_width // 2)) - (len(mid_label) - (len(mid_label) // 2)) - len(max_label) 155 | 156 | if spacing1 < 1 or spacing2 < 1: 157 | return f"{min_label}{' ' * (plot_width - len(min_label) - len(max_label))}{max_label}" 158 | 159 | return f"{min_label}{' ' * spacing1}{mid_label}{' ' * spacing2}{max_label}" 160 | 161 | 162 | def _format_number(value: float) -> str: 163 | """Format a number nicely for display on an axis.""" 164 | if abs(value) < 1e-4 and value != 0: 165 | return f"{value:.1e}" 166 | if abs(value) >= 1e5: 167 | return f"{value:.1e}" 168 | if math.isclose(value, int(value)): 169 | return str(int(value)) 170 | if abs(value) < 10: 171 | return f"{value:.2f}" 172 | if abs(value) < 100: 173 | return f"{value:.1f}" 174 | return str(int(value)) 175 | 176 | 177 | def should_show_histogram(data_type: str, distinct_count: int, total_count: int) -> bool: 178 | """ 179 | Determine if a histogram should be shown for this data. 180 | This function uses a set of heuristics to decide if the data is 181 | continuous enough to warrant a histogram visualization. 182 | """ 183 | # 1. Type Check: Histograms are only meaningful for numeric data. 184 | if 'numeric' not in data_type and 'integer' not in data_type and 'float' not in data_type: 185 | return False 186 | 187 | # 2. 
Data Volume Check: Don't render if there's too little data or no variation. 188 | if total_count < 20 or distinct_count <= 1: 189 | return False 190 | 191 | # 3. Categorical Data Filter: If the number of distinct values is very low, 192 | # treat it as categorical data (e.g., ratings from 1-10, months 1-12). 193 | if distinct_count < 15: 194 | return False 195 | 196 | # 4. High Cardinality Filter: If almost every value is unique (like an ID or index), 197 | # a histogram is not useful as most bars would have a height of 1. 198 | distinct_ratio = distinct_count / total_count 199 | if distinct_ratio > 0.95: 200 | return False 201 | 202 | # 5. Pass: If the data passes all the above filters, it is considered 203 | # sufficiently continuous to be visualized with a histogram. 204 | return True 205 | -------------------------------------------------------------------------------- /src/parqv/views/utils/stats_formatters.py: -------------------------------------------------------------------------------- 1 | """ 2 | Statistics formatting utilities for parqv views. 3 | """ 4 | 5 | from typing import Any, Dict, List, Union 6 | 7 | from rich.text import Text 8 | 9 | from .visualization import create_text_histogram, should_show_histogram 10 | 11 | 12 | def format_stats_for_display(stats_data: Dict[str, Any]) -> List[Union[str, Text]]: 13 | """ 14 | Format statistics dictionary for display as lines of rich text. 15 | 16 | Args: 17 | stats_data: Raw statistics dictionary from handler 18 | 19 | Returns: 20 | List of formatted lines ready for display 21 | """ 22 | if not stats_data: 23 | return [Text.from_markup("[red]No statistics data available.[/red]")] 24 | 25 | lines: List[Union[str, Text]] = [] 26 | 27 | # Extract basic column information 28 | col_name = stats_data.get("column", "N/A") 29 | col_type = stats_data.get("type", "Unknown") 30 | nullable_val = stats_data.get("nullable") 31 | 32 | # Format column header 33 | lines.extend(_format_column_header(col_name, col_type, nullable_val)) 34 | 35 | # Handle calculation errors 36 | calc_error = stats_data.get("error") 37 | if calc_error: 38 | lines.extend(_format_error_section(calc_error)) 39 | 40 | # Add informational messages 41 | message = stats_data.get("message") 42 | if message: 43 | lines.extend(_format_message_section(message)) 44 | 45 | # Format calculated statistics 46 | calculated = stats_data.get("calculated") 47 | if calculated: 48 | lines.extend(_format_calculated_stats(calculated, has_error=bool(calc_error))) 49 | 50 | return lines 51 | 52 | 53 | def format_column_info(column_name: str, column_type: str, nullable: Any) -> List[Union[str, Text]]: 54 | """ 55 | Format basic column information for display. 
56 | 57 | Args: 58 | column_name: Name of the column 59 | column_type: Type of the column 60 | nullable: Nullability information 61 | 62 | Returns: 63 | List of formatted lines for column info 64 | """ 65 | return _format_column_header(column_name, column_type, nullable) 66 | 67 | 68 | def _format_column_header(col_name: str, col_type: str, nullable_val: Any) -> List[Union[str, Text]]: 69 | """Format the column header section.""" 70 | # Determine nullability display 71 | if nullable_val is True: 72 | nullable_str = "Nullable" 73 | elif nullable_val is False: 74 | nullable_str = "Required" 75 | else: 76 | nullable_str = "Unknown Nullability" 77 | 78 | lines = [ 79 | Text.assemble(("Column: ", "bold"), f"`{col_name}`"), 80 | Text.assemble(("Type: ", "bold"), f"{col_type} ({nullable_str})"), 81 | "─" * (len(col_name) + len(col_type) + 20) 82 | ] 83 | 84 | return lines 85 | 86 | 87 | def _format_error_section(calc_error: str) -> List[Union[str, Text]]: 88 | """Format the error section.""" 89 | return [ 90 | Text("Calculation Error:", style="bold red"), 91 | f"```\n{calc_error}\n```", 92 | "" 93 | ] 94 | 95 | 96 | def _format_message_section(message: str) -> List[Union[str, Text]]: 97 | """Format the informational message section.""" 98 | return [ 99 | Text(f"Info: {message}", style="italic cyan"), 100 | "" 101 | ] 102 | 103 | 104 | def _format_calculated_stats(calculated: Dict[str, Any], has_error: bool = False) -> List[Union[str, Text]]: 105 | """Format the calculated statistics section.""" 106 | lines = [Text("Calculated Statistics:", style="bold")] 107 | 108 | # Define the order of statistics to display 109 | stats_order = [ 110 | "Total Count", "Valid Count", "Null Count", "Null Percentage", 111 | "Distinct Count", "Distinct Values (Approx)", 112 | "Min", "Max", "Mean", "Median (50%)", "StdDev", "Variance", 113 | "True Count", "False Count", 114 | "Value Counts" 115 | ] 116 | 117 | found_stats = False 118 | 119 | for key in stats_order: 120 | if key in calculated: 121 | found_stats = True 122 | value = calculated[key] 123 | lines.extend(_format_single_stat(key, value)) 124 | 125 | # Add any additional stats not in the predefined order (excluding internal histogram data) 126 | for key, value in calculated.items(): 127 | if key not in stats_order and not key.startswith('_'): # Skip internal fields 128 | found_stats = True 129 | lines.extend(_format_single_stat(key, value)) 130 | 131 | # Handle case where no stats were found 132 | if not found_stats and not has_error: 133 | lines.append(Text(" (No specific stats calculated for this type)", style="dim")) 134 | 135 | # Add histogram visualization for numeric data 136 | if "_histogram_data" in calculated and "_data_type" in calculated: 137 | if calculated["_data_type"] == "numeric": 138 | lines.extend(_format_histogram_visualization(calculated)) 139 | 140 | return lines 141 | 142 | 143 | def _format_single_stat(key: str, value: Any) -> List[Union[str, Text]]: 144 | """Format a single statistic entry.""" 145 | lines = [] 146 | 147 | if key == "Value Counts" and isinstance(value, dict): 148 | lines.append(f" - {key}:") 149 | for sub_key, sub_val in value.items(): 150 | sub_val_str = _format_stat_value(sub_val) 151 | lines.append(f" - {sub_key}: {sub_val_str}") 152 | else: 153 | formatted_value = _format_stat_value(value) 154 | lines.append(f" - {key}: {formatted_value}") 155 | 156 | return lines 157 | 158 | 159 | def _format_stat_value(value: Any) -> str: 160 | """Format a single statistic value.""" 161 | if isinstance(value, (int, float)): 162 | 
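        # Integers are shown with thousands separators (e.g. 1234567 -> "1,234,567");
        # floats are rendered with four decimal places (e.g. 3.14159 -> "3.1416").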
if isinstance(value, int): 163 | return f"{value:,}" 164 | else: 165 | return f"{value:,.4f}" 166 | else: 167 | return str(value) 168 | 169 | 170 | def _format_histogram_visualization(calculated: Dict[str, Any]) -> List[Union[str, Text]]: 171 | """Format histogram visualization for numeric data.""" 172 | lines = [] 173 | 174 | try: 175 | histogram_data = calculated.get("_histogram_data", []) 176 | if not histogram_data: 177 | return lines 178 | 179 | # Check if we should show histogram 180 | distinct_count_str = calculated.get("Distinct Count", "0") 181 | try: 182 | # Remove commas and convert to int 183 | distinct_count = int(distinct_count_str.replace(",", "")) 184 | except (ValueError, AttributeError): 185 | distinct_count = len(set(histogram_data)) 186 | 187 | total_count = len(histogram_data) 188 | 189 | if should_show_histogram("numeric", distinct_count, total_count): 190 | lines.append("") 191 | lines.append(Text("Data Distribution:", style="bold cyan")) 192 | 193 | # Create histogram 194 | histogram_lines = create_text_histogram( 195 | data=histogram_data, 196 | bins=15, 197 | width=50, 198 | height=8, 199 | title=None 200 | ) 201 | 202 | # Add each histogram line 203 | for line in histogram_lines: 204 | if isinstance(line, str): 205 | lines.append(f" {line}") 206 | else: 207 | lines.append(line) 208 | else: 209 | # For discrete data, show a note 210 | if distinct_count < total_count * 0.1: # Less than 10% unique values 211 | lines.append("") 212 | lines.append(Text("Note: Data appears to be discrete/categorical", style="dim italic")) 213 | lines.append(Text("(Histogram not shown for discrete values)", style="dim italic")) 214 | 215 | except Exception as e: 216 | # Don't fail the whole stats display if histogram fails 217 | lines.append("") 218 | lines.append(Text(f"Note: Could not generate histogram: {e}", style="dim red")) 219 | 220 | return lines 221 | -------------------------------------------------------------------------------- /src/parqv/views/schema_view.py: -------------------------------------------------------------------------------- 1 | """ 2 | Schema view for displaying column schema and statistics. 3 | """ 4 | 5 | from typing import Dict, Any, Optional, List 6 | 7 | from rich.text import Text 8 | from textual.app import ComposeResult 9 | from textual.containers import VerticalScroll, Container, Horizontal 10 | from textual.reactive import var 11 | from textual.widgets import Static, ListView, ListItem, Label, LoadingIndicator 12 | 13 | from .base import BaseView 14 | from .utils import format_stats_for_display 15 | 16 | 17 | class ColumnListItem(ListItem): 18 | """A ListItem that stores the column name for schema display.""" 19 | 20 | def __init__(self, column_name: str) -> None: 21 | # Ensure IDs are CSS-safe (replace spaces, etc.) 22 | safe_id_name = "".join(c if c.isalnum() else '_' for c in column_name) 23 | super().__init__(Label(column_name), name=column_name, id=f"col-item-{safe_id_name}") 24 | self.column_name = column_name 25 | 26 | 27 | class SchemaView(BaseView): 28 | """ 29 | View for displaying schema information and column statistics. 30 | 31 | Shows a list of columns on the left and detailed statistics 32 | for the selected column on the right. 33 | """ 34 | 35 | DEFAULT_STATS_MESSAGE = "Select a column from the list to view its statistics." 
36 | 37 | # Reactive variable for loading state 38 | loading = var(False) 39 | 40 | def __init__(self, **kwargs): 41 | super().__init__(**kwargs) 42 | self._columns_data: Optional[List[Dict[str, Any]]] = None 43 | self._current_column: Optional[str] = None 44 | 45 | def compose(self) -> ComposeResult: 46 | """Compose the schema view layout.""" 47 | with Horizontal(): 48 | # Left side: Column list 49 | with Container(id="column-list-container", classes="column-list"): 50 | yield Static("Columns", classes="section-title") 51 | yield ListView(id="column-list-view") 52 | 53 | # Right side: Column statistics 54 | with Container(id="stats-container", classes="column-stats"): 55 | yield Static("Column Statistics", classes="section-title") 56 | with VerticalScroll(id="schema-stats-scroll"): 57 | yield Container(id="schema-stats-content") 58 | yield LoadingIndicator(id="schema-loading-indicator") 59 | 60 | def load_content(self) -> None: 61 | """Load schema content.""" 62 | if not self.check_handler_available(): 63 | return 64 | 65 | try: 66 | # Load column list 67 | self._load_column_list() 68 | 69 | # Display default message in stats area 70 | self._display_default_message() 71 | 72 | self.logger.info("Schema loaded successfully") 73 | 74 | except Exception as e: 75 | self.show_error("Failed to load schema", e) 76 | 77 | def _load_column_list(self) -> None: 78 | """Load the list of columns from the data handler.""" 79 | try: 80 | list_view = self.query_one("#column-list-view", ListView) 81 | list_view.clear() 82 | 83 | # Get schema data from handler 84 | self._columns_data = self.handler.get_schema_data() 85 | self.logger.debug(f"Received schema data: {self._columns_data}") 86 | 87 | if self._columns_data is None: 88 | self._show_list_error("Could not load schema data") 89 | return 90 | 91 | if not self._columns_data: 92 | self._show_list_warning("Schema has no columns") 93 | return 94 | 95 | # Populate column list 96 | column_count = 0 97 | for col_info in self._columns_data: 98 | column_name = col_info.get("name") 99 | if column_name: 100 | list_view.append(ColumnListItem(column_name)) 101 | column_count += 1 102 | else: 103 | self.logger.warning("Found column info without a 'name' key") 104 | 105 | self.logger.info(f"Populated column list with {column_count} columns") 106 | 107 | except Exception as e: 108 | self.logger.exception("Error loading column list") 109 | self._show_list_error(f"Error loading schema: {e}") 110 | 111 | def _show_list_error(self, message: str) -> None: 112 | """Show error message in the column list.""" 113 | try: 114 | list_view = self.query_one("#column-list-view", ListView) 115 | list_view.clear() 116 | list_view.append(ListItem(Label(f"[red]{message}[/red]"))) 117 | except Exception as e: 118 | self.logger.error(f"Failed to show list error: {e}") 119 | 120 | def _show_list_warning(self, message: str) -> None: 121 | """Show warning message in the column list.""" 122 | try: 123 | list_view = self.query_one("#column-list-view", ListView) 124 | list_view.clear() 125 | list_view.append(ListItem(Label(f"[yellow]{message}[/yellow]"))) 126 | except Exception as e: 127 | self.logger.error(f"Failed to show list warning: {e}") 128 | 129 | def _display_default_message(self) -> None: 130 | """Display the initial message in the stats area.""" 131 | try: 132 | stats_container = self.query_one("#schema-stats-content", Container) 133 | stats_container.query("*").remove() 134 | stats_container.mount(Static(self.DEFAULT_STATS_MESSAGE, classes="stats-line")) 135 | except Exception as e: 
136 | self.logger.error(f"Failed to display default stats message: {e}") 137 | 138 | def on_list_view_selected(self, event: ListView.Selected) -> None: 139 | """Handle column selection from the list.""" 140 | if hasattr(event.item, 'column_name'): 141 | column_name = event.item.column_name 142 | self._current_column = column_name 143 | self._load_column_stats(column_name) 144 | else: 145 | self.logger.warning("Selected item does not have column_name attribute") 146 | 147 | def _load_column_stats(self, column_name: str) -> None: 148 | """ 149 | Load and display statistics for the selected column. 150 | 151 | Args: 152 | column_name: Name of the column to analyze 153 | """ 154 | if not self.handler: 155 | self._show_stats_error("Data handler not available") 156 | return 157 | 158 | try: 159 | # Set loading state 160 | self.loading = True 161 | 162 | # Get column statistics 163 | self.logger.debug(f"Loading stats for column: {column_name}") 164 | raw_stats = self.handler.get_column_stats(column_name) 165 | 166 | # Format stats for display 167 | formatted_lines = format_stats_for_display(raw_stats) 168 | 169 | # Display the formatted stats 170 | self._display_column_stats(formatted_lines) 171 | 172 | except Exception as e: 173 | self.logger.exception(f"Error loading stats for column {column_name}") 174 | self._show_stats_error(f"Failed to load statistics: {e}") 175 | finally: 176 | self.loading = False 177 | 178 | def _display_column_stats(self, formatted_lines: List) -> None: 179 | """ 180 | Display formatted column statistics. 181 | 182 | Args: 183 | formatted_lines: List of formatted text lines to display 184 | """ 185 | try: 186 | stats_container = self.query_one("#schema-stats-content", Container) 187 | stats_container.query("*").remove() 188 | 189 | for line in formatted_lines: 190 | if isinstance(line, Text): 191 | stats_container.mount(Static(line, classes="stats-line")) 192 | else: 193 | stats_container.mount(Static(str(line), classes="stats-line")) 194 | 195 | except Exception as e: 196 | self.logger.error(f"Failed to display column stats: {e}") 197 | self._show_stats_error("Failed to display statistics") 198 | 199 | def _show_stats_error(self, message: str) -> None: 200 | """Show error message in the stats area.""" 201 | try: 202 | stats_container = self.query_one("#schema-stats-content", Container) 203 | stats_container.query("*").remove() 204 | stats_container.mount(Static(f"[red]Error: {message}[/red]", classes="error-content")) 205 | except Exception as e: 206 | self.logger.error(f"Failed to show stats error: {e}") 207 | 208 | def watch_loading(self, loading: bool) -> None: 209 | """React to changes in the loading state.""" 210 | try: 211 | loading_indicator = self.query_one("#schema-loading-indicator", LoadingIndicator) 212 | stats_scroll = self.query_one("#schema-stats-scroll", VerticalScroll) 213 | 214 | if loading: 215 | loading_indicator.display = True 216 | stats_scroll.display = False 217 | else: 218 | loading_indicator.display = False 219 | stats_scroll.display = True 220 | 221 | except Exception as e: 222 | self.logger.error(f"Error updating loading state: {e}") 223 | 224 | def refresh_schema(self) -> None: 225 | """Refresh the schema display.""" 226 | self._current_column = None 227 | self.clear_content() 228 | self.load_content() 229 | 230 | def get_current_column(self) -> Optional[str]: 231 | """Get the currently selected column name.""" 232 | return self._current_column 233 | 234 | def get_columns_data(self) -> Optional[List[Dict[str, Any]]]: 235 | """Get the 
current columns data.""" 236 | return self._columns_data 237 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 
61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 
179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. -------------------------------------------------------------------------------- /src/parqv/data_sources/formats/csv.py: -------------------------------------------------------------------------------- 1 | """ 2 | CSV file handler for parqv data sources. 3 | """ 4 | 5 | from pathlib import Path 6 | from typing import Any, Dict, List, Optional 7 | 8 | import pandas as pd 9 | 10 | from ..base import DataHandler, DataHandlerError 11 | 12 | 13 | class CsvHandlerError(DataHandlerError): 14 | """Custom exception for CSV handling errors.""" 15 | pass 16 | 17 | 18 | class CsvHandler(DataHandler): 19 | """ 20 | Handles CSV file interactions using pandas. 21 | 22 | Provides methods to access metadata, schema, data preview, and column statistics 23 | for CSV files using pandas DataFrame operations. 24 | """ 25 | 26 | def __init__(self, file_path: Path): 27 | """ 28 | Initialize the CsvHandler by validating the path and reading the CSV file. 29 | 30 | Args: 31 | file_path: Path to the CSV file. 32 | 33 | Raises: 34 | CsvHandlerError: If the file is not found, not a file, or cannot be read. 
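        Example (illustrative usage; the path is hypothetical):
            handler = CsvHandler(Path("data/example.csv"))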
35 | """ 36 | super().__init__(file_path) 37 | self.df: Optional[pd.DataFrame] = None 38 | self._original_dtypes: Optional[Dict[str, str]] = None 39 | 40 | try: 41 | # Validate file existence 42 | if not self.file_path.is_file(): 43 | raise FileNotFoundError(f"CSV file not found or is not a regular file: {self.file_path}") 44 | 45 | # Read the CSV file with pandas 46 | self._read_csv_file() 47 | 48 | self.logger.info(f"Successfully initialized CsvHandler for: {self.file_path.name}") 49 | 50 | except FileNotFoundError as fnf_e: 51 | self.logger.error(f"File not found during CsvHandler initialization: {fnf_e}") 52 | raise CsvHandlerError(str(fnf_e)) from fnf_e 53 | except pd.errors.EmptyDataError as empty_e: 54 | self.logger.error(f"CSV file is empty: {empty_e}") 55 | raise CsvHandlerError(f"CSV file '{self.file_path.name}' is empty") from empty_e 56 | except pd.errors.ParserError as parse_e: 57 | self.logger.error(f"CSV parsing error: {parse_e}") 58 | raise CsvHandlerError(f"Failed to parse CSV file '{self.file_path.name}': {parse_e}") from parse_e 59 | except Exception as e: 60 | self.logger.exception(f"Unexpected error initializing CsvHandler for {self.file_path.name}") 61 | raise CsvHandlerError(f"Failed to initialize CSV handler '{self.file_path.name}': {e}") from e 62 | 63 | def _read_csv_file(self) -> None: 64 | """Read the CSV file using pandas with appropriate settings.""" 65 | try: 66 | # Read CSV with automatic type inference 67 | self.df = pd.read_csv( 68 | self.file_path, 69 | # Basic settings 70 | encoding='utf-8', 71 | # Handle various separators automatically 72 | sep=None, # Let pandas auto-detect 73 | engine='python', # More flexible parsing 74 | # Preserve original string representation for better type info 75 | dtype=str, # Read everything as string first 76 | na_values=['', 'NULL', 'null', 'None', 'N/A', 'n/a', 'NaN', 'nan'], 77 | keep_default_na=True, 78 | ) 79 | 80 | # Store original dtypes before conversion 81 | self._original_dtypes = {col: 'string' for col in self.df.columns} 82 | 83 | # Try to infer better types 84 | self._infer_types() 85 | 86 | self.logger.debug(f"Successfully read CSV with shape: {self.df.shape}") 87 | 88 | except UnicodeDecodeError: 89 | # Try with different encodings 90 | for encoding in ['latin1', 'cp1252', 'iso-8859-1']: 91 | try: 92 | self.logger.warning(f"Trying encoding: {encoding}") 93 | self.df = pd.read_csv( 94 | self.file_path, 95 | encoding=encoding, 96 | sep=None, 97 | engine='python', 98 | dtype=str, 99 | na_values=['', 'NULL', 'null', 'None', 'N/A', 'n/a', 'NaN', 'nan'], 100 | keep_default_na=True, 101 | ) 102 | self._original_dtypes = {col: 'string' for col in self.df.columns} 103 | self._infer_types() 104 | self.logger.info(f"Successfully read CSV with encoding: {encoding}") 105 | break 106 | except UnicodeDecodeError: 107 | continue 108 | else: 109 | raise CsvHandlerError(f"Could not decode CSV file with any common encoding") 110 | 111 | def _infer_types(self) -> None: 112 | """Infer appropriate data types for columns.""" 113 | if self.df is None: 114 | return 115 | 116 | for col in self.df.columns: 117 | # Try to convert to numeric 118 | numeric_converted = pd.to_numeric(self.df[col], errors='coerce') 119 | if not numeric_converted.isna().all(): 120 | # If most values can be converted to numeric, use numeric type 121 | non_na_original = self.df[col].notna().sum() 122 | non_na_converted = numeric_converted.notna().sum() 123 | 124 | if non_na_converted / max(non_na_original, 1) > 0.8: # 80% conversion success 125 | self.df[col] 
= numeric_converted 126 | if (numeric_converted == numeric_converted.astype('Int64', errors='ignore')).all(): 127 | self._original_dtypes[col] = 'integer' 128 | else: 129 | self._original_dtypes[col] = 'float' 130 | continue 131 | 132 | # Try to convert to datetime 133 | try: 134 | datetime_converted = pd.to_datetime(self.df[col], errors='coerce', infer_datetime_format=True) 135 | if not datetime_converted.isna().all(): 136 | non_na_original = self.df[col].notna().sum() 137 | non_na_converted = datetime_converted.notna().sum() 138 | 139 | if non_na_converted / max(non_na_original, 1) > 0.8: # 80% conversion success 140 | self.df[col] = datetime_converted 141 | self._original_dtypes[col] = 'datetime' 142 | continue 143 | except (ValueError, TypeError): 144 | pass 145 | 146 | # Try to convert to boolean 147 | bool_values = self.df[col].str.lower().isin(['true', 'false', 't', 'f', '1', '0', 'yes', 'no', 'y', 'n']) 148 | if bool_values.sum() / len(self.df[col]) > 0.8: 149 | bool_mapping = { 150 | 'true': True, 'false': False, 't': True, 'f': False, 151 | '1': True, '0': False, 'yes': True, 'no': False, 152 | 'y': True, 'n': False 153 | } 154 | self.df[col] = self.df[col].str.lower().map(bool_mapping) 155 | self._original_dtypes[col] = 'boolean' 156 | continue 157 | 158 | # Keep as string 159 | self._original_dtypes[col] = 'string' 160 | 161 | def close(self) -> None: 162 | """Close and cleanup resources (CSV data is held in memory).""" 163 | if self.df is not None: 164 | self.logger.info(f"Closed CSV handler for: {self.file_path.name}") 165 | self.df = None 166 | self._original_dtypes = None 167 | 168 | def get_metadata_summary(self) -> Dict[str, Any]: 169 | """ 170 | Get a summary dictionary of the CSV file's metadata. 171 | 172 | Returns: 173 | A dictionary containing metadata like file path, format, row count, columns, size. 
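        Example (illustrative shape only; values depend on the file):
            {
                "File Information": {"Path": "...", "Format": "CSV", "Size": "1.2 MB"},
                "Data Structure": {"Total Rows": "891", "Total Columns": "12",
                                   "Memory Usage": "123,456 bytes"},
                "Column Types Summary": {"Text Columns": "5", "Integer Columns": "4"},
            }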
174 | """ 175 | if self.df is None: 176 | return {"error": "CSV data not loaded or handler closed."} 177 | 178 | try: 179 | file_size = self.file_path.stat().st_size 180 | size_str = self.format_size(file_size) 181 | except Exception as e: 182 | self.logger.warning(f"Could not get file size for {self.file_path}: {e}") 183 | size_str = "N/A" 184 | 185 | # Create a well-structured metadata summary 186 | summary = { 187 | "File Information": { 188 | "Path": str(self.file_path), 189 | "Format": "CSV", 190 | "Size": size_str 191 | }, 192 | "Data Structure": { 193 | "Total Rows": f"{len(self.df):,}", 194 | "Total Columns": f"{len(self.df.columns):,}", 195 | "Memory Usage": f"{self.df.memory_usage(deep=True).sum():,} bytes" 196 | }, 197 | "Column Types Summary": self._get_column_types_summary() 198 | } 199 | 200 | return summary 201 | 202 | def _get_column_types_summary(self) -> Dict[str, int]: 203 | """Get a summary of column types in the CSV data.""" 204 | if self.df is None or self._original_dtypes is None: 205 | return {} 206 | 207 | type_counts = {} 208 | for col_type in self._original_dtypes.values(): 209 | type_counts[col_type] = type_counts.get(col_type, 0) + 1 210 | 211 | # Format for better display 212 | formatted_summary = {} 213 | type_labels = { 214 | 'string': 'Text Columns', 215 | 'integer': 'Integer Columns', 216 | 'float': 'Numeric Columns', 217 | 'datetime': 'Date/Time Columns', 218 | 'boolean': 'Boolean Columns' 219 | } 220 | 221 | for type_key, count in type_counts.items(): 222 | label = type_labels.get(type_key, f'{type_key.title()} Columns') 223 | formatted_summary[label] = f"{count:,}" 224 | 225 | return formatted_summary 226 | 227 | def get_schema_data(self) -> Optional[List[Dict[str, Any]]]: 228 | """ 229 | Get the schema of the CSV data. 230 | 231 | Returns: 232 | A list of dictionaries describing columns (name, type, nullable), 233 | or None if schema couldn't be determined. 234 | """ 235 | if self.df is None: 236 | self.logger.warning("DataFrame is not available for schema data") 237 | return None 238 | 239 | schema_list = [] 240 | 241 | for col in self.df.columns: 242 | try: 243 | # Get the inferred type 244 | col_type = self._original_dtypes.get(col, 'string') 245 | 246 | # Check for null values 247 | has_nulls = self.df[col].isna().any() 248 | 249 | schema_list.append({ 250 | "name": str(col), 251 | "type": col_type, 252 | "nullable": bool(has_nulls) 253 | }) 254 | 255 | except Exception as e: 256 | self.logger.error(f"Error processing column '{col}' for schema data: {e}") 257 | schema_list.append({ 258 | "name": str(col), 259 | "type": f"[Error: {e}]", 260 | "nullable": None 261 | }) 262 | 263 | return schema_list 264 | 265 | def get_data_preview(self, num_rows: int = 50) -> Optional[pd.DataFrame]: 266 | """ 267 | Fetch a preview of the data. 268 | 269 | Args: 270 | num_rows: The maximum number of rows to fetch. 271 | 272 | Returns: 273 | A pandas DataFrame with preview data, an empty DataFrame if no data, 274 | or a DataFrame with an 'error' column on failure. 
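        Example (illustrative):
            preview = handler.get_data_preview(num_rows=10)  # first 10 rows as a DataFrame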
275 | """ 276 | if self.df is None: 277 | self.logger.warning("CSV data not available for preview") 278 | return pd.DataFrame({"error": ["CSV data not loaded or handler closed."]}) 279 | 280 | try: 281 | if self.df.empty: 282 | self.logger.info("CSV file has no data rows") 283 | return pd.DataFrame(columns=self.df.columns) 284 | 285 | # Return first num_rows 286 | preview_df = self.df.head(num_rows).copy() 287 | self.logger.info(f"Generated preview of {len(preview_df)} rows for {self.file_path.name}") 288 | return preview_df 289 | 290 | except Exception as e: 291 | self.logger.exception(f"Error generating data preview from CSV file: {self.file_path.name}") 292 | return pd.DataFrame({"error": [f"Failed to generate preview: {e}"]}) 293 | 294 | def get_column_stats(self, column_name: str) -> Dict[str, Any]: 295 | """ 296 | Calculate and return statistics for a specific column. 297 | 298 | Args: 299 | column_name: The name of the column. 300 | 301 | Returns: 302 | A dictionary containing column statistics or error information. 303 | """ 304 | if self.df is None: 305 | return self._create_stats_result( 306 | column_name, "Unknown", {}, error="CSV data not loaded or handler closed." 307 | ) 308 | 309 | if column_name not in self.df.columns: 310 | return self._create_stats_result( 311 | column_name, "Unknown", {}, error=f"Column '{column_name}' not found in CSV data." 312 | ) 313 | 314 | try: 315 | col_series = self.df[column_name] 316 | col_type = self._original_dtypes.get(column_name, 'string') 317 | 318 | # Basic counts 319 | total_count = len(col_series) 320 | null_count = col_series.isna().sum() 321 | valid_count = total_count - null_count 322 | null_percentage = (null_count / total_count * 100) if total_count > 0 else 0 323 | 324 | stats = { 325 | "Total Count": f"{total_count:,}", 326 | "Valid Count": f"{valid_count:,}", 327 | "Null Count": f"{null_count:,}", 328 | "Null Percentage": f"{null_percentage:.2f}%" 329 | } 330 | 331 | # Type-specific statistics 332 | if valid_count > 0: 333 | valid_series = col_series.dropna() 334 | 335 | # Distinct count (always applicable) 336 | distinct_count = valid_series.nunique() 337 | stats["Distinct Count"] = f"{distinct_count:,}" 338 | 339 | if col_type in ['integer', 'float']: 340 | # Numeric statistics 341 | stats.update(self._calculate_numeric_stats_pandas(valid_series)) 342 | elif col_type == 'datetime': 343 | # Datetime statistics 344 | stats.update(self._calculate_datetime_stats_pandas(valid_series)) 345 | elif col_type == 'boolean': 346 | # Boolean statistics 347 | stats.update(self._calculate_boolean_stats_pandas(valid_series)) 348 | elif col_type == 'string': 349 | # String statistics (min/max by alphabetical order) 350 | stats.update(self._calculate_string_stats_pandas(valid_series)) 351 | 352 | return self._create_stats_result(column_name, col_type, stats, nullable=null_count > 0) 353 | 354 | except Exception as e: 355 | self.logger.exception(f"Error calculating stats for column '{column_name}'") 356 | return self._create_stats_result( 357 | column_name, "Unknown", {}, error=f"Failed to calculate statistics: {e}" 358 | ) 359 | 360 | def _calculate_numeric_stats_pandas(self, series: pd.Series) -> Dict[str, Any]: 361 | """Calculate statistics for numeric columns using pandas.""" 362 | stats = {} 363 | try: 364 | stats["Min"] = series.min() 365 | stats["Max"] = series.max() 366 | stats["Mean"] = f"{series.mean():.4f}" 367 | stats["Median (50%)"] = series.median() 368 | stats["StdDev"] = f"{series.std():.4f}" 369 | 370 | # Add histogram data for 
visualization 371 | try: 372 | # Sample data if too large for performance 373 | sample_size = min(10000, len(series)) 374 | if len(series) > sample_size: 375 | sampled_series = series.sample(n=sample_size, random_state=42) 376 | else: 377 | sampled_series = series 378 | 379 | # Convert to list for histogram 380 | clean_data = sampled_series.tolist() 381 | 382 | if len(clean_data) > 10: # Only create histogram if we have enough data 383 | stats["_histogram_data"] = clean_data 384 | stats["_data_type"] = "numeric" 385 | 386 | except Exception as e: 387 | self.logger.warning(f"Failed to prepare histogram data: {e}") 388 | 389 | except Exception as e: 390 | self.logger.warning(f"Error calculating numeric stats: {e}") 391 | stats["Calculation Error"] = str(e) 392 | return stats 393 | 394 | def _calculate_datetime_stats_pandas(self, series: pd.Series) -> Dict[str, Any]: 395 | """Calculate statistics for datetime columns using pandas.""" 396 | stats = {} 397 | try: 398 | stats["Min"] = series.min() 399 | stats["Max"] = series.max() 400 | # Calculate time range 401 | time_range = series.max() - series.min() 402 | stats["Range"] = str(time_range) 403 | except Exception as e: 404 | self.logger.warning(f"Error calculating datetime stats: {e}") 405 | stats["Calculation Error"] = str(e) 406 | return stats 407 | 408 | def _calculate_boolean_stats_pandas(self, series: pd.Series) -> Dict[str, Any]: 409 | """Calculate statistics for boolean columns using pandas.""" 410 | stats = {} 411 | try: 412 | value_counts = series.value_counts() 413 | stats["True Count"] = f"{value_counts.get(True, 0):,}" 414 | stats["False Count"] = f"{value_counts.get(False, 0):,}" 415 | if len(value_counts) > 0: 416 | true_pct = (value_counts.get(True, 0) / len(series) * 100) 417 | stats["True Percentage"] = f"{true_pct:.2f}%" 418 | except Exception as e: 419 | self.logger.warning(f"Error calculating boolean stats: {e}") 420 | stats["Calculation Error"] = str(e) 421 | return stats 422 | 423 | def _calculate_string_stats_pandas(self, series: pd.Series) -> Dict[str, Any]: 424 | """Calculate statistics for string columns using pandas.""" 425 | stats = {} 426 | try: 427 | # Only min/max for strings (alphabetical order) 428 | stats["Min"] = str(series.min()) 429 | stats["Max"] = str(series.max()) 430 | 431 | # Most common values 432 | value_counts = series.value_counts().head(5) 433 | if len(value_counts) > 0: 434 | top_values = {} 435 | for value, count in value_counts.items(): 436 | top_values[str(value)] = f"{count:,}" 437 | stats["Top Values"] = top_values 438 | except Exception as e: 439 | self.logger.warning(f"Error calculating string stats: {e}") 440 | stats["Calculation Error"] = str(e) 441 | return stats 442 | 443 | def _create_stats_result( 444 | self, 445 | column_name: str, 446 | col_type: str, 447 | calculated_stats: Dict[str, Any], 448 | nullable: Optional[bool] = None, 449 | error: Optional[str] = None, 450 | message: Optional[str] = None 451 | ) -> Dict[str, Any]: 452 | """Package the stats results consistently.""" 453 | return { 454 | "column": column_name, 455 | "type": col_type, 456 | "nullable": nullable if nullable is not None else "Unknown", 457 | "calculated": calculated_stats or {}, 458 | "error": error, 459 | "message": message, 460 | } 461 | -------------------------------------------------------------------------------- /src/parqv/data_sources/formats/json.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import Any, Dict, List, 
Optional, Tuple 3 | 4 | import duckdb 5 | import pandas as pd 6 | 7 | from ..base import DataHandler, DataHandlerError 8 | 9 | 10 | class JsonHandlerError(DataHandlerError): 11 | """Custom exception for JSON handling errors.""" 12 | pass 13 | 14 | 15 | class JsonHandler(DataHandler): 16 | """ 17 | Handles JSON file interactions using DuckDB. 18 | 19 | Leverages DuckDB's `read_json_auto` for parsing standard JSON and JSON Lines (ndjson) 20 | and `SUMMARIZE` for efficient statistics calculation. 21 | 22 | Attributes: 23 | file_path (Path): Path to the JSON file. 24 | """ 25 | DEFAULT_VIEW_NAME = "json_data_view" 26 | 27 | def __init__(self, file_path: Path): 28 | """ 29 | Initializes the JsonHandler. 30 | 31 | Args: 32 | file_path: Path to the JSON file. 33 | 34 | Raises: 35 | JsonHandlerError: If the file doesn't exist, isn't a file, or if 36 | initialization fails (e.g., DuckDB connection, view creation). 37 | """ 38 | super().__init__(file_path) 39 | 40 | self.file_path = self._validate_file_path(file_path) 41 | self._db_conn: Optional[duckdb.DuckDBPyConnection] = None 42 | self._view_name: str = self.DEFAULT_VIEW_NAME 43 | self._schema: Optional[List[Dict[str, Any]]] = None 44 | self._row_count: Optional[int] = None 45 | 46 | try: 47 | self._connect_db() 48 | self._create_duckdb_view() 49 | self._load_metadata() 50 | self.logger.info(f"JsonHandler initialized successfully for: {self.file_path}") 51 | except Exception as e: 52 | self.logger.exception(f"Error during JsonHandler initialization for {self.file_path}") 53 | self.close() 54 | if isinstance(e, JsonHandlerError): 55 | raise 56 | raise JsonHandlerError(f"Failed to initialize JSON handler: {e}") from e 57 | 58 | def _validate_file_path(self, file_path: Path) -> Path: 59 | """Checks if the file path is valid.""" 60 | resolved_path = file_path.resolve() 61 | if not resolved_path.is_file(): 62 | raise JsonHandlerError(f"JSON file not found or is not a file: {resolved_path}") 63 | return resolved_path 64 | 65 | def _connect_db(self): 66 | """Establishes a connection to an in-memory DuckDB database.""" 67 | try: 68 | self._db_conn = duckdb.connect(database=':memory:', read_only=False) 69 | self.logger.debug("DuckDB in-memory connection established.") 70 | except Exception as e: 71 | self.logger.exception("Failed to initialize DuckDB connection.") 72 | raise JsonHandlerError(f"DuckDB connection failed: {e}") from e 73 | 74 | def _create_duckdb_view(self): 75 | """Creates a DuckDB view pointing to the JSON file.""" 76 | if not self._db_conn: 77 | raise JsonHandlerError("DuckDB connection not available for view creation.") 78 | 79 | file_path_str = str(self.file_path) 80 | safe_view_name = f'"{self._view_name}"' 81 | load_query = f"CREATE OR REPLACE VIEW {safe_view_name} AS SELECT * FROM read_json_auto('{file_path_str}');" 82 | 83 | try: 84 | self._db_conn.sql(load_query) 85 | self.logger.debug(f"DuckDB view '{self._view_name}' created for file '{file_path_str}'.") 86 | except duckdb.Error as db_err: 87 | self.logger.exception(f"DuckDB Error creating view '{self._view_name}' from '{file_path_str}': {db_err}") 88 | if "Could not open file" in str(db_err): 89 | raise JsonHandlerError( 90 | f"DuckDB could not open file: {file_path_str}. Check permissions or path. Error: {db_err}") from db_err 91 | elif "JSON Error" in str(db_err) or "Parse Error" in str(db_err): 92 | raise JsonHandlerError( 93 | f"DuckDB failed to parse JSON file: {file_path_str}. Check format. 
Error: {db_err}") from db_err 94 | else: 95 | raise JsonHandlerError(f"DuckDB failed create view for JSON file: {db_err}") from db_err 96 | except Exception as e: 97 | self.logger.exception(f"Unexpected error creating DuckDB view '{self._view_name}'.") 98 | raise JsonHandlerError(f"Failed to create DuckDB view: {e}") from e 99 | 100 | def _load_metadata(self): 101 | """Fetches schema and row count from the DuckDB view.""" 102 | if not self._db_conn: 103 | raise JsonHandlerError("Cannot fetch metadata, DuckDB connection not available.") 104 | 105 | try: 106 | # Fetch Schema 107 | describe_query = f"DESCRIBE \"{self._view_name}\";" 108 | schema_result = self._db_conn.sql(describe_query).fetchall() 109 | self._schema = self._parse_schema(schema_result) 110 | self.logger.debug(f"Schema fetched for view '{self._view_name}': {len(self._schema)} columns.") 111 | 112 | # Fetch Row Count 113 | count_query = f"SELECT COUNT(*) FROM \"{self._view_name}\";" 114 | count_result = self._db_conn.sql(count_query).fetchone() 115 | self._row_count = count_result[0] if count_result else 0 116 | self.logger.debug(f"Row count fetched for view '{self._view_name}': {self._row_count}") 117 | 118 | except duckdb.Error as db_err: 119 | self.logger.exception(f"DuckDB Error fetching metadata for view '{self._view_name}': {db_err}") 120 | self._schema = None 121 | self._row_count = None 122 | except Exception as e: 123 | self.logger.exception(f"Unexpected error fetching metadata for view '{self._view_name}'") 124 | self._schema = None 125 | self._row_count = None 126 | 127 | def _parse_schema(self, describe_output: List[Tuple]) -> List[Dict[str, Any]]: 128 | """Parses the output of DuckDB's DESCRIBE query.""" 129 | if not describe_output: 130 | self.logger.warning(f"DESCRIBE query for view '{self._view_name}' returned no schema info.") 131 | return [] 132 | 133 | parsed_schema = [] 134 | for row in describe_output: 135 | # Handle potential variations in DESCRIBE output columns 136 | if len(row) >= 3: 137 | name, type_str, null_str = row[0], row[1], row[2] 138 | is_nullable = None 139 | if isinstance(null_str, str): 140 | is_nullable = null_str.upper() == 'YES' 141 | parsed_schema.append({"name": name, "type": type_str, "nullable": is_nullable}) 142 | else: 143 | self.logger.warning(f"Unexpected format in DESCRIBE output row: {row}") 144 | return parsed_schema 145 | 146 | def get_schema_data(self) -> Optional[List[Dict[str, Any]]]: 147 | """ 148 | Returns the schema of the JSON data. 149 | 150 | Returns: 151 | A list of dictionaries describing columns (name, type, nullable), 152 | or None if schema couldn't be fetched. 153 | """ 154 | if self._schema is None: 155 | self.logger.warning("Schema is unavailable. It might not have been fetched successfully.") 156 | return self._schema 157 | 158 | def get_metadata_summary(self) -> Dict[str, Any]: 159 | """ 160 | Provides a summary dictionary of the JSON file's metadata. 161 | 162 | Returns: 163 | A dictionary containing metadata like file path, format, row count, columns, size. 
164 | """ 165 | if not self._db_conn: 166 | return {"error": "DuckDB connection not initialized or closed."} 167 | 168 | row_count_str = "N/A (Error fetching)" 169 | if self._row_count is not None: 170 | row_count_str = f"{self._row_count:,}" 171 | 172 | columns_str = "N/A (Error fetching)" 173 | if self._schema is not None: 174 | columns_str = f"{len(self._schema):,}" 175 | 176 | summary = { 177 | "File Path": str(self.file_path), 178 | "Format": "JSON/JSONL", 179 | "DuckDB View": self._view_name, 180 | "Total Rows": row_count_str, 181 | "Columns": columns_str, 182 | } 183 | try: 184 | summary["Size"] = f"{self.file_path.stat().st_size:,} bytes" 185 | except Exception as e: 186 | self.logger.warning(f"Could not get file size for {self.file_path}: {e}") 187 | summary["Size"] = "N/A" 188 | 189 | return summary 190 | 191 | def get_data_preview(self, num_rows: int = 50) -> pd.DataFrame: 192 | """ 193 | Fetches a preview of the data. 194 | 195 | Args: 196 | num_rows: The maximum number of rows to preview. 197 | 198 | Returns: 199 | A pandas DataFrame containing the first `num_rows` of data, 200 | an empty DataFrame if the file is empty, or a DataFrame with an 201 | error message if fetching fails. 202 | """ 203 | if not self._db_conn: 204 | self.logger.warning("Data preview unavailable: DuckDB connection is closed or uninitialized.") 205 | return pd.DataFrame({"error": ["DuckDB connection not available."]}) 206 | if self._schema is None: 207 | self.logger.warning("Data preview unavailable: Schema couldn't be determined.") 208 | return pd.DataFrame({"error": ["Schema not available, cannot fetch preview."]}) 209 | if self._row_count == 0: 210 | self.logger.info("Data preview: Source JSON view is empty.") 211 | # Return empty DataFrame with correct columns if possible 212 | if self._schema: 213 | return pd.DataFrame(columns=[col['name'] for col in self._schema]) 214 | else: 215 | return pd.DataFrame() # Fallback 216 | 217 | try: 218 | limit = max(1, num_rows) 219 | preview_query = f"SELECT * FROM \"{self._view_name}\" LIMIT {limit};" 220 | df = self._db_conn.sql(preview_query).df() 221 | return df 222 | except duckdb.Error as db_err: 223 | self.logger.exception(f"DuckDB error getting data preview from '{self._view_name}': {db_err}") 224 | return pd.DataFrame({"error": [f"DuckDB error fetching preview: {db_err}"]}) 225 | except Exception as e: 226 | self.logger.exception(f"Unexpected error getting data preview from '{self._view_name}'") 227 | return pd.DataFrame({"error": [f"Failed to fetch preview: {e}"]}) 228 | 229 | def _get_column_info(self, column_name: str) -> Optional[Dict[str, Any]]: 230 | """Retrieves schema information for a specific column.""" 231 | if self._schema is None: 232 | return None 233 | return next((col for col in self._schema if col["name"] == column_name), None) 234 | 235 | def _is_complex_type(self, dtype_str: str) -> bool: 236 | """Checks if a DuckDB data type string represents a complex type.""" 237 | if not isinstance(dtype_str, str): 238 | return False 239 | dtype_upper = dtype_str.upper() 240 | return any(t in dtype_upper for t in ['STRUCT', 'LIST', 'MAP', 'UNION']) 241 | 242 | def get_column_stats(self, column_name: str) -> Dict[str, Any]: 243 | """ 244 | Calculates statistics for a given column using DuckDB's SUMMARIZE or basic counts. 245 | 246 | Args: 247 | column_name: The name of the column to analyze. 248 | 249 | Returns: 250 | A dictionary containing calculated statistics, type information, and 251 | any errors or messages. 
252 | """ 253 | if not self._db_conn: 254 | return self._create_stats_result(column_name, "Unknown", {}, error="DuckDB connection not available.") 255 | 256 | col_info = self._get_column_info(column_name) 257 | if not col_info: 258 | return self._create_stats_result(column_name, "Unknown", {}, 259 | error=f"Column '{column_name}' not found in schema.") 260 | 261 | col_type = col_info["type"] 262 | col_nullable = col_info["nullable"] # Already boolean or None 263 | is_complex = self._is_complex_type(col_type) 264 | safe_column_name = f'"{column_name}"' # Quote column name for safety 265 | stats: Dict[str, Any] = {} 266 | error_msg: Optional[str] = None 267 | message: Optional[str] = None 268 | 269 | try: 270 | if self._row_count == 0: 271 | message = "Table is empty. No statistics calculated." 272 | return self._create_stats_result(column_name, col_type, stats, nullable=col_nullable, message=message) 273 | 274 | if is_complex: 275 | # Use basic counts for complex types as SUMMARIZE is less informative 276 | self.logger.debug(f"Calculating basic counts for complex type column: {column_name}") 277 | stats = self._get_basic_column_counts(safe_column_name) 278 | message = f"Only basic counts calculated for complex type '{col_type}'." 279 | # Attempt distinct count for complex types (can be slow/error-prone) 280 | try: 281 | distinct_q = f"SELECT COUNT(DISTINCT {safe_column_name}) FROM \"{self._view_name}\" WHERE {safe_column_name} IS NOT NULL;" 282 | distinct_res = self._db_conn.sql(distinct_q).fetchone() 283 | if distinct_res and distinct_res[0] is not None: 284 | stats["Distinct Count"] = f"{distinct_res[0]:,}" 285 | else: 286 | stats["Distinct Count"] = "N/A" # Or 0 if appropriate 287 | except duckdb.Error as distinct_err: 288 | self.logger.warning( 289 | f"Could not calculate distinct count for complex column '{column_name}': {distinct_err}") 290 | stats["Distinct Count"] = "Error" 291 | 292 | else: 293 | # Use SUMMARIZE for non-complex types 294 | self.logger.debug(f"Using SUMMARIZE for simple type column: {column_name}") 295 | summarize_query = f"SUMMARIZE SELECT {safe_column_name} FROM \"{self._view_name}\";" 296 | summarize_df = self._db_conn.sql(summarize_query).df() 297 | 298 | if summarize_df.empty: 299 | message = "SUMMARIZE returned no results (column might be all NULLs or empty)." 
300 | # Get basic counts as fallback if summarize is empty 301 | stats = self._get_basic_column_counts(safe_column_name) 302 | else: 303 | # SUMMARIZE puts results in the first row 304 | stats = self._format_summarize_stats(summarize_df.iloc[0]) 305 | 306 | # Add histogram data for numeric columns 307 | try: 308 | self._add_histogram_data_if_numeric(stats, safe_column_name) 309 | except Exception as hist_e: 310 | self.logger.warning(f"Failed to add histogram data for {column_name}: {hist_e}") 311 | 312 | except duckdb.Error as db_err: 313 | self.logger.exception(f"DuckDB Error calculating statistics for column '{column_name}': {db_err}") 314 | error_msg = f"DuckDB calculation failed: {db_err}" 315 | except Exception as e: 316 | self.logger.exception(f"Unexpected error calculating statistics for column '{column_name}'") 317 | error_msg = f"Calculation failed unexpectedly: {e}" 318 | 319 | return self._create_stats_result( 320 | column_name, col_type, stats, nullable=col_nullable, error=error_msg, message=message 321 | ) 322 | 323 | def _get_basic_column_counts(self, safe_column_name: str) -> Dict[str, Any]: 324 | """Helper to get total, null, and valid counts for a column.""" 325 | stats = {} 326 | if not self._db_conn or self._row_count is None: 327 | return {"error": "Connection or row count unavailable for basic counts"} 328 | 329 | if self._row_count == 0: 330 | stats["Total Count"] = "0" 331 | stats["Valid Count"] = "0" 332 | stats["Null Count"] = "0" 333 | stats["Null Percentage"] = "N/A" 334 | return stats 335 | 336 | try: 337 | q_counts = f""" 338 | SELECT 339 | SUM(CASE WHEN {safe_column_name} IS NULL THEN 1 ELSE 0 END) AS null_count 340 | FROM "{self._view_name}"; 341 | """ 342 | counts_res = self._db_conn.sql(q_counts).fetchone() 343 | 344 | if counts_res: 345 | null_count = counts_res[0] if counts_res[0] is not None else 0 346 | total_count = self._row_count 347 | valid_count = total_count - null_count 348 | stats["Total Count"] = f"{total_count:,}" 349 | stats["Valid Count"] = f"{valid_count:,}" 350 | stats["Null Count"] = f"{null_count:,}" 351 | stats["Null Percentage"] = f"{(null_count / total_count * 100):.2f}%" if total_count > 0 else "N/A" 352 | else: 353 | stats["Total Count"] = f"{self._row_count:,}" 354 | stats["Valid Count"] = "Error" 355 | stats["Null Count"] = "Error" 356 | stats["Null Percentage"] = "Error" 357 | 358 | except duckdb.Error as db_err: 359 | self.logger.warning(f"Failed to get basic counts for {safe_column_name}: {db_err}") 360 | stats["Counts Error"] = str(db_err) 361 | return stats 362 | 363 | def _format_summarize_stats(self, summarize_row: pd.Series) -> Dict[str, Any]: 364 | """Formats the output of DuckDB's SUMMARIZE into a stats dictionary.""" 365 | stats = {} 366 | if 'count' in summarize_row and pd.notna(summarize_row['count']): 367 | total_count = int(summarize_row['count']) 368 | stats["Total Count"] = f"{total_count:,}" 369 | null_count = 0 370 | if 'null_percentage' in summarize_row and pd.notna(summarize_row['null_percentage']): 371 | null_perc = summarize_row['null_percentage'] 372 | null_count = int(round(total_count * (null_perc / 100.0))) 373 | stats["Null Percentage"] = f"{null_perc:.2f}%" 374 | stats["Null Count"] = f"{null_count:,}" 375 | else: 376 | stats["Null Percentage"] = "0.00%" # Assume 0 if missing 377 | stats["Null Count"] = "0" 378 | 379 | stats["Valid Count"] = f"{total_count - null_count:,}" 380 | else: 381 | stats["Total Count"] = "N/A" 382 | stats["Valid Count"] = "N/A" 383 | stats["Null Count"] = "N/A" 384 | 
stats["Null Percentage"] = "N/A" 385 | 386 | # Distinct Count 387 | if 'distinct' in summarize_row and pd.notna(summarize_row['distinct']): 388 | stats["Distinct Count"] = f"{int(summarize_row['distinct']):,}" 389 | 390 | # Numeric Stats 391 | if 'min' in summarize_row and pd.notna(summarize_row['min']): 392 | stats["Min"] = summarize_row['min'] 393 | if 'max' in summarize_row and pd.notna(summarize_row['max']): 394 | stats["Max"] = summarize_row['max'] 395 | if 'mean' in summarize_row and pd.notna(summarize_row['mean']): 396 | try: 397 | stats["Mean"] = f"{float(summarize_row['mean']):.4f}" 398 | except (ValueError, TypeError): 399 | stats["Mean"] = str(summarize_row['mean']) 400 | if 'std' in summarize_row and pd.notna(summarize_row['std']): 401 | try: 402 | stats["StdDev"] = f"{float(summarize_row['std']):.4f}" 403 | except (ValueError, TypeError): 404 | stats["StdDev"] = str(summarize_row['std']) 405 | 406 | # Quantiles (example for median) 407 | if '50%' in summarize_row and pd.notna(summarize_row['50%']): 408 | stats["Median (50%)"] = summarize_row['50%'] 409 | 410 | return stats 411 | 412 | def _add_histogram_data_if_numeric(self, stats: Dict[str, Any], safe_column_name: str) -> None: 413 | """Add histogram data for numeric columns by sampling from DuckDB.""" 414 | # Check if this looks like numeric data (has Mean, Min, Max) 415 | if not all(key in stats for key in ["Mean", "Min", "Max"]): 416 | return 417 | 418 | try: 419 | # Sample data for histogram (limit to 10k samples for performance) 420 | sample_query = f""" 421 | SELECT {safe_column_name} 422 | FROM "{self._view_name}" 423 | WHERE {safe_column_name} IS NOT NULL 424 | USING SAMPLE 10000 425 | """ 426 | 427 | sample_df = self._db_conn.sql(sample_query).df() 428 | 429 | if not sample_df.empty and len(sample_df) > 10: 430 | # Extract the column data 431 | column_data = sample_df.iloc[:, 0].tolist() 432 | 433 | # Filter out any remaining nulls 434 | clean_data = [val for val in column_data if val is not None] 435 | 436 | if len(clean_data) > 10: 437 | stats["_histogram_data"] = clean_data 438 | stats["_data_type"] = "numeric" 439 | 440 | except Exception as e: 441 | self.logger.warning(f"Failed to sample data for histogram: {e}") 442 | 443 | def _create_stats_result( 444 | self, 445 | column_name: str, 446 | col_type: str, 447 | calculated_stats: Dict[str, Any], 448 | nullable: Optional[bool] = None, 449 | error: Optional[str] = None, 450 | message: Optional[str] = None 451 | ) -> Dict[str, Any]: 452 | """Packages the stats results consistently.""" 453 | return { 454 | "column": column_name, 455 | "type": col_type, 456 | "nullable": nullable if nullable is not None else "Unknown", 457 | "calculated": calculated_stats or {}, 458 | "basic_metadata_stats": None, 459 | "metadata_stats_error": None, 460 | "error": error, 461 | "message": message, 462 | } 463 | 464 | def close(self): 465 | """Closes the DuckDB connection if it's open.""" 466 | if self._db_conn: 467 | try: 468 | self._db_conn.close() 469 | self.logger.info(f"DuckDB connection closed for {self.file_path}.") 470 | self._db_conn = None 471 | except Exception as e: 472 | # Log error but don't raise during close typically 473 | self.logger.error(f"Error closing DuckDB connection for {self.file_path}: {e}") 474 | self._db_conn = None # Assume closed even if error occurred 475 | 476 | def __enter__(self): 477 | """Enter context management.""" 478 | return self 479 | 480 | def __exit__(self, exc_type, exc_val, exc_tb): 481 | """Exit context management, ensuring connection 
closure.""" 482 | self.close() 483 | 484 | def __del__(self): 485 | """Ensures connection is closed when object is garbage collected (best effort).""" 486 | self.close() 487 | -------------------------------------------------------------------------------- /src/parqv/data_sources/formats/parquet.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import Any, Dict, List, Tuple, Optional, Union 3 | 4 | import pandas as pd 5 | import pyarrow as pa 6 | import pyarrow.compute as pc 7 | import pyarrow.parquet as pq 8 | 9 | from ..base import DataHandler, DataHandlerError 10 | 11 | 12 | class ParquetHandlerError(DataHandlerError): 13 | """Custom exception for Parquet Handler errors.""" 14 | pass 15 | 16 | 17 | class ParquetHandler(DataHandler): 18 | """ 19 | Handles Parquet file interactions using PyArrow. 20 | 21 | Provides methods to access metadata, schema, data preview, and column statistics. 22 | Manages the Parquet file resource lifecycle. 23 | """ 24 | 25 | def __init__(self, file_path: Path): 26 | """ 27 | Initializes the ParquetHandler by validating the path and opening the Parquet file. 28 | 29 | Args: 30 | file_path: Path to the Parquet file. 31 | 32 | Raises: 33 | ParquetHandlerError: If the file is not found, not a file, or cannot be opened/read. 34 | """ 35 | super().__init__(file_path) 36 | self.pq_file: Optional[pq.ParquetFile] = None 37 | self.schema: Optional[pa.Schema] = None 38 | self.metadata: Optional[pq.FileMetaData] = None 39 | 40 | try: 41 | # Validate file existence using the path stored by the base class 42 | if not self.file_path.is_file(): 43 | raise FileNotFoundError(f"Parquet file not found or is not a file: {self.file_path}") 44 | 45 | # Open the Parquet file 46 | self.pq_file = pq.ParquetFile(self.file_path) 47 | self.schema = self.pq_file.schema_arrow 48 | self.metadata = self.pq_file.metadata 49 | self.logger.info(f"Successfully initialized ParquetHandler for: {self.file_path.name}") 50 | 51 | except FileNotFoundError as fnf_e: 52 | self.logger.error(f"File not found during ParquetHandler initialization: {fnf_e}") 53 | raise ParquetHandlerError(str(fnf_e)) from fnf_e 54 | except pa.lib.ArrowIOError as arrow_io_e: 55 | self.logger.error(f"Arrow IO Error initializing ParquetHandler for {self.file_path.name}: {arrow_io_e}") 56 | raise ParquetHandlerError( 57 | f"Failed to open Parquet file '{self.file_path.name}': {arrow_io_e}") from arrow_io_e 58 | except Exception as e: 59 | self.logger.exception(f"Unexpected error initializing ParquetHandler for {self.file_path.name}") 60 | self.close() 61 | raise ParquetHandlerError(f"Failed to initialize Parquet handler '{self.file_path.name}': {e}") from e 62 | 63 | # Resource Management 64 | def close(self) -> None: 65 | """Closes the Parquet file resource if it's open.""" 66 | if self.pq_file is not None: 67 | try: 68 | # ParquetFile might not have a close method depending on source, check first 69 | if hasattr(self.pq_file, 'close'): 70 | self.pq_file.close() 71 | self.logger.info(f"Closed Parquet file: {self.file_path.name}") 72 | except Exception as e: 73 | # Log error during close but don't raise, as we're cleaning up 74 | self.logger.warning(f"Exception while closing Parquet file {self.file_path.name}: {e}") 75 | finally: 76 | self.pq_file = None 77 | self.schema = None 78 | self.metadata = None 79 | 80 | def __enter__(self): 81 | """Enter the runtime context related to this object.""" 82 | if not self.pq_file: 83 | raise 
ParquetHandlerError("Parquet file is not open or handler was closed.") 84 | return self 85 | 86 | def __exit__(self, exc_type, exc_val, exc_tb): 87 | """Exit the runtime context related to this object, ensuring cleanup.""" 88 | self.close() 89 | 90 | def __del__(self): 91 | """Attempt to close the file when the object is garbage collected (best effort).""" 92 | self.close() 93 | 94 | def get_metadata_summary(self) -> Dict[str, Any]: 95 | """ 96 | Provides a summary dictionary of the Parquet file's metadata. 97 | 98 | Returns: 99 | A dictionary containing key metadata attributes, or an error dictionary. 100 | """ 101 | if not self.metadata or not self.schema: 102 | self.logger.warning(f"Metadata or schema not available for summary: {self.file_path.name}") 103 | return {"error": "Metadata or schema not available"} 104 | 105 | try: 106 | created_by = self._decode_metadata_bytes(self.metadata.created_by) or "N/A" 107 | file_size = self.file_path.stat().st_size 108 | summary = { 109 | "File Path": str(self.file_path.resolve()), 110 | "Format": "Parquet", 111 | "Size": self._format_size(file_size), 112 | "Total Rows": f"{self.metadata.num_rows:,}", 113 | "Row Groups": self.metadata.num_row_groups, 114 | "Columns": self.metadata.num_columns, 115 | "Format Version": self.metadata.format_version, 116 | "Creator": created_by, 117 | "Serialization Library": self._decode_metadata_bytes( 118 | self.metadata.serialized_size > 0 and self.metadata.created_by) or "N/A", 119 | } 120 | kv_meta = self._decode_key_value_metadata(self.metadata.metadata) 121 | if kv_meta: 122 | summary["Key Value Metadata"] = kv_meta 123 | 124 | return summary 125 | except Exception as e: 126 | self.logger.exception(f"Error generating metadata summary for {self.file_path.name}") 127 | return {"error": f"Error getting metadata summary: {e}"} 128 | 129 | def get_schema_data(self) -> Optional[List[Dict[str, Any]]]: 130 | """ 131 | Returns a simplified list representation of the Arrow schema. 132 | 133 | Returns: 134 | A list of dictionaries, each describing a column (name, type string, nullable bool), 135 | or None if the schema is unavailable. 136 | """ 137 | if not self.schema: 138 | self.logger.warning(f"Schema is not available for get_schema_data: {self.file_path.name}") 139 | return None 140 | 141 | schema_list = [] 142 | for field in self.schema: 143 | try: 144 | type_str = self._format_pyarrow_type(field.type) 145 | schema_list.append({ 146 | "name": field.name, 147 | "type": type_str, 148 | "nullable": field.nullable 149 | }) 150 | except Exception as e: 151 | self.logger.error(f"Error processing field '{field.name}' for schema data: {e}", exc_info=True) 152 | schema_list.append({ 153 | "name": field.name, 154 | "type": f"[Error: {e}]", 155 | "nullable": None 156 | }) 157 | return schema_list 158 | 159 | def get_data_preview(self, num_rows: int = 50) -> pd.DataFrame: 160 | """ 161 | Fetches a preview of the data from the Parquet file using efficient batch iteration. 162 | 163 | Args: 164 | num_rows: The maximum number of rows to fetch. 165 | 166 | Returns: 167 | A pandas DataFrame with the preview data, potentially using ArrowDTypes. 168 | Returns an empty DataFrame if the file is empty or no data is read. 169 | Returns a DataFrame with an 'error' column on failure. 
170 | """ 171 | if not self.pq_file: 172 | self.logger.warning(f"ParquetFile handler not available for data preview: {self.file_path.name}") 173 | return pd.DataFrame({"error": ["Parquet handler not initialized or closed."]}) 174 | 175 | if self.metadata and self.metadata.num_rows == 0: 176 | self.logger.info(f"Parquet file is empty based on metadata: {self.file_path.name}") 177 | if self.schema: 178 | return pd.DataFrame(columns=self.schema.names) 179 | else: 180 | return pd.DataFrame() 181 | 182 | try: 183 | # Determine rows to fetch, capped by file total 184 | num_rows_to_fetch = num_rows 185 | if self.metadata: 186 | num_rows_to_fetch = min(num_rows, self.metadata.num_rows) 187 | 188 | # Use iter_batches for memory efficiency 189 | batches = [] 190 | rows_read = 0 191 | internal_batch_size = min(max(num_rows_to_fetch // 2, 1024), 65536) 192 | 193 | for batch in self.pq_file.iter_batches(batch_size=internal_batch_size): 194 | if rows_read >= num_rows_to_fetch: 195 | break 196 | rows_needed_in_batch = num_rows_to_fetch - rows_read 197 | slice_len = min(len(batch), rows_needed_in_batch) 198 | batches.append(batch.slice(0, slice_len)) 199 | rows_read += slice_len 200 | if rows_read >= num_rows_to_fetch: 201 | break 202 | 203 | if not batches: 204 | # Check if file might have rows but reading yielded nothing 205 | if self.metadata and self.metadata.num_rows > 0: 206 | self.logger.warning( 207 | f"No batches read for preview, though metadata indicates {self.metadata.num_rows} rows: {self.file_path.name}") 208 | else: 209 | self.logger.info(f"No data read for preview (file likely empty): {self.file_path.name}") 210 | # Return empty DF with columns if schema available 211 | if self.schema: 212 | return pd.DataFrame(columns=self.schema.names) 213 | else: 214 | return pd.DataFrame() 215 | 216 | # Combine batches and convert to Pandas 217 | preview_table = pa.Table.from_batches(batches) 218 | df = preview_table.to_pandas( 219 | split_blocks=True, 220 | self_destruct=True, 221 | types_mapper=pd.ArrowDtype 222 | ) 223 | self.logger.info(f"Generated preview of {len(df)} rows for {self.file_path.name}") 224 | return df 225 | 226 | except Exception as e: 227 | self.logger.exception(f"Error generating data preview from Parquet file: {self.file_path.name}") 228 | return pd.DataFrame({"error": [f"Failed to fetch preview: {e}"]}) 229 | 230 | def get_column_stats(self, column_name: str) -> Dict[str, Any]: 231 | """ 232 | Calculates statistics for a specific column by reading its data. 233 | 234 | Args: 235 | column_name: The name of the column to analyze. 236 | 237 | Returns: 238 | A dictionary containing calculated statistics, metadata statistics, 239 | and potential error or message keys. 
240 | """ 241 | if not self.pq_file or not self.schema: 242 | self.logger.warning(f"Parquet file/schema unavailable for column stats: {self.file_path.name}") 243 | return self._create_stats_result(column_name, None, error="File or schema not available") 244 | 245 | try: 246 | field = self.schema.field(column_name) 247 | except KeyError: 248 | self.logger.warning(f"Column '{column_name}' not found in schema: {self.file_path.name}") 249 | return self._create_stats_result(column_name, None, error=f"Column '{column_name}' not found in schema") 250 | 251 | calculated_stats: Dict[str, Any] = {} 252 | error_msg: Optional[str] = None 253 | message: Optional[str] = None 254 | metadata_stats: Optional[Dict] = None 255 | metadata_stats_error: Optional[str] = None 256 | 257 | try: 258 | # Data Reading 259 | table = self.pq_file.read(columns=[column_name]) 260 | column_data = table.column(0) 261 | self.logger.debug( 262 | f"Finished reading column '{column_name}'. Rows: {len(column_data)}, Nulls: {column_data.null_count}") 263 | 264 | # Basic Counts 265 | total_count = len(column_data) 266 | if total_count > 0: 267 | null_count = column_data.null_count 268 | valid_count = total_count - null_count 269 | calculated_stats["Total Count"] = f"{total_count:,}" 270 | calculated_stats["Valid Count"] = f"{valid_count:,}" 271 | calculated_stats["Null Count"] = f"{null_count:,}" 272 | calculated_stats["Null Percentage"] = f"{(null_count / total_count * 100):.2f}%" 273 | else: 274 | self.logger.info(f"Column '{column_name}' read resulted in 0 rows.") 275 | message = "Column is empty (0 rows)." 276 | valid_count = 0 # Ensure valid_count is 0 for later checks 277 | 278 | # Type-Specific Calculations 279 | if valid_count > 0: 280 | col_type = field.type 281 | self.logger.debug(f"Calculating stats for type: {self._format_pyarrow_type(col_type)}") 282 | try: 283 | if pa.types.is_floating(col_type) or pa.types.is_integer(col_type): 284 | calculated_stats.update(self._calculate_numeric_stats(column_data)) 285 | elif pa.types.is_temporal(col_type): 286 | calculated_stats.update(self._calculate_temporal_stats(column_data)) 287 | elif pa.types.is_string(col_type) or pa.types.is_large_string(col_type) \ 288 | or pa.types.is_binary(col_type) or pa.types.is_large_binary(col_type): 289 | calculated_stats.update(self._calculate_string_binary_stats(column_data)) 290 | elif pa.types.is_boolean(col_type): 291 | calculated_stats.update(self._calculate_boolean_stats(column_data)) 292 | elif pa.types.is_dictionary(col_type): 293 | calculated_stats.update(self._calculate_dictionary_stats(column_data, col_type)) 294 | message = calculated_stats.pop("message", message) 295 | elif pa.types.is_struct(col_type) or pa.types.is_list(col_type) or pa.types.is_map(col_type) \ 296 | or pa.types.is_fixed_size_list(col_type) or pa.types.is_union(col_type): 297 | calculated_stats.update(self._calculate_complex_type_stats(column_data, col_type)) 298 | message = f"Basic aggregate stats (min/max/mean) not applicable for complex type '{self._format_pyarrow_type(col_type)}'." 299 | else: 300 | self.logger.warning(f"Statistics calculation not fully implemented for type: {col_type}") 301 | message = f"Statistics calculation not implemented for type '{self._format_pyarrow_type(col_type)}'." 
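                        # Note: types without a dedicated branch above (e.g. DECIMAL) fall through to this message;
                        # only the basic counts computed earlier are reported for them.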
302 |
303 |                 except Exception as calc_err:
304 |                     self.logger.exception(
305 |                         f"Error during type-specific calculation for column '{column_name}': {calc_err}")
306 |                     error_msg = f"Calculation error for type {field.type}: {calc_err}"
307 |                     calculated_stats["Calculation Error"] = str(calc_err)  # Add specific error key
308 |
309 |             elif total_count > 0:
310 |                 message = "Column contains only NULL values."
311 |
312 |             # --- Metadata Statistics ---
313 |             metadata_stats, metadata_stats_error = self._get_stats_from_metadata(column_name)
314 |
315 |         except pa.lib.ArrowException as arrow_e:
316 |             self.logger.exception(f"Arrow error during stats processing for column '{column_name}': {arrow_e}")
317 |             error_msg = f"Arrow processing error: {arrow_e}"
318 |         except Exception as e:
319 |             self.logger.exception(f"Unexpected error during stats calculation for column '{column_name}'")
320 |             error_msg = f"Calculation failed unexpectedly: {e}"
321 |
322 |         return self._create_stats_result(
323 |             column_name, field, calculated_stats, metadata_stats, metadata_stats_error, error_msg, message
324 |         )
325 |
326 |     def _decode_metadata_bytes(self, value: Optional[Union[bytes, str]]) -> Optional[str]:
327 |         """Safely decodes bytes metadata values to UTF-8 strings, replacing errors."""
328 |         if isinstance(value, bytes):
329 |             try:
330 |                 return value.decode('utf-8', errors='replace')
331 |             except Exception as e:
332 |                 self.logger.warning(f"Could not decode metadata bytes: {e}. Value: {value!r}")
333 |                 return f"[Decode Error: {value!r}]"
334 |         return str(value) if value is not None else None
335 |
336 |     def _decode_key_value_metadata(self, kv_meta: Optional[Dict[Union[str, bytes], Union[str, bytes]]]) -> Optional[
337 |         Dict[str, str]]:
338 |         """Decodes keys and values of the key-value metadata dictionary."""
339 |         if not kv_meta:
340 |             return None
341 |         decoded_kv = {}
342 |         try:
343 |             for k, v in kv_meta.items():
344 |                 key_str = self._decode_metadata_bytes(k) or "[Invalid Key]"
345 |                 val_str = self._decode_metadata_bytes(v) or "[Invalid Value]"
346 |                 decoded_kv[key_str] = val_str
347 |             return decoded_kv
348 |         except Exception as e:
349 |             self.logger.warning(f"Could not decode key-value metadata: {e}")
350 |             return {"error": f"Error decoding key-value metadata: {e}"}
351 |
352 |     def _format_pyarrow_type(self, field_type: pa.DataType) -> str:
353 |         """Formats a PyArrow DataType into a readable string, including details."""
354 |         if pa.types.is_timestamp(field_type):
355 |             tz_str = f", tz='{field_type.tz}'" if field_type.tz else ""
356 |             return f"TIMESTAMP(unit='{field_type.unit}'{tz_str})"
357 |         if pa.types.is_time32(field_type) or pa.types.is_time64(field_type):
358 |             return f"TIME(unit='{field_type.unit}')"
359 |         if pa.types.is_duration(field_type):
360 |             return f"DURATION(unit='{field_type.unit}')"
361 |         if pa.types.is_decimal128(field_type) or pa.types.is_decimal256(field_type):
362 |             return f"DECIMAL({field_type.precision}, {field_type.scale})"
363 |         if pa.types.is_fixed_size_binary(field_type):
364 |             return f"FIXED_SIZE_BINARY({field_type.byte_width})"
365 |         if pa.types.is_list(field_type) or pa.types.is_large_list(field_type) or pa.types.is_fixed_size_list(
366 |                 field_type):
367 |             prefix = "LIST"
368 |             if pa.types.is_large_list(field_type): prefix = "LARGE_LIST"
369 |             if pa.types.is_fixed_size_list(field_type): prefix = f"FIXED_SIZE_LIST({field_type.list_size})"
370 |             value_type_str = self._format_pyarrow_type(field_type.value_type)
371 |             return f"{prefix}<{value_type_str}>"  # e.g. LIST<INT64>
372 |         if pa.types.is_struct(field_type):
373 |             num_fields_to_show = 3
374 |             field_details = ", ".join(
375 |                 f"{f.name}: {self._format_pyarrow_type(f.type)}" for f in field_type[:num_fields_to_show])
376 |             suffix = "..." if field_type.num_fields > num_fields_to_show else ""
377 |             return f"STRUCT<{field_details}{suffix}>"
378 |         if pa.types.is_map(field_type):
379 |             keys_sorted = getattr(field_type, 'keys_sorted', False)
380 |             sorted_str = ", keys_sorted" if keys_sorted else ""
381 |             key_type_str = self._format_pyarrow_type(field_type.key_type)
382 |             item_type_str = self._format_pyarrow_type(field_type.item_type)
383 |             return f"MAP<{key_type_str}, {item_type_str}{sorted_str}>"
384 |         if pa.types.is_dictionary(field_type):
385 |             index_type_str = self._format_pyarrow_type(field_type.index_type)
386 |             value_type_str = self._format_pyarrow_type(field_type.value_type)
387 |             ordered = getattr(field_type, 'ordered', False)
388 |             return f"DICTIONARY<{index_type_str}, {value_type_str}{', ordered' if ordered else ''}>"
389 |         if pa.types.is_union(field_type):
390 |             type_codes = getattr(field_type, 'type_codes', [])
391 |             mode = getattr(field_type, 'mode', 'sparse')
392 |             field_details = ", ".join(
393 |                 f"{f.name}: {self._format_pyarrow_type(f.type)}" for f in field_type[:3])  # Show first few fields
394 |             suffix = "..." if field_type.num_fields > 3 else ""
395 |             return f"UNION<{field_details}{suffix}> (mode='{mode}', codes={type_codes[:5]}{'...' if len(type_codes) > 5 else ''})"
396 |
397 |         return str(field_type).upper()
398 |
399 |     def _safe_compute(self, func, data, *args, **kwargs) -> Tuple[Optional[Any], Optional[str]]:
400 |         """Helper to safely execute a pyarrow.compute function and handle errors."""
401 |         if data.null_count == len(data):
402 |             return None, "Input data is all NULL"
403 |         try:
404 |             result_scalar = func(data, *args, **kwargs)
405 |             return result_scalar.as_py() if result_scalar.is_valid else None, None
406 |         except pa.lib.ArrowNotImplementedError as nie:
407 |             return None, "Not Implemented"
408 |         except Exception as e:
409 |             return None, f"Compute Error: {e}"
410 |
411 |     def _calculate_numeric_stats(self, column_data: pa.ChunkedArray) -> Dict[str, Any]:
412 |         """Calculates min, max, mean, stddev for numeric columns using _safe_compute."""
413 |         stats: Dict[str, Any] = {}
414 |         min_val, err = self._safe_compute(pc.min, column_data)
415 |         stats["Min"] = min_val if err is None else err
416 |         max_val, err = self._safe_compute(pc.max, column_data)
417 |         stats["Max"] = max_val if err is None else err
418 |         mean_val, err = self._safe_compute(pc.mean, column_data)
419 |         stats["Mean"] = f"{mean_val:.4f}" if mean_val is not None and err is None else (err or "N/A")
420 |         stddev_val, err = self._safe_compute(pc.stddev, column_data, ddof=1)
421 |         stats["StdDev"] = f"{stddev_val:.4f}" if stddev_val is not None and err is None else (err or "N/A")
422 |         if stats["StdDev"] == "Not Implemented":
423 |             variance_val, err_var = self._safe_compute(pc.variance, column_data, ddof=1)
424 |             stats["Variance"] = f"{variance_val:.4f}" if variance_val is not None and err_var is None else (
425 |                 err_var or "N/A")
426 |         distinct_val, err = self._safe_compute(pc.count_distinct, column_data)
427 |         stats["Distinct Count"] = f"{distinct_val:,}" if distinct_val is not None and err is None else (err or "N/A")
428 |
429 |         # Add histogram data for visualization
430 |         try:
431 |             # Convert to Python list for histogram calculation (sample if too large)
432 |             data_length = len(column_data)
433 |             sample_size = min(10000, data_length)  # Limit to 10k samples for performance
434 |
435 |             if data_length > sample_size:
436 |                 # Sample the data
437 |                 import random
438 |                 indices =
sorted(random.sample(range(data_length), sample_size)) 439 | sampled_data = [column_data[i].as_py() for i in indices] 440 | else: 441 | sampled_data = column_data.to_pylist() 442 | 443 | # Filter out None values 444 | clean_data = [val for val in sampled_data if val is not None] 445 | 446 | if len(clean_data) > 10: # Only create histogram if we have enough data 447 | stats["_histogram_data"] = clean_data 448 | stats["_data_type"] = "numeric" 449 | 450 | except Exception as e: 451 | self.logger.warning(f"Failed to prepare histogram data: {e}") 452 | 453 | return stats 454 | 455 | def _calculate_temporal_stats(self, column_data: pa.ChunkedArray) -> Dict[str, Any]: 456 | """Calculates min and max for temporal columns using _safe_compute.""" 457 | stats: Dict[str, Any] = {} 458 | min_val, err = self._safe_compute(pc.min, column_data) 459 | stats["Min"] = min_val if err is None else err # .as_py() handles conversion 460 | max_val, err = self._safe_compute(pc.max, column_data) 461 | stats["Max"] = max_val if err is None else err 462 | return stats 463 | 464 | def _calculate_string_binary_stats(self, column_data: pa.ChunkedArray) -> Dict[str, Any]: 465 | """Calculates distinct count for string/binary columns.""" 466 | stats: Dict[str, Any] = {} 467 | distinct_val, err = self._safe_compute(pc.count_distinct, column_data) 468 | stats["Distinct Count"] = f"{distinct_val:,}" if distinct_val is not None and err is None else (err or "N/A") 469 | return stats 470 | 471 | def _calculate_boolean_stats(self, column_data: pa.ChunkedArray) -> Dict[str, Any]: 472 | """Calculates value counts (True/False) for boolean columns.""" 473 | stats: Dict[str, Any] = {} 474 | try: 475 | if column_data.null_count == len(column_data): 476 | stats["Value Counts"] = "All NULL" 477 | return stats 478 | 479 | # value_counts returns a StructArray [{values: bool, counts: int64}, ...] 
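            # Illustration (hypothetical counts): for a column holding 812 True and 79 False values,
            # the loop below would produce counts_dict == {True: 812, False: 79}.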
480 | value_counts_struct = pc.value_counts(column_data) 481 | counts_dict = {} 482 | if len(value_counts_struct) > 0: 483 | for i in range(len(value_counts_struct)): 484 | value = value_counts_struct.field("values")[i].as_py() 485 | count = value_counts_struct.field("counts")[i].as_py() 486 | counts_dict[value] = count # Keys are True/False 487 | 488 | stats["Value Counts"] = {str(k): f"{v:,}" for k, v in counts_dict.items()} 489 | # Ensure both True and False are present, even if count is 0 490 | if 'True' not in stats["Value Counts"]: stats["Value Counts"]['True'] = "0" 491 | if 'False' not in stats["Value Counts"]: stats["Value Counts"]['False'] = "0" 492 | 493 | except Exception as vc_e: 494 | self.logger.warning(f"Boolean value count calculation error: {vc_e}", exc_info=True) 495 | stats["Value Counts"] = "Error calculating" 496 | return stats 497 | 498 | def _calculate_dictionary_stats(self, column_data: pa.ChunkedArray, col_type: pa.DictionaryType) -> Dict[str, Any]: 499 | """Calculates stats for dictionary type based on its value type.""" 500 | stats: Dict[str, Any] = {"message": "Stats calculated on dictionary values."} # Start with message 501 | try: 502 | unwrapped_data = column_data.dictionary_decode() 503 | value_type = col_type.value_type 504 | self.logger.debug(f"Calculating dictionary stats based on value type: {value_type}") 505 | 506 | # Delegate calculation based on the *value* type 507 | if pa.types.is_floating(value_type) or pa.types.is_integer(value_type): 508 | stats.update(self._calculate_numeric_stats(unwrapped_data)) 509 | elif pa.types.is_temporal(value_type): 510 | stats.update(self._calculate_temporal_stats(unwrapped_data)) 511 | elif pa.types.is_string(value_type) or pa.types.is_large_string(value_type) \ 512 | or pa.types.is_binary(value_type) or pa.types.is_large_binary(value_type): 513 | stats.update(self._calculate_string_binary_stats(unwrapped_data)) 514 | # Add other dictionary value types if necessary (boolean, etc.) 515 | else: 516 | stats[ 517 | "message"] += f" (Stats for value type '{self._format_pyarrow_type(value_type)}' not fully implemented)." 
518 | # Calculate distinct count on the original dictionary array (can be faster) 519 | distinct_val, err = self._safe_compute(pc.count_distinct, column_data) 520 | stats[ 521 | "Distinct Values (Approx)"] = f"{distinct_val:,}" if distinct_val is not None and err is None else ( 522 | err or "N/A") 523 | 524 | except pa.lib.ArrowException as arrow_decode_err: 525 | self.logger.warning(f"Arrow error decoding dictionary type for stats: {arrow_decode_err}") 526 | stats["Dictionary Error"] = f"Decode Error: {arrow_decode_err}" 527 | except Exception as dict_e: 528 | self.logger.warning(f"Could not process dictionary type for stats: {dict_e}") 529 | stats["Dictionary Error"] = f"Processing Error: {dict_e}" 530 | return stats 531 | 532 | def _calculate_complex_type_stats(self, column_data: pa.ChunkedArray, col_type: pa.DataType) -> Dict[str, Any]: 533 | """Calculates basic stats (like distinct count) for complex types.""" 534 | stats: Dict[str, Any] = {} 535 | # Distinct count is often the most feasible stat for complex types 536 | distinct_val, err = self._safe_compute(pc.count_distinct, column_data) 537 | # Note: Distinct count on complex types can be approximate or may error depending on type 538 | stats["Distinct Count (Approx)"] = f"{distinct_val:,}" if distinct_val is not None and err is None else ( 539 | err or "N/A") 540 | return stats 541 | 542 | def _get_stats_from_metadata(self, column_name: str) -> Tuple[Dict[str, Any], Optional[str]]: 543 | """Retrieves statistics stored within the Parquet file metadata per row group.""" 544 | metadata_stats: Dict[str, Any] = {} 545 | error_str: Optional[str] = None 546 | 547 | if not self.metadata or not self.schema: 548 | return {}, "Metadata or Schema not available" 549 | 550 | try: 551 | col_index = self.schema.get_field_index(column_name) 552 | 553 | for i in range(self.metadata.num_row_groups): 554 | group_key = f"RG {i}" 555 | try: 556 | rg_meta = self.metadata.row_group(i) 557 | metadata_stats[group_key] = self._extract_stats_for_single_group(rg_meta, col_index) 558 | except IndexError: 559 | self.logger.warning(f"Column index {col_index} out of bounds for row group {i}.") 560 | metadata_stats[group_key] = "Index Error" 561 | except Exception as e: 562 | self.logger.warning(f"Error processing metadata stats for RG {i}, column '{column_name}': {e}") 563 | metadata_stats[group_key] = f"Read Error: {e}" 564 | 565 | except KeyError: 566 | self.logger.warning(f"Column '{column_name}' not found in schema for metadata stats.") 567 | error_str = f"Column '{column_name}' not found in schema" 568 | except Exception as e: 569 | self.logger.exception(f"Failed to get metadata statistics structure for column '{column_name}'.") 570 | error_str = f"Error accessing metadata structure: {e}" 571 | 572 | return metadata_stats, error_str 573 | 574 | def _extract_stats_for_single_group(self, rg_meta: pq.RowGroupMetaData, col_index: int) -> Union[ 575 | str, Dict[str, Any]]: 576 | """Extracts stats from a column chunk's metadata within a row group.""" 577 | try: 578 | col_chunk_meta = rg_meta.column(col_index) 579 | stats = col_chunk_meta.statistics 580 | if not stats: return "No stats in metadata" 581 | 582 | def _format_stat(value, is_present, is_numeric=True): 583 | if not is_present: return "N/A" 584 | try: 585 | # Attempt to format nicely, fallback to repr for safety 586 | return f"{value:,}" if is_numeric else str(value) 587 | except Exception: 588 | return repr(value) 589 | 590 | return { 591 | "min": _format_stat(stats.min, stats.has_min_max, 
is_numeric=False), 592 | "max": _format_stat(stats.max, stats.has_min_max, is_numeric=False), 593 | "nulls": _format_stat(stats.null_count, stats.has_null_count), 594 | "distinct": _format_stat(stats.distinct_count, stats.has_distinct_count), 595 | "size_comp": _format_stat(col_chunk_meta.total_compressed_size, 596 | col_chunk_meta.total_compressed_size is not None), 597 | "size_uncomp": _format_stat(col_chunk_meta.total_uncompressed_size, 598 | col_chunk_meta.total_uncompressed_size is not None), 599 | } 600 | except IndexError: 601 | self.logger.warning(f"Column index {col_index} out of bounds for row group {rg_meta.num_columns} columns.") 602 | return "Index Error" 603 | except Exception as e: 604 | self.logger.error(f"Error reading column chunk metadata stats for index {col_index}: {e}", exc_info=True) 605 | return f"Metadata Read Error: {e}" 606 | 607 | def _create_stats_result( 608 | self, 609 | column_name: str, 610 | field: Optional[pa.Field], 611 | calculated_stats: Optional[Dict] = None, 612 | metadata_stats: Optional[Dict] = None, 613 | metadata_stats_error: Optional[str] = None, 614 | calculation_error: Optional[str] = None, 615 | message: Optional[str] = None 616 | ) -> Dict[str, Any]: 617 | """Consistently packages the results of column statistics calculation.""" 618 | calculated_stats_dict = calculated_stats if calculated_stats is not None else {} 619 | 620 | col_type_str = "Unknown" 621 | col_nullable = None 622 | if field: 623 | try: 624 | col_type_str = self._format_pyarrow_type(field.type) 625 | col_nullable = field.nullable 626 | except Exception as e: 627 | self.logger.error(f"Error formatting type for column {column_name}: {e}") 628 | col_type_str = f"[Error formatting: {field.type}]" 629 | col_nullable = None 630 | 631 | return { 632 | "column": column_name, 633 | "type": col_type_str, 634 | "nullable": col_nullable, 635 | "calculated": calculated_stats_dict, 636 | "basic_metadata_stats": metadata_stats, 637 | "metadata_stats_error": metadata_stats_error, 638 | "error": calculation_error, 639 | "message": message 640 | } 641 | 642 | def _format_size(self, num_bytes: int) -> str: 643 | """Formats bytes into a human-readable string (KB, MB, GB).""" 644 | if num_bytes < 1024: 645 | return f"{num_bytes} Bytes" 646 | elif num_bytes < 1024 ** 2: 647 | return f"{num_bytes / 1024:.2f} KB" 648 | elif num_bytes < 1024 ** 3: 649 | return f"{num_bytes / 1024 ** 2:.2f} MB" 650 | else: 651 | return f"{num_bytes / 1024 ** 3:.2f} GB" 652 | --------------------------------------------------------------------------------
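Usage sketch (illustrative, not part of the repository sources): the handlers above share the DataHandler interface, so a caller can open a file, inspect its schema and metadata, preview rows, and compute per-column statistics in the same way for Parquet, JSON/NDJSON, and CSV inputs. A minimal example against the bundled sample file, assuming the parqv package is installed and the code runs from the repository root; the column name "Age" is an assumption about the sample data:

from pathlib import Path

from parqv.data_sources.formats.parquet import ParquetHandler

# The handler is a context manager, so the underlying file is released automatically.
with ParquetHandler(Path("src/sample/parquet/titanic.parquet")) as handler:
    print(handler.get_metadata_summary())            # file-level metadata (rows, row groups, size, ...)
    for column in handler.get_schema_data() or []:    # [{"name": ..., "type": ..., "nullable": ...}, ...]
        print(column["name"], column["type"])
    preview = handler.get_data_preview(num_rows=10)   # pandas DataFrame with the first rows
    stats = handler.get_column_stats("Age")           # dict with "calculated", "basic_metadata_stats", ...
    print(stats["calculated"])

Swapping ParquetHandler for JsonHandler or CsvHandler (and the file path accordingly) is enough to inspect .json/.ndjson or .csv files, since all handlers expose the same methods.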