├── src ├── __init__.py ├── sample │ └── parquet │ │ └── titanic.parquet └── parqv │ ├── views │ ├── components │ │ ├── __init__.py │ │ ├── loading_display.py │ │ ├── error_display.py │ │ └── enhanced_data_table.py │ ├── utils │ │ ├── __init__.py │ │ ├── data_formatters.py │ │ ├── visualization.py │ │ └── stats_formatters.py │ ├── __init__.py │ ├── metadata_view.py │ ├── base.py │ ├── data_view.py │ └── schema_view.py │ ├── data_sources │ ├── formats │ │ ├── __init__.py │ │ ├── csv.py │ │ ├── json.py │ │ └── parquet.py │ ├── base │ │ ├── __init__.py │ │ ├── exceptions.py │ │ └── handler.py │ └── __init__.py │ ├── core │ ├── config.py │ ├── __init__.py │ ├── logging.py │ ├── file_utils.py │ └── handler_factory.py │ ├── __init__.py │ ├── parqv.css │ ├── cli.py │ └── app.py ├── assets └── parqv.gif ├── .gitignore ├── pyproject.toml ├── README.md └── LICENSE /src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /assets/parqv.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sanspareilsmyn/parqv/HEAD/assets/parqv.gif -------------------------------------------------------------------------------- /src/sample/parquet/titanic.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sanspareilsmyn/parqv/HEAD/src/sample/parquet/titanic.parquet -------------------------------------------------------------------------------- /src/parqv/views/components/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Reusable UI components for parqv views. 3 | """ 4 | 5 | from .error_display import ErrorDisplay 6 | from .loading_display import LoadingDisplay 7 | from .enhanced_data_table import EnhancedDataTable 8 | 9 | __all__ = [ 10 | "ErrorDisplay", 11 | "LoadingDisplay", 12 | "EnhancedDataTable", 13 | ] -------------------------------------------------------------------------------- /src/parqv/data_sources/formats/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Format-specific data handlers for parqv. 3 | """ 4 | 5 | from .parquet import ParquetHandler, ParquetHandlerError 6 | from .json import JsonHandler, JsonHandlerError 7 | from .csv import CsvHandler, CsvHandlerError 8 | 9 | __all__ = [ 10 | # Parquet format 11 | "ParquetHandler", 12 | "ParquetHandlerError", 13 | 14 | # JSON format 15 | "JsonHandler", 16 | "JsonHandlerError", 17 | 18 | # CSV format 19 | "CsvHandler", 20 | "CsvHandlerError", 21 | ] -------------------------------------------------------------------------------- /src/parqv/views/utils/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utility functions for parqv views. 
3 | """ 4 | 5 | from .data_formatters import format_metadata_for_display, format_value_for_display 6 | from .stats_formatters import format_stats_for_display, format_column_info 7 | from .visualization import create_text_histogram, should_show_histogram 8 | 9 | __all__ = [ 10 | # Data formatting 11 | "format_metadata_for_display", 12 | "format_value_for_display", 13 | "format_stats_for_display", 14 | "format_column_info", 15 | 16 | # Visualization 17 | "create_text_histogram", 18 | "should_show_histogram", 19 | ] 20 | -------------------------------------------------------------------------------- /src/parqv/data_sources/base/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Base classes and interfaces for data sources. 3 | """ 4 | 5 | from .handler import DataHandler 6 | from .exceptions import ( 7 | DataSourceError, 8 | DataHandlerError, 9 | FileValidationError, 10 | UnsupportedFormatError, 11 | DataReadError, 12 | SchemaError, 13 | MetadataError, 14 | ) 15 | 16 | __all__ = [ 17 | # Base handler interface 18 | "DataHandler", 19 | 20 | # Exception classes 21 | "DataSourceError", 22 | "DataHandlerError", 23 | "FileValidationError", 24 | "UnsupportedFormatError", 25 | "DataReadError", 26 | "SchemaError", 27 | "MetadataError", 28 | ] -------------------------------------------------------------------------------- /src/parqv/core/config.py: -------------------------------------------------------------------------------- 1 | """ 2 | Configuration constants and settings for parqv application. 3 | """ 4 | 5 | from typing import Dict, Type, List 6 | from pathlib import Path 7 | 8 | # File extensions and their corresponding handler types 9 | SUPPORTED_EXTENSIONS: Dict[str, str] = { 10 | ".parquet": "parquet", 11 | ".json": "json", 12 | ".ndjson": "json", 13 | ".csv": "csv" 14 | } 15 | 16 | # Application constants 17 | LOG_FILENAME = "parqv.log" 18 | LOG_MAX_BYTES = 1024 * 1024 * 5 # 5MB 19 | LOG_BACKUP_COUNT = 3 20 | LOG_ENCODING = "utf-8" 21 | 22 | # UI Constants 23 | DEFAULT_PREVIEW_ROWS = 50 24 | 25 | # CSS Path (relative to the app module) 26 | CSS_PATH = "parqv.css" -------------------------------------------------------------------------------- /src/parqv/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | parqv - A Textual application for visualizing Parquet and JSON files. 
3 | """ 4 | 5 | from .app import ParqV 6 | from .cli import run_app 7 | from .core import ( 8 | SUPPORTED_EXTENSIONS, 9 | DEFAULT_PREVIEW_ROWS, 10 | FileValidationError, 11 | validate_and_detect_file, 12 | HandlerFactory, 13 | HandlerCreationError, 14 | setup_logging, 15 | get_logger 16 | ) 17 | 18 | __version__ = "1.0.0" 19 | 20 | __all__ = [ 21 | "ParqV", 22 | "run_app", 23 | "SUPPORTED_EXTENSIONS", 24 | "DEFAULT_PREVIEW_ROWS", 25 | "FileValidationError", 26 | "validate_and_detect_file", 27 | "HandlerFactory", 28 | "HandlerCreationError", 29 | "setup_logging", 30 | "get_logger", 31 | ] 32 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | *.spec 30 | 31 | # Installer logs 32 | pip-log.txt 33 | pip-delete-this-directory.txt 34 | 35 | # Unit test / coverage reports 36 | htmlcov/ 37 | .tox/ 38 | .nox/ 39 | .coverage 40 | .coverage.* 41 | .cache 42 | nosetests.xml 43 | coverage.xml 44 | *.cover 45 | *.py,cover 46 | .hypothesis/ 47 | .pytest_cache/ 48 | 49 | # Environments 50 | .env 51 | .venv 52 | env/ 53 | venv/ 54 | ENV/ 55 | env.bak/ 56 | venv.bak/ -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "parqv" 7 | version = "0.3.0" 8 | description = "An interactive Python TUI for visualizing, exploring, and analyzing files directly in your terminal." 9 | readme = "README.md" 10 | requires-python = ">=3.10" 11 | license = "Apache-2.0" 12 | authors = [{ name = "Sangmin Yoon", email = "sanspareilsmyn@gmail.com" }] 13 | 14 | dependencies = [ 15 | "textual>=1.0.0", 16 | "pyarrow>=16.0.0", 17 | "pandas>=2.0.0", 18 | "numpy>=1.20.0", 19 | "duckdb>=1.2.0" 20 | ] 21 | 22 | [project.scripts] 23 | parqv = "parqv.app:run_app" 24 | 25 | [tool.setuptools] 26 | package-dir = {"" = "src"} 27 | 28 | [tool.setuptools.packages.find] 29 | where = ["src"] 30 | 31 | [tool.setuptools.package-data] 32 | "parqv" = ["*.css"] -------------------------------------------------------------------------------- /src/parqv/core/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Core modules for parqv application. 3 | 4 | This package contains fundamental configuration, utilities, and factory classes. 
5 | """ 6 | 7 | from .config import SUPPORTED_EXTENSIONS, DEFAULT_PREVIEW_ROWS, CSS_PATH 8 | from .logging import setup_logging, get_logger 9 | from .file_utils import FileValidationError, validate_and_detect_file, validate_file_path, detect_file_type 10 | from .handler_factory import HandlerFactory, HandlerCreationError 11 | 12 | __all__ = [ 13 | # Configuration 14 | "SUPPORTED_EXTENSIONS", 15 | "DEFAULT_PREVIEW_ROWS", 16 | "CSS_PATH", 17 | 18 | # Logging 19 | "setup_logging", 20 | "get_logger", 21 | 22 | # File utilities 23 | "FileValidationError", 24 | "validate_and_detect_file", 25 | "validate_file_path", 26 | "detect_file_type", 27 | 28 | # Factory 29 | "HandlerFactory", 30 | "HandlerCreationError", 31 | ] -------------------------------------------------------------------------------- /src/parqv/data_sources/base/exceptions.py: -------------------------------------------------------------------------------- 1 | """ 2 | Exception classes for data sources. 3 | """ 4 | 5 | 6 | class DataSourceError(Exception): 7 | """Base exception for all data source errors.""" 8 | pass 9 | 10 | 11 | class DataHandlerError(DataSourceError): 12 | """Base exception for all data handler errors.""" 13 | pass 14 | 15 | 16 | class FileValidationError(DataSourceError): 17 | """Exception raised when file validation fails.""" 18 | pass 19 | 20 | 21 | class UnsupportedFormatError(DataSourceError): 22 | """Exception raised when an unsupported file format is encountered.""" 23 | pass 24 | 25 | 26 | class DataReadError(DataSourceError): 27 | """Exception raised when data reading fails.""" 28 | pass 29 | 30 | 31 | class SchemaError(DataSourceError): 32 | """Exception raised when schema operations fail.""" 33 | pass 34 | 35 | 36 | class MetadataError(DataSourceError): 37 | """Exception raised when metadata operations fail.""" 38 | pass -------------------------------------------------------------------------------- /src/parqv/views/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Views package for parqv application. 3 | 4 | This package contains all UI views and their supporting components and utilities. 5 | """ 6 | 7 | # Main views 8 | from .metadata_view import MetadataView 9 | from .data_view import DataView 10 | from .schema_view import SchemaView 11 | 12 | # Base classes 13 | from .base import BaseView 14 | 15 | # Components (optional, for advanced usage) 16 | from .components import ErrorDisplay, LoadingDisplay, EnhancedDataTable 17 | 18 | # Utilities (optional, for advanced usage) 19 | from .utils import format_metadata_for_display, format_stats_for_display 20 | 21 | __all__ = [ 22 | # Main views - these are the primary exports 23 | "MetadataView", 24 | "DataView", 25 | "SchemaView", 26 | 27 | # Base class - for extending functionality 28 | "BaseView", 29 | 30 | # Components - for custom view development 31 | "ErrorDisplay", 32 | "LoadingDisplay", 33 | "EnhancedDataTable", 34 | 35 | # Utilities - for data formatting 36 | "format_metadata_for_display", 37 | "format_stats_for_display", 38 | ] 39 | -------------------------------------------------------------------------------- /src/parqv/data_sources/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Data sources package for parqv application. 3 | 4 | This package provides adapters for various data file formats, 5 | offering a unified interface for data access. 
6 | """ 7 | 8 | # Base classes and exceptions 9 | from .base import ( 10 | DataHandler, 11 | DataHandlerError, 12 | DataSourceError, 13 | FileValidationError, 14 | UnsupportedFormatError, 15 | DataReadError, 16 | SchemaError, 17 | MetadataError, 18 | ) 19 | 20 | # Format-specific handlers 21 | from .formats import ( 22 | ParquetHandler, 23 | ParquetHandlerError, 24 | JsonHandler, 25 | JsonHandlerError, 26 | CsvHandler, 27 | CsvHandlerError, 28 | ) 29 | 30 | __all__ = [ 31 | # Base interface and exceptions 32 | "DataHandler", 33 | "DataHandlerError", 34 | "DataSourceError", 35 | "FileValidationError", 36 | "UnsupportedFormatError", 37 | "DataReadError", 38 | "SchemaError", 39 | "MetadataError", 40 | 41 | # Format handlers 42 | "ParquetHandler", 43 | "ParquetHandlerError", 44 | "JsonHandler", 45 | "JsonHandlerError", 46 | "CsvHandler", 47 | "CsvHandlerError", 48 | ] -------------------------------------------------------------------------------- /src/parqv/core/logging.py: -------------------------------------------------------------------------------- 1 | """ 2 | Logging configuration for parqv application. 3 | """ 4 | 5 | import logging 6 | import sys 7 | from logging.handlers import RotatingFileHandler 8 | 9 | from .config import LOG_FILENAME, LOG_MAX_BYTES, LOG_BACKUP_COUNT, LOG_ENCODING 10 | 11 | 12 | def setup_logging() -> logging.Logger: 13 | """ 14 | Sets up logging configuration for the parqv application. 15 | 16 | Returns: 17 | The root logger instance configured for parqv. 18 | """ 19 | file_handler = RotatingFileHandler( 20 | LOG_FILENAME, 21 | maxBytes=LOG_MAX_BYTES, 22 | backupCount=LOG_BACKUP_COUNT, 23 | encoding=LOG_ENCODING 24 | ) 25 | 26 | logging.basicConfig( 27 | level=logging.INFO, 28 | format="%(asctime)s [%(levelname)-5.5s] %(name)s (%(filename)s:%(lineno)d) - %(message)s", 29 | handlers=[file_handler, logging.StreamHandler(sys.stdout)], 30 | force=True # Override any existing configuration 31 | ) 32 | 33 | return logging.getLogger(__name__) 34 | 35 | 36 | def get_logger(name: str) -> logging.Logger: 37 | """ 38 | Gets a logger instance for the given name. 39 | 40 | Args: 41 | name: The name for the logger (typically __name__) 42 | 43 | Returns: 44 | A logger instance. 45 | """ 46 | return logging.getLogger(name) -------------------------------------------------------------------------------- /src/parqv/views/components/loading_display.py: -------------------------------------------------------------------------------- 1 | """ 2 | Loading display component for parqv views. 3 | """ 4 | 5 | from textual.containers import Center, Middle 6 | from textual.widgets import LoadingIndicator, Label 7 | 8 | 9 | class LoadingDisplay(Center): 10 | """ 11 | A reusable component for displaying loading states in a consistent format. 12 | """ 13 | 14 | def __init__(self, message: str = "Loading...", **kwargs): 15 | """ 16 | Initialize the loading display. 
17 | 18 | Args: 19 | message: Loading message to display 20 | **kwargs: Additional arguments for Center container 21 | """ 22 | super().__init__(**kwargs) 23 | self.message = message 24 | 25 | def compose(self): 26 | """Compose the loading display layout.""" 27 | with Middle(): 28 | yield LoadingIndicator() 29 | yield Label(self.message, classes="loading-message") 30 | 31 | @classmethod 32 | def data_loading(cls, **kwargs) -> 'LoadingDisplay': 33 | """Create a loading display for data loading operations.""" 34 | return cls(message="Loading data...", **kwargs) 35 | 36 | @classmethod 37 | def metadata_loading(cls, **kwargs) -> 'LoadingDisplay': 38 | """Create a loading display for metadata loading operations.""" 39 | return cls(message="Loading metadata...", **kwargs) 40 | 41 | @classmethod 42 | def schema_loading(cls, **kwargs) -> 'LoadingDisplay': 43 | """Create a loading display for schema loading operations.""" 44 | return cls(message="Loading schema...", **kwargs) -------------------------------------------------------------------------------- /src/parqv/views/metadata_view.py: -------------------------------------------------------------------------------- 1 | """ 2 | Metadata view for displaying file metadata information. 3 | """ 4 | 5 | from textual.containers import VerticalScroll 6 | from textual.widgets import Pretty 7 | 8 | from .base import BaseView 9 | from .components import ErrorDisplay 10 | from .utils import format_metadata_for_display 11 | 12 | 13 | class MetadataView(BaseView): 14 | """ 15 | View for displaying metadata information about the loaded file. 16 | 17 | Shows file statistics, format information, and other metadata 18 | in a formatted display. 19 | """ 20 | 21 | def load_content(self) -> None: 22 | """Load and display metadata content.""" 23 | if not self.check_handler_available(): 24 | return 25 | 26 | try: 27 | # Get raw metadata from handler 28 | raw_metadata = self.handler.get_metadata_summary() 29 | 30 | # Format metadata for display 31 | formatted_metadata = format_metadata_for_display(raw_metadata) 32 | 33 | # Check if there's an error in the formatted data 34 | if "Error" in formatted_metadata and len(formatted_metadata) == 1: 35 | self.show_error(formatted_metadata["Error"]) 36 | return 37 | 38 | # Display the formatted metadata 39 | self._display_metadata(formatted_metadata) 40 | 41 | self.logger.info("Metadata loaded successfully") 42 | 43 | except Exception as e: 44 | self.show_error("Failed to load metadata", e) 45 | 46 | def _display_metadata(self, metadata: dict) -> None: 47 | """ 48 | Display the formatted metadata using Pretty widget. 49 | 50 | Args: 51 | metadata: Formatted metadata dictionary 52 | """ 53 | try: 54 | pretty_widget = Pretty(metadata, id="metadata-pretty") 55 | self.mount(pretty_widget) 56 | except Exception as e: 57 | self.logger.error(f"Failed to create Pretty widget: {e}") 58 | self.show_error("Failed to display metadata") 59 | 60 | def refresh_metadata(self) -> None: 61 | """Refresh the metadata display.""" 62 | self.clear_content() 63 | self.load_content() -------------------------------------------------------------------------------- /src/parqv/core/file_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | File utilities for parqv application. 
3 | """ 4 | 5 | from pathlib import Path 6 | from typing import Optional, Tuple 7 | 8 | from .config import SUPPORTED_EXTENSIONS 9 | from .logging import get_logger 10 | 11 | log = get_logger(__name__) 12 | 13 | 14 | class FileValidationError(Exception): 15 | """Exception raised when file validation fails.""" 16 | pass 17 | 18 | 19 | def validate_file_path(file_path_str: Optional[str]) -> Path: 20 | """ 21 | Validates and resolves the file path. 22 | 23 | Args: 24 | file_path_str: String representation of the file path 25 | 26 | Returns: 27 | Resolved Path object 28 | 29 | Raises: 30 | FileValidationError: If file path is invalid or file doesn't exist 31 | """ 32 | if not file_path_str: 33 | raise FileValidationError("No file path provided.") 34 | 35 | file_path = Path(file_path_str) 36 | log.debug(f"Validating file path: {file_path}") 37 | 38 | if not file_path.is_file(): 39 | raise FileValidationError(f"File not found or is not a regular file: {file_path}") 40 | 41 | return file_path 42 | 43 | 44 | def detect_file_type(file_path: Path) -> str: 45 | """ 46 | Detects the file type based on its extension. 47 | 48 | Args: 49 | file_path: Path object representing the file 50 | 51 | Returns: 52 | String representing the detected file type ('parquet' or 'json') 53 | 54 | Raises: 55 | FileValidationError: If file extension is not supported 56 | """ 57 | file_suffix = file_path.suffix.lower() 58 | 59 | if file_suffix not in SUPPORTED_EXTENSIONS: 60 | supported_exts = ", ".join(SUPPORTED_EXTENSIONS.keys()) 61 | raise FileValidationError( 62 | f"Unsupported file extension: '{file_suffix}'. " 63 | f"Only {supported_exts} are supported." 64 | ) 65 | 66 | detected_type = SUPPORTED_EXTENSIONS[file_suffix] 67 | log.info(f"Detected '{file_suffix}' extension, type: {detected_type}") 68 | 69 | return detected_type 70 | 71 | 72 | def validate_and_detect_file(file_path_str: Optional[str]) -> Tuple[Path, str]: 73 | """ 74 | Convenience function that validates file path and detects file type. 75 | 76 | Args: 77 | file_path_str: String representation of the file path 78 | 79 | Returns: 80 | Tuple of (validated_path, detected_type) 81 | 82 | Raises: 83 | FileValidationError: If validation or type detection fails 84 | """ 85 | file_path = validate_file_path(file_path_str) 86 | file_type = detect_file_type(file_path) 87 | 88 | return file_path, file_type -------------------------------------------------------------------------------- /src/parqv/views/components/error_display.py: -------------------------------------------------------------------------------- 1 | """ 2 | Error display component for parqv views. 3 | """ 4 | 5 | from typing import Optional 6 | 7 | from textual.containers import VerticalScroll 8 | from textual.widgets import Static, Label 9 | 10 | 11 | class ErrorDisplay(VerticalScroll): 12 | """ 13 | A reusable component for displaying error messages in a consistent format. 14 | """ 15 | 16 | def __init__(self, 17 | title: str = "Error", 18 | message: str = "An error occurred", 19 | details: Optional[str] = None, 20 | **kwargs): 21 | """ 22 | Initialize the error display. 
23 | 24 | Args: 25 | title: Error title/category 26 | message: Main error message 27 | details: Optional detailed error information 28 | **kwargs: Additional arguments for VerticalScroll 29 | """ 30 | super().__init__(**kwargs) 31 | self.title = title 32 | self.message = message 33 | self.details = details 34 | 35 | def compose(self): 36 | """Compose the error display layout.""" 37 | yield Label(self.title, classes="error-title") 38 | yield Static(f"[red]{self.message}[/red]", classes="error-content") 39 | 40 | if self.details: 41 | yield Static("Details:", classes="error-details-label") 42 | yield Static(f"[dim]{self.details}[/dim]", classes="error-details") 43 | 44 | @classmethod 45 | def file_not_found(cls, file_path: str, **kwargs) -> 'ErrorDisplay': 46 | """Create an error display for file not found errors.""" 47 | return cls( 48 | title="File Not Found", 49 | message=f"Could not find file: {file_path}", 50 | details="Please check that the file path is correct and the file exists.", 51 | **kwargs 52 | ) 53 | 54 | @classmethod 55 | def handler_not_available(cls, **kwargs) -> 'ErrorDisplay': 56 | """Create an error display for missing data handler.""" 57 | return cls( 58 | title="Data Handler Not Available", 59 | message="No data handler is currently loaded", 60 | details="This usually means the file could not be processed or loaded.", 61 | **kwargs 62 | ) 63 | 64 | @classmethod 65 | def data_loading_error(cls, error_msg: str, **kwargs) -> 'ErrorDisplay': 66 | """Create an error display for data loading errors.""" 67 | return cls( 68 | title="Data Loading Error", 69 | message="Failed to load data from the file", 70 | details=f"Technical details: {error_msg}", 71 | **kwargs 72 | ) -------------------------------------------------------------------------------- /src/parqv/views/base.py: -------------------------------------------------------------------------------- 1 | """ 2 | Base classes for parqv views. 3 | """ 4 | 5 | from typing import Optional 6 | 7 | from textual.containers import Container 8 | from textual.widgets import Static 9 | 10 | from ..core import get_logger 11 | from ..data_sources import DataHandler 12 | 13 | 14 | class BaseView(Container): 15 | """ 16 | Base class for all parqv views. 17 | 18 | Provides common functionality for data loading, error handling, 19 | and handler access. 20 | """ 21 | 22 | def __init__(self, **kwargs): 23 | super().__init__(**kwargs) 24 | self._is_mounted = False 25 | 26 | @property 27 | def logger(self): 28 | """Get a logger for this view.""" 29 | return get_logger(f"{self.__class__.__module__}.{self.__class__.__name__}") 30 | 31 | @property 32 | def handler(self) -> Optional[DataHandler]: 33 | """Get the data handler from the app.""" 34 | if hasattr(self.app, 'handler'): 35 | return self.app.handler 36 | return None 37 | 38 | def on_mount(self) -> None: 39 | """Called when the view is mounted.""" 40 | self._is_mounted = True 41 | self.load_content() 42 | 43 | def load_content(self) -> None: 44 | """ 45 | Load the main content for this view. Must be implemented by subclasses. 
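A typical implementation calls check_handler_available(), pulls data from self.handler, and mounts widgets (see MetadataView.load_content for a concrete example).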
46 | 47 | Raises: 48 | NotImplementedError: If not implemented by subclass 49 | """ 50 | raise NotImplementedError("Subclasses must implement load_content()") 51 | 52 | def clear_content(self) -> None: 53 | """Clear all content from the view.""" 54 | try: 55 | self.query("*").remove() 56 | except Exception as e: 57 | self.logger.error(f"Error clearing content: {e}") 58 | 59 | def show_error(self, message: str, exception: Optional[Exception] = None) -> None: 60 | """ 61 | Display an error message in the view. 62 | 63 | Args: 64 | message: Error message to display 65 | exception: Optional exception that caused the error 66 | """ 67 | if exception: 68 | self.logger.exception(f"Error in {self.__class__.__name__}: {message}") 69 | else: 70 | self.logger.error(f"Error in {self.__class__.__name__}: {message}") 71 | 72 | self.clear_content() 73 | error_widget = Static(f"[red]Error: {message}[/red]", classes="error-content") 74 | self.mount(error_widget) 75 | 76 | def show_info(self, message: str) -> None: 77 | """ 78 | Display an informational message in the view. 79 | 80 | Args: 81 | message: Info message to display 82 | """ 83 | self.logger.info(f"Info in {self.__class__.__name__}: {message}") 84 | self.clear_content() 85 | info_widget = Static(f"[blue]Info: {message}[/blue]", classes="info-content") 86 | self.mount(info_widget) 87 | 88 | def check_handler_available(self) -> bool: 89 | """ 90 | Check if handler is available and show error if not. 91 | 92 | Returns: 93 | True if handler is available, False otherwise 94 | """ 95 | if not self.handler: 96 | self.show_error("Data handler not available") 97 | return False 98 | return True 99 | -------------------------------------------------------------------------------- /src/parqv/core/handler_factory.py: -------------------------------------------------------------------------------- 1 | """ 2 | Handler factory for creating appropriate data handlers based on file type. 3 | """ 4 | 5 | from pathlib import Path 6 | from typing import Optional 7 | 8 | from ..data_sources import DataHandler, DataHandlerError, ParquetHandler, JsonHandler, CsvHandler 9 | from .logging import get_logger 10 | 11 | log = get_logger(__name__) 12 | 13 | 14 | class HandlerCreationError(Exception): 15 | """Exception raised when handler creation fails.""" 16 | pass 17 | 18 | 19 | class HandlerFactory: 20 | """Factory class for creating data handlers.""" 21 | 22 | # Registry of handler types to handler classes 23 | _HANDLER_REGISTRY = { 24 | "parquet": ParquetHandler, 25 | "json": JsonHandler, 26 | "csv": CsvHandler, 27 | } 28 | 29 | @classmethod 30 | def create_handler(cls, file_path: Path, handler_type: str) -> DataHandler: 31 | """ 32 | Creates an appropriate handler for the given file type. 33 | 34 | Args: 35 | file_path: Path to the data file 36 | handler_type: Type of handler to create ('parquet' or 'json') 37 | 38 | Returns: 39 | An instance of the appropriate DataHandler subclass 40 | 41 | Raises: 42 | HandlerCreationError: If handler creation fails 43 | """ 44 | if handler_type not in cls._HANDLER_REGISTRY: 45 | available_types = ", ".join(cls._HANDLER_REGISTRY.keys()) 46 | raise HandlerCreationError( 47 | f"Unknown handler type: '{handler_type}'. 
" 48 | f"Available types: {available_types}" 49 | ) 50 | 51 | handler_class = cls._HANDLER_REGISTRY[handler_type] 52 | 53 | log.info(f"Creating {handler_type.capitalize()} handler for: {file_path}") 54 | 55 | try: 56 | handler = handler_class(file_path) 57 | log.info(f"{handler_type.capitalize()} handler created successfully.") 58 | return handler 59 | 60 | except DataHandlerError as e: 61 | log.error(f"Failed to create {handler_type} handler: {e}") 62 | raise HandlerCreationError(f"Failed to initialize {handler_type} handler: {e}") from e 63 | 64 | except Exception as e: 65 | log.exception(f"Unexpected error creating {handler_type} handler") 66 | raise HandlerCreationError( 67 | f"Unexpected error during {handler_type} handler creation: {e}" 68 | ) from e 69 | 70 | @classmethod 71 | def get_supported_types(cls) -> list[str]: 72 | """ 73 | Returns a list of supported handler types. 74 | 75 | Returns: 76 | List of supported handler type strings 77 | """ 78 | return list(cls._HANDLER_REGISTRY.keys()) 79 | 80 | @classmethod 81 | def register_handler(cls, handler_type: str, handler_class: type[DataHandler]) -> None: 82 | """ 83 | Registers a new handler type (for extensibility). 84 | 85 | Args: 86 | handler_type: String identifier for the handler type 87 | handler_class: Class that implements DataHandler interface 88 | """ 89 | log.info(f"Registering handler type '{handler_type}' with class {handler_class.__name__}") 90 | cls._HANDLER_REGISTRY[handler_type] = handler_class -------------------------------------------------------------------------------- /src/parqv/parqv.css: -------------------------------------------------------------------------------- 1 | /* --- Base Screen Styles --- */ 2 | Screen { 3 | background: $surface; 4 | color: $text; 5 | } 6 | 7 | /* --- Header & Footer Styles --- */ 8 | Header { 9 | background: $primary; 10 | } 11 | Footer { 12 | background: $primary-darken-1; 13 | } 14 | Footer > .footer--key { 15 | color: $text-muted; 16 | } 17 | Footer > .footer--highlight-key { 18 | background: $accent-darken-1; 19 | color: $text; 20 | text-style: bold; 21 | } 22 | 23 | /* --- Tabbed Interface Styles --- */ 24 | TabbedContent { 25 | height: 100%; 26 | } 27 | TabbedContent > Tabs { 28 | background: $primary-darken-1; 29 | color: $text-muted; 30 | } 31 | TabbedContent > Tabs > Tab { 32 | padding: 1 2; 33 | } 34 | TabbedContent > Tabs > Tab:hover { 35 | background: $primary; 36 | } 37 | TabbedContent > Tabs > .--current { 38 | background: $accent; 39 | color: $text; 40 | text-style: bold; 41 | } 42 | TabbedContent > Content { 43 | padding: 1 2; 44 | height: 1fr; 45 | width: 100%; 46 | overflow: hidden; 47 | } 48 | TabbedContent > Content > * { 49 | height: 100%; 50 | width: 100%; 51 | } 52 | 53 | /* --- Schema Tab (#schema-view - VerticalScroll) --- */ 54 | #schema-view { 55 | padding: 0; 56 | } 57 | #schema-view > ListView#column-list-view { 58 | border: round $accent-lighten-2; 59 | margin-bottom: 1; 60 | background: $primary-background; 61 | overflow: auto; 62 | } 63 | #schema-view > ListView#column-list-view > ListItem { 64 | padding: 0 1; 65 | height: auto; 66 | } 67 | #schema-view > ListView#column-list-view > ListItem.--highlight { 68 | background: $accent; 69 | color: $text; 70 | } 71 | #schema-view > ListView#column-list-view > ListItem.--highlight Label { 72 | color: $text; 73 | } 74 | #schema-view > LoadingIndicator#schema-loading-indicator { 75 | margin: 1 0; 76 | width: 100%; 77 | text-align: center; 78 | } 79 | #schema-view > Container#schema-stats-content { 80 | 
padding: 1; 81 | overflow: auto; 82 | } 83 | #schema-view .stats-line { 84 | margin-bottom: 0; 85 | height: auto; 86 | width: 100%; 87 | } 88 | #schema-view .stats-code { 89 | background: $panel-darken-1; 90 | border: solid $accent-lighten-1; 91 | padding: 0 1; 92 | margin: 1 0; 93 | width: 100%; 94 | height: auto; 95 | overflow: auto; 96 | } 97 | #schema-view .stats-error { 98 | color: $error; 99 | } 100 | 101 | /* --- Metadata Tab (#metadata-view - VerticalScroll) --- */ 102 | #metadata-view { 103 | overflow-y: auto; 104 | } 105 | #metadata-view > Pretty { 106 | width: 100%; 107 | } 108 | 109 | /* --- Data Preview Tab (#data-view - Container) --- */ 110 | #data-view { 111 | } 112 | #data-view > DataTable { 113 | height: 100%; 114 | width: 100%; 115 | } 116 | 117 | /* --- Row Groups Tab (#rowgroup-view - VerticalScroll) --- */ 118 | #rowgroup-view { 119 | overflow-y: auto; 120 | } 121 | #rowgroup-view > DataTable { 122 | width: 100%; 123 | } 124 | 125 | /* --- General Widget Styles --- */ 126 | DataTable { 127 | margin-top: 1; 128 | } 129 | DataTable > Header { 130 | background: $secondary; 131 | color: $text; 132 | text-style: bold; 133 | } 134 | DataTable > Body > Row.--cursor { 135 | background: $accent; 136 | color: $text; 137 | } 138 | DataTable > Body > Row:hover { 139 | background: $secondary-darken-2; 140 | } 141 | 142 | /* --- Error Message Styles (Used in App Level Error) --- */ 143 | .error-title { 144 | color: $error; 145 | text-style: bold; 146 | margin-bottom: 1; 147 | } 148 | .error-content { 149 | color: $error; 150 | } -------------------------------------------------------------------------------- /src/parqv/cli.py: -------------------------------------------------------------------------------- 1 | """ 2 | Command Line Interface for parqv application. 3 | """ 4 | 5 | import sys 6 | 7 | from .app import ParqV 8 | from .core import SUPPORTED_EXTENSIONS, FileValidationError, validate_and_detect_file, setup_logging, get_logger 9 | 10 | 11 | def _print_user_message(message: str, log_level: str = "info") -> None: 12 | """ 13 | Show a message to the user and log it. 14 | 15 | Args: 16 | message: message to display and log 17 | log_level: log level ('info', 'error', 'warning') 18 | """ 19 | log = get_logger(__name__) 20 | 21 | print(message, file=sys.stderr) 22 | 23 | if log_level == "error": 24 | log.error(message) 25 | elif log_level == "warning": 26 | log.warning(message) 27 | else: 28 | log.info(message) 29 | 30 | 31 | def validate_cli_arguments() -> str: 32 | """ 33 | Validates command line arguments. 34 | 35 | Returns: 36 | The file path string from command line arguments 37 | 38 | Raises: 39 | SystemExit: If arguments are invalid 40 | """ 41 | log = get_logger(__name__) 42 | 43 | if len(sys.argv) < 2: 44 | usage_message = "Usage: parqv " 45 | supported_message = f"Supported file types: {', '.join(SUPPORTED_EXTENSIONS.keys())}" 46 | 47 | _print_user_message(usage_message, "error") 48 | _print_user_message(supported_message, "info") 49 | 50 | log.error("No file path provided via CLI arguments") 51 | sys.exit(1) 52 | 53 | file_path_str = sys.argv[1] 54 | log.debug(f"File path received from CLI: {file_path_str}") 55 | return file_path_str 56 | 57 | 58 | def run_app() -> None: 59 | """ 60 | Main entry point for the parqv CLI application. 61 | 62 | This function: 63 | 1. Sets up logging 64 | 2. Validates command line arguments 65 | 3. Validates the file path and type 66 | 4. 
Creates and runs the Textual app 67 | """ 68 | # Setup logging first 69 | log = setup_logging() 70 | log.info("--- parqv CLI started ---") 71 | 72 | try: 73 | # Get and validate CLI arguments 74 | file_path_str = validate_cli_arguments() 75 | 76 | # Validate file path and detect type (for early validation) 77 | file_path, file_type = validate_and_detect_file(file_path_str) 78 | log.info(f"File validated successfully: {file_path} (type: {file_type})") 79 | 80 | # Create and run the app 81 | log.info("Starting parqv application...") 82 | app = ParqV(file_path_str=file_path_str) 83 | app.run() 84 | 85 | log.info("parqv application finished successfully") 86 | 87 | except FileValidationError as e: 88 | log.error(f"File validation failed: {e}") 89 | 90 | error_message = f"Error: {e}" 91 | help_message = f"Please provide a file with one of these extensions: {', '.join(SUPPORTED_EXTENSIONS.keys())}" 92 | 93 | _print_user_message(error_message, "error") 94 | _print_user_message(help_message, "info") 95 | 96 | log.error("Exiting due to file validation error") 97 | sys.exit(1) 98 | 99 | except KeyboardInterrupt: 100 | log.info("Application interrupted by user (Ctrl+C)") 101 | _print_user_message("\nApplication interrupted by user.", "info") 102 | sys.exit(0) 103 | 104 | except Exception as e: 105 | log.exception(f"Unexpected error in CLI: {e}") 106 | _print_user_message(f"An unexpected error occurred: {e}", "error") 107 | _print_user_message("Check the log file for more details.", "info") 108 | sys.exit(1) 109 | 110 | 111 | if __name__ == "__main__": 112 | run_app() 113 | -------------------------------------------------------------------------------- /src/parqv/views/data_view.py: -------------------------------------------------------------------------------- 1 | """ 2 | Data view for displaying tabular data preview. 3 | """ 4 | 5 | from typing import Optional 6 | 7 | import pandas as pd 8 | from textual.app import ComposeResult 9 | 10 | from .base import BaseView 11 | from .components import EnhancedDataTable 12 | from ..core import DEFAULT_PREVIEW_ROWS 13 | 14 | 15 | class DataView(BaseView): 16 | """ 17 | View for displaying a preview of the data in tabular format. 18 | 19 | Shows the first N rows of data in an interactive table format 20 | with proper error handling and loading states. 21 | """ 22 | 23 | def __init__(self, preview_rows: int = DEFAULT_PREVIEW_ROWS, **kwargs): 24 | """ 25 | Initialize the data view. 
26 | 27 | Args: 28 | preview_rows: Number of rows to show in preview 29 | **kwargs: Additional arguments for BaseView 30 | """ 31 | super().__init__(**kwargs) 32 | self.preview_rows = preview_rows 33 | self._data_table: Optional[EnhancedDataTable] = None 34 | 35 | def compose(self) -> ComposeResult: 36 | """Compose the data view layout.""" 37 | self._data_table = EnhancedDataTable(id="data-preview-table") 38 | yield self._data_table 39 | 40 | def load_content(self) -> None: 41 | """Load and display data content.""" 42 | if not self.check_handler_available(): 43 | return 44 | 45 | if not self._data_table: 46 | self.show_error("Data table component not initialized") 47 | return 48 | 49 | try: 50 | # Get data preview from handler 51 | self.logger.info(f"Loading data preview ({self.preview_rows} rows)") 52 | df = self.handler.get_data_preview(num_rows=self.preview_rows) 53 | 54 | # Validate DataFrame 55 | if df is None: 56 | self.show_error("Could not load data preview - handler returned None") 57 | return 58 | 59 | # Handle error DataFrame (some handlers return error as DataFrame) 60 | if self._is_error_dataframe(df): 61 | error_msg = self._extract_error_from_dataframe(df) 62 | self.show_error(error_msg) 63 | return 64 | 65 | # Load DataFrame into table 66 | success = self._data_table.load_dataframe(df, max_rows=self.preview_rows) 67 | 68 | if success: 69 | self.logger.info(f"Data preview loaded successfully: {len(df)} rows") 70 | else: 71 | self.show_error("Failed to load data into table component") 72 | 73 | except Exception as e: 74 | self.show_error("Failed to load data preview", e) 75 | 76 | def _is_error_dataframe(self, df: pd.DataFrame) -> bool: 77 | """ 78 | Check if the DataFrame represents an error condition. 79 | 80 | Args: 81 | df: DataFrame to check 82 | 83 | Returns: 84 | True if the DataFrame contains error information 85 | """ 86 | return ( 87 | not df.empty and 88 | "error" in df.columns and 89 | len(df.columns) == 1 90 | ) 91 | 92 | def _extract_error_from_dataframe(self, df: pd.DataFrame) -> str: 93 | """ 94 | Extract error message from an error DataFrame. 95 | 96 | Args: 97 | df: Error DataFrame 98 | 99 | Returns: 100 | Error message string 101 | """ 102 | try: 103 | if not df.empty and "error" in df.columns: 104 | return str(df["error"].iloc[0]) 105 | except Exception: 106 | pass 107 | return "Unknown error in data loading" 108 | 109 | def refresh_data(self) -> None: 110 | """Refresh the data display.""" 111 | self.clear_content() 112 | self.load_content() 113 | 114 | def set_preview_rows(self, new_rows: int) -> None: 115 | """ 116 | Update the number of preview rows and refresh display. 117 | 118 | Args: 119 | new_rows: New number of rows to preview 120 | """ 121 | if new_rows > 0: 122 | self.preview_rows = new_rows 123 | self.refresh_data() 124 | else: 125 | self.logger.warning(f"Invalid preview_rows value: {new_rows}") 126 | 127 | def get_current_data(self) -> Optional[pd.DataFrame]: 128 | """ 129 | Get the currently displayed data if available. 
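Note: this re-queries the handler for a fresh preview rather than returning a cached copy of what is currently on screen.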
130 | 131 | Returns: 132 | Currently loaded DataFrame or None 133 | """ 134 | if not self.handler: 135 | return None 136 | 137 | try: 138 | return self.handler.get_data_preview(num_rows=self.preview_rows) 139 | except Exception as e: 140 | self.logger.error(f"Failed to get current data: {e}") 141 | return None 142 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # parqv 2 | 3 | [![Python Version](https://img.shields.io/badge/Python-3.10+-blue.svg)](https://www.python.org/) 4 | [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](LICENSE) 5 | [![PyPI version](https://badge.fury.io/py/parqv.svg)](https://badge.fury.io/py/parqv) 6 | [![Built with Textual](https://img.shields.io/badge/Built%20with-Textual-blueviolet.svg)](https://textual.textualize.io/) 7 | 8 | --- 9 | 10 | **Supported File Formats:** ✅ **Parquet** | ✅ **JSON** / **JSON Lines (ndjson)** | ✅ **CSV / TSV** | *(More planned!)* 11 | 12 | --- 13 | 14 | **`parqv` is a Python-based interactive TUI (Text User Interface) tool designed to explore, analyze, and understand various data file formats directly within your terminal.** `parqv` aims to provide a unified, visual experience for quick data inspection without leaving your console. 15 | 16 | ## 💻 Demo 17 | ![parqv.gif](assets/parqv.gif) 18 | *(Demo shows Parquet features; UI adapts for other formats)* 19 | 20 | ## 🤔 Why `parqv`? 21 | 1. **Unified Interface:** Launch `parqv ` to access **metadata, schema, data preview, and column statistics** all within a single, navigable terminal window. No more juggling different commands for different file types. 22 | 2. **Interactive Exploration:** 23 | * **🖱️ Keyboard & Mouse Driven:** Navigate using familiar keys (arrows, `hjkl`, Tab) or even your mouse (thanks to `Textual`). 24 | * **📜 Scrollable Views:** Easily scroll through large schemas, data tables, or column lists. 25 | * **🌲 Clear Schema View:** Understand column names, data types, and nullability at a glance. (Complex nested structures visualization might vary by format). 26 | * **📊 Dynamic Stats:** Select a column and instantly see its detailed statistics (counts, nulls, min/max, mean, distinct values, etc.). 27 | 3. **Cross-Format Consistency:** 28 | * **🎨 Rich Display:** Leverages `rich` and `Textual` for colorful, readable tables and text across supported formats. 29 | * **📈 Quick Stats:** Get key statistical insights consistently, regardless of the underlying file type. 30 | * **🔌 Extensible:** Designed with a handler interface to easily add support for more file formats in the future (like CSV, Arrow IPC, etc.). 31 | 32 | ## ✨ Features (TUI Mode) 33 | * **Multi-Format Support:** Now supports **Parquet** (`.parquet`), **JSON/JSON Lines** (`.json`, `.ndjson`), and **CSV/TSV** (`.csv`, `.tsv`). Run `parqv `. 34 | * **Metadata Panel:** Displays key file information (path, format, size, total rows, column count, etc.). *Fields may vary slightly depending on the file format.* 35 | * **Schema Explorer:** 36 | * Interactive list view of columns. 37 | * Clearly shows column names, data types, and nullability. 38 | * **Data Table Viewer:** 39 | * Scrollable table preview of the file's data. 40 | * Attempts to preserve data types for better representation. 41 | * **Column Statistics Viewer:** 42 | * Select a column in the Schema tab to view detailed statistics. 
43 | * Shows counts (total, valid, null), percentages, and type-specific stats (min/max, mean, stddev, distinct counts, length stats, boolean value counts where applicable). 44 | * **Row Group Inspector (Parquet Specific):** 45 | * *This panel only appears when viewing Parquet files.* 46 | * Lists row groups with stats (row count, compressed/uncompressed size). 47 | * (Planned) Select a row group for more details. 48 | 49 | ## 🚀 Getting Started 50 | 51 | **1. Prerequisites:** 52 | * **Python:** Version 3.10 or higher. 53 | * **pip:** The Python package installer. 54 | 55 | **2. Install `parqv`:** 56 | * Open your terminal and run: 57 | ```bash 58 | pip install parqv 59 | ``` 60 | *(This will also install dependencies like `textual`, `pyarrow`, `pandas`, and `duckdb`)* 61 | * **Updating `parqv`:** 62 | ```bash 63 | pip install --upgrade parqv 64 | ``` 65 | 66 | **3. Run `parqv`:** 67 | * Point `parqv` to your data file: 68 | ```bash 69 | #parquet 70 | parqv /path/to/your/data.parquet 71 | 72 | # json 73 | parqv /path/to/your/data.json 74 | * The interactive TUI will launch. Use your keyboard (and mouse, if supported by your terminal) to navigate: 75 | * **Arrow Keys / `j`,`k` (in lists):** Move selection up/down. 76 | * **`Tab` / `Shift+Tab`:** Cycle focus between the main tab content and potentially other areas. (Focus handling might evolve). 77 | * **`Enter` (in column list):** Select a column to view statistics. 78 | * **View Switching:** Use `Ctrl+N` (Next Tab) and `Ctrl+P` (Previous Tab) or click on the tabs (Metadata, Schema, Data Preview). 79 | * **Scrolling:** Use `PageUp` / `PageDown` / `Home` / `End` or arrow keys/mouse wheel within scrollable areas (like Schema stats or Data Preview). 80 | * **`q` / `Ctrl+C`:** Quit `parqv`. 81 | * *(Help Screen `?` is planned)* 82 | 83 | --- 84 | 85 | ## 📄 License 86 | 87 | Licensed under the Apache License, Version 2.0. See [LICENSE](LICENSE) for the full license text. 88 | -------------------------------------------------------------------------------- /src/parqv/data_sources/base/handler.py: -------------------------------------------------------------------------------- 1 | """ 2 | Base data handler interface for parqv data sources. 3 | """ 4 | 5 | from abc import ABC, abstractmethod 6 | from pathlib import Path 7 | from typing import Any, Dict, List, Optional 8 | 9 | import pandas as pd 10 | 11 | from ...core import get_logger 12 | 13 | 14 | class DataHandler(ABC): 15 | """ 16 | Abstract Base Class for data handlers. 17 | 18 | Defines the common interface required by the ParqV application 19 | to interact with different data file formats. 20 | """ 21 | 22 | def __init__(self, file_path: Path): 23 | """ 24 | Initialize the handler with the file path. 25 | 26 | Subclasses should open the file or set up necessary resources here. 27 | 28 | Args: 29 | file_path: Path to the data file. 30 | 31 | Raises: 32 | DataHandlerError: If initialization fails (e.g., file not found, format error). 33 | """ 34 | self.file_path = file_path 35 | self.logger = get_logger(f"{self.__class__.__module__}.{self.__class__.__name__}") 36 | 37 | @abstractmethod 38 | def close(self) -> None: 39 | """ 40 | Close any open resources (files, connections, etc.). 41 | 42 | Must be implemented by subclasses. 43 | """ 44 | pass 45 | 46 | @abstractmethod 47 | def get_metadata_summary(self) -> Dict[str, Any]: 48 | """ 49 | Get a dictionary containing summary metadata about the data source. 50 | 51 | Keys should be human-readable strings. Values can be of various types. 
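For example (illustrative values), a Parquet handler might return {"File Path": "/data/example.parquet", "Format": "Parquet", "Total Rows": 891, "Size": "59.8 KB"}.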
52 | Should include an 'error' key if metadata retrieval fails. 53 | 54 | Returns: 55 | A dictionary with metadata summary or an error dictionary. 56 | """ 57 | pass 58 | 59 | @abstractmethod 60 | def get_schema_data(self) -> Optional[List[Dict[str, Any]]]: 61 | """ 62 | Get the schema as a list of dictionaries. 63 | 64 | Each dictionary should represent a column and ideally contain keys: 65 | - 'name' (str): Column name. 66 | - 'type' (str): Formatted data type string. 67 | - 'nullable' (Any): Indicator of nullability (e.g., bool, str "YES"/"NO"). 68 | 69 | Returns: 70 | A list of schema dictionaries, an empty list if no columns, 71 | or None if schema retrieval failed. 72 | """ 73 | pass 74 | 75 | @abstractmethod 76 | def get_data_preview(self, num_rows: int = 50) -> Optional[pd.DataFrame]: 77 | """ 78 | Fetch a preview of the data. 79 | 80 | Args: 81 | num_rows: The maximum number of rows to fetch. 82 | 83 | Returns: 84 | A pandas DataFrame with preview data, an empty DataFrame if no data, 85 | a DataFrame with an 'error' column on failure, or None on critical failure. 86 | """ 87 | pass 88 | 89 | @abstractmethod 90 | def get_column_stats(self, column_name: str) -> Dict[str, Any]: 91 | """ 92 | Calculate and return statistics for a specific column. 93 | 94 | The returned dictionary should ideally contain keys like: 95 | - 'column' (str): Column name. 96 | - 'type' (str): Formatted data type string. 97 | - 'nullable' (Any): Nullability indicator. 98 | - 'calculated' (Dict[str, Any]): Dictionary of computed statistics. 99 | - 'error' (Optional[str]): Error message if calculation failed. 100 | - 'message' (Optional[str]): Informational message. 101 | 102 | Args: 103 | column_name: The name of the column. 104 | 105 | Returns: 106 | A dictionary containing column statistics or error information. 107 | """ 108 | pass 109 | 110 | def format_size(self, num_bytes: int) -> str: 111 | """ 112 | Format bytes into a human-readable string. 113 | 114 | Args: 115 | num_bytes: Number of bytes to format 116 | 117 | Returns: 118 | Human-readable size string 119 | """ 120 | if num_bytes < 1024: 121 | return f"{num_bytes} bytes" 122 | elif num_bytes < 1024 ** 2: 123 | return f"{num_bytes / 1024:.1f} KB" 124 | elif num_bytes < 1024 ** 3: 125 | return f"{num_bytes / 1024 ** 2:.1f} MB" 126 | else: 127 | return f"{num_bytes / 1024 ** 3:.1f} GB" 128 | 129 | def __enter__(self): 130 | """Enter the runtime context related to this object.""" 131 | return self 132 | 133 | def __exit__(self, exc_type, exc_val, exc_tb): 134 | """Exit the runtime context related to this object, ensuring cleanup.""" 135 | self.close() 136 | 137 | def __del__(self): 138 | """Attempt to close the handler when the object is garbage collected (best effort).""" 139 | try: 140 | self.close() 141 | except Exception: 142 | # Ignore exceptions during garbage collection 143 | pass 144 | -------------------------------------------------------------------------------- /src/parqv/views/components/enhanced_data_table.py: -------------------------------------------------------------------------------- 1 | """ 2 | Enhanced data table component for parqv views. 
3 | """ 4 | 5 | from typing import Optional, List, Tuple, Any 6 | 7 | import pandas as pd 8 | from textual.containers import Container 9 | from textual.widgets import DataTable, Static 10 | 11 | from ...core import get_logger 12 | 13 | log = get_logger(__name__) 14 | 15 | 16 | class EnhancedDataTable(Container): 17 | """ 18 | An enhanced data table component that handles DataFrame display with better error handling. 19 | """ 20 | 21 | def __init__(self, **kwargs): 22 | super().__init__(**kwargs) 23 | self._table: Optional[DataTable] = None 24 | 25 | def compose(self): 26 | """Compose the data table layout.""" 27 | self._table = DataTable(id="enhanced-data-table") 28 | self._table.cursor_type = "row" 29 | yield self._table 30 | 31 | def clear_table(self) -> bool: 32 | """ 33 | Clear the table contents safely. 34 | 35 | Returns: 36 | True if cleared successfully, False if recreation was needed 37 | """ 38 | if not self._table: 39 | return False 40 | 41 | try: 42 | self._table.clear(columns=True) 43 | return True 44 | except Exception as e: 45 | log.warning(f"Failed to clear table, recreating: {e}") 46 | return self._recreate_table() 47 | 48 | def _recreate_table(self) -> bool: 49 | """ 50 | Recreate the table if clearing failed. 51 | 52 | Returns: 53 | True if recreation was successful, False otherwise 54 | """ 55 | try: 56 | if self._table: 57 | self._table.remove() 58 | 59 | self._table = DataTable(id="enhanced-data-table") 60 | self._table.cursor_type = "row" 61 | self.mount(self._table) 62 | return True 63 | except Exception as e: 64 | log.error(f"Failed to recreate table: {e}") 65 | return False 66 | 67 | def load_dataframe(self, df: pd.DataFrame, max_rows: Optional[int] = None) -> bool: 68 | """ 69 | Load a pandas DataFrame into the table. 70 | 71 | Args: 72 | df: The DataFrame to load 73 | max_rows: Optional maximum number of rows to display 74 | 75 | Returns: 76 | True if loaded successfully, False otherwise 77 | """ 78 | if not self._table: 79 | log.error("Table not initialized") 80 | return False 81 | 82 | try: 83 | # Clear existing content 84 | if not self.clear_table(): 85 | return False 86 | 87 | # Handle empty DataFrame 88 | if df.empty: 89 | self._show_empty_message() 90 | return True 91 | 92 | # Limit rows if specified 93 | display_df = df.head(max_rows) if max_rows else df 94 | 95 | # Add columns 96 | columns = [str(col) for col in display_df.columns] 97 | self._table.add_columns(*columns) 98 | 99 | # Add rows 100 | rows_data = self._prepare_rows_data(display_df) 101 | self._table.add_rows(rows_data) 102 | 103 | log.info(f"Loaded {len(display_df)} rows and {len(columns)} columns into table") 104 | return True 105 | 106 | except Exception as e: 107 | log.exception(f"Error loading DataFrame into table: {e}") 108 | self._show_error_message(f"Failed to load data: {e}") 109 | return False 110 | 111 | def _prepare_rows_data(self, df: pd.DataFrame) -> List[Tuple[str, ...]]: 112 | """ 113 | Prepare DataFrame rows for the DataTable. 
114 | 115 | Args: 116 | df: The DataFrame to process 117 | 118 | Returns: 119 | List of tuples representing table rows 120 | """ 121 | rows_data = [] 122 | for row in df.itertuples(index=False, name=None): 123 | # Convert each item to string, handling NaN values 124 | row_strings = tuple( 125 | str(item) if pd.notna(item) else "" 126 | for item in row 127 | ) 128 | rows_data.append(row_strings) 129 | return rows_data 130 | 131 | def _show_empty_message(self) -> None: 132 | """Show a message when the DataFrame is empty.""" 133 | try: 134 | self.query("Static").remove() # Remove any existing messages 135 | empty_msg = Static("No data available in the selected range or file is empty.", 136 | classes="info-content") 137 | self.mount(empty_msg) 138 | except Exception as e: 139 | log.error(f"Failed to show empty message: {e}") 140 | 141 | def _show_error_message(self, message: str) -> None: 142 | """Show an error message in the table area.""" 143 | try: 144 | self.query("DataTable, Static").remove() # Remove table and any messages 145 | error_msg = Static(f"[red]{message}[/red]", classes="error-content") 146 | self.mount(error_msg) 147 | except Exception as e: 148 | log.error(f"Failed to show error message: {e}") 149 | 150 | def get_table(self) -> Optional[DataTable]: 151 | """Get the underlying DataTable widget.""" 152 | return self._table -------------------------------------------------------------------------------- /src/parqv/views/utils/data_formatters.py: -------------------------------------------------------------------------------- 1 | """ 2 | Data formatting utilities for parqv views. 3 | """ 4 | 5 | from typing import Any, Dict, Union 6 | from rich.text import Text 7 | 8 | 9 | def format_metadata_for_display(metadata: Dict[str, Any]) -> Dict[str, Any]: 10 | """ 11 | Format metadata dictionary for consistent display. 
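For example (illustrative), {"Total Rows": 1234567, "Format": "parquet"} is rendered as {"Total Rows": "1,234,567", "Format": "PARQUET"}.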
12 | 13 | Args: 14 | metadata: Raw metadata dictionary from handler 15 | 16 | Returns: 17 | Formatted metadata dictionary ready for display 18 | """ 19 | if not metadata: 20 | return {"Error": "No metadata available"} 21 | 22 | # Check for error in metadata 23 | if "error" in metadata: 24 | return {"Error": metadata["error"]} 25 | 26 | formatted = {} 27 | 28 | # Format specific known fields with better presentation 29 | field_formatters = { 30 | "File Path": lambda x: str(x), 31 | "Path": lambda x: str(x), 32 | "Format": lambda x: str(x).upper(), 33 | "Total Rows": lambda x: _format_number(x), 34 | "Total Columns": lambda x: _format_number(x), 35 | "Columns": lambda x: _format_number(x), 36 | "Size": lambda x: _format_size_if_bytes(x), 37 | "Memory Usage": lambda x: _format_size_if_bytes(x), 38 | "DuckDB View": lambda x: f"`{x}`" if x else "N/A", 39 | } 40 | 41 | for key, value in metadata.items(): 42 | if isinstance(value, dict): 43 | # Handle nested dictionaries (like grouped metadata) 44 | formatted[key] = _format_nested_metadata(value, field_formatters) 45 | elif key in field_formatters: 46 | formatted[key] = field_formatters[key](value) 47 | else: 48 | formatted[key] = format_value_for_display(value) 49 | 50 | return formatted 51 | 52 | 53 | def _format_nested_metadata(nested_dict: Dict[str, Any], field_formatters: Dict) -> Dict[str, Any]: 54 | """Format nested metadata dictionaries.""" 55 | formatted_nested = {} 56 | 57 | for key, value in nested_dict.items(): 58 | if isinstance(value, dict): 59 | # Handle further nesting if needed 60 | formatted_nested[key] = _format_nested_metadata(value, field_formatters) 61 | elif key in field_formatters: 62 | formatted_nested[key] = field_formatters[key](value) 63 | else: 64 | formatted_nested[key] = format_value_for_display(value) 65 | 66 | return formatted_nested 67 | 68 | 69 | def format_value_for_display(value: Any) -> str: 70 | """ 71 | Format a single value for display in the UI. 72 | 73 | Args: 74 | value: The value to format 75 | 76 | Returns: 77 | String representation suitable for display 78 | """ 79 | if value is None: 80 | return "N/A" 81 | 82 | if isinstance(value, (int, float)): 83 | return _format_number(value) 84 | 85 | if isinstance(value, bool): 86 | return "Yes" if value else "No" 87 | 88 | if isinstance(value, str): 89 | # Handle empty strings 90 | if not value.strip(): 91 | return "N/A" 92 | return value 93 | 94 | # For other types, convert to string 95 | return str(value) 96 | 97 | 98 | def _format_number(value: Union[str, int, float]) -> str: 99 | """ 100 | Format numbers with thousand separators. 101 | 102 | Args: 103 | value: Numeric value or string representation 104 | 105 | Returns: 106 | Formatted number string 107 | """ 108 | if isinstance(value, str): 109 | # Try to extract number from string like "1,234" or "1234" 110 | try: 111 | # Remove existing commas and convert 112 | clean_str = value.replace(",", "").strip() 113 | if clean_str.isdigit(): 114 | return f"{int(clean_str):,}" 115 | elif "." in clean_str: 116 | return f"{float(clean_str):,.2f}" 117 | else: 118 | return value # Return as-is if not numeric 119 | except (ValueError, AttributeError): 120 | return value 121 | 122 | if isinstance(value, int): 123 | return f"{value:,}" 124 | 125 | if isinstance(value, float): 126 | return f"{value:,.2f}" 127 | 128 | return str(value) 129 | 130 | 131 | def _format_size_if_bytes(value: Union[str, int]) -> str: 132 | """ 133 | Format size values, detecting if they represent bytes. 
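For example (illustrative), the integer 52428800 is rendered as "50.0 MB", while a string that already contains units, such as "1.5 GB", is returned unchanged.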
134 | 135 | Args: 136 | value: Size value that might be in bytes 137 | 138 | Returns: 139 | Formatted size string 140 | """ 141 | if isinstance(value, str): 142 | # If it already contains size units, return as-is 143 | if any(unit in value.lower() for unit in ["kb", "mb", "gb", "tb", "bytes"]): 144 | return value 145 | 146 | # Try to parse as number and format as bytes 147 | try: 148 | clean_str = value.replace(",", "").strip() 149 | if "bytes" in value.lower(): 150 | num_bytes = int(clean_str.split()[0]) 151 | return _format_bytes(num_bytes) 152 | else: 153 | return value 154 | except (ValueError, IndexError): 155 | return value 156 | 157 | if isinstance(value, int): 158 | # Assume it's bytes if it's a large integer 159 | if value > 1024: 160 | return _format_bytes(value) 161 | else: 162 | return f"{value:,}" 163 | 164 | return str(value) 165 | 166 | 167 | def _format_bytes(num_bytes: int) -> str: 168 | """ 169 | Format bytes into human-readable format. 170 | 171 | Args: 172 | num_bytes: Number of bytes 173 | 174 | Returns: 175 | Human-readable size string 176 | """ 177 | if num_bytes < 1024: 178 | return f"{num_bytes:,} bytes" 179 | elif num_bytes < 1024 ** 2: 180 | return f"{num_bytes / 1024:.1f} KB" 181 | elif num_bytes < 1024 ** 3: 182 | return f"{num_bytes / 1024 ** 2:.1f} MB" 183 | else: 184 | return f"{num_bytes / 1024 ** 3:.1f} GB" -------------------------------------------------------------------------------- /src/parqv/app.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import Optional 3 | 4 | from textual.app import App, ComposeResult, Binding 5 | from textual.containers import Container 6 | from textual.widgets import Header, Footer, Static, Label, TabbedContent, TabPane 7 | 8 | from .core import CSS_PATH, FileValidationError, validate_and_detect_file, HandlerFactory, HandlerCreationError, get_logger 9 | from .data_sources import DataHandler 10 | from .views.data_view import DataView 11 | from .views.metadata_view import MetadataView 12 | from .views.schema_view import SchemaView 13 | 14 | log = get_logger(__name__) 15 | 16 | 17 | class ParqV(App[None]): 18 | """A Textual app to visualize Parquet or JSON files.""" 19 | 20 | CSS_PATH = CSS_PATH 21 | BINDINGS = [ 22 | Binding("q", "quit", "Quit", priority=True), 23 | ] 24 | 25 | def __init__(self, file_path_str: Optional[str] = None, *args, **kwargs): 26 | """ 27 | Initialize the ParqV application. 28 | 29 | Args: 30 | file_path_str: Path to the file to visualize 31 | *args, **kwargs: Additional arguments for the Textual App 32 | """ 33 | super().__init__(*args, **kwargs) 34 | 35 | # Application state 36 | self.file_path: Optional[Path] = None 37 | self.handler: Optional[DataHandler] = None 38 | self.handler_type: Optional[str] = None 39 | self.error_message: Optional[str] = None 40 | 41 | # Initialize with file if provided 42 | if file_path_str: 43 | self._initialize_file_handler(file_path_str) 44 | 45 | def _initialize_file_handler(self, file_path_str: str) -> None: 46 | """ 47 | Initialize the file handler for the given file path. 
48 | 49 | Args: 50 | file_path_str: Path to the file to process 51 | """ 52 | try: 53 | # Validate file and detect type 54 | self.file_path, self.handler_type = validate_and_detect_file(file_path_str) 55 | 56 | # Create appropriate handler 57 | self.handler = HandlerFactory.create_handler(self.file_path, self.handler_type) 58 | 59 | log.info(f"Successfully initialized {self.handler_type} handler for: {self.file_path.name}") 60 | 61 | except (FileValidationError, HandlerCreationError) as e: 62 | self.error_message = str(e) 63 | log.error(f"Failed to initialize handler: {e}") 64 | 65 | except Exception as e: 66 | self.error_message = f"An unexpected error occurred: {e}" 67 | log.exception("Unexpected error during handler initialization") 68 | 69 | def compose(self) -> ComposeResult: 70 | """Compose the UI layout.""" 71 | yield Header() 72 | 73 | if self.error_message: 74 | log.debug(f"Displaying error message: {self.error_message}") 75 | yield Container( 76 | Label("Error Loading File:", classes="error-title"), 77 | Static(self.error_message, classes="error-content"), 78 | id="error-container" 79 | ) 80 | elif self.handler: 81 | log.debug(f"Composing main layout with TabbedContent for {self.handler_type} handler.") 82 | with TabbedContent(id="main-tabs"): 83 | yield TabPane("Metadata", MetadataView(id="metadata-view"), id="tab-metadata") 84 | yield TabPane("Schema", SchemaView(id="schema-view"), id="tab-schema") 85 | yield TabPane("Data Preview", DataView(id="data-view"), id="tab-data") 86 | else: 87 | log.warning("No handler available and no error message set") 88 | yield Container( 89 | Label("No file loaded.", classes="error-title"), 90 | Static("Please provide a valid file path.", classes="error-content"), 91 | id="no-file-container" 92 | ) 93 | 94 | yield Footer() 95 | 96 | def on_mount(self) -> None: 97 | """Handle app mount event - set up header information.""" 98 | log.debug("App mounted.") 99 | self._update_header() 100 | 101 | def _update_header(self) -> None: 102 | """Update the header with file and format information.""" 103 | try: 104 | header = self.query_one(Header) 105 | 106 | if self.handler and self.file_path and self.handler_type: 107 | display_name = self.file_path.name 108 | format_name = self.handler_type.capitalize() 109 | header.title = f"parqv - {display_name}" 110 | header.sub_title = f"Format: {format_name}" 111 | elif self.error_message: 112 | header.title = "parqv - Error" 113 | header.sub_title = "Failed to load file" 114 | else: 115 | header.title = "parqv" 116 | header.sub_title = "File Viewer" 117 | 118 | except Exception as e: 119 | log.error(f"Failed to update header: {e}") 120 | 121 | def action_quit(self) -> None: 122 | """Handle quit action - cleanup and exit.""" 123 | log.info("Quit action triggered.") 124 | self._cleanup() 125 | self.exit() 126 | 127 | def _cleanup(self) -> None: 128 | """Clean up resources before exit.""" 129 | if self.handler: 130 | try: 131 | self.handler.close() 132 | log.info("Handler closed successfully.") 133 | except Exception as e: 134 | log.error(f"Error during handler cleanup: {e}") 135 | 136 | 137 | # For backward compatibility, keep the old CLI entry point 138 | def run_app(): 139 | """ 140 | Legacy CLI entry point for backward compatibility. 141 | 142 | Note: New code should use parqv.cli.run_app() instead. 143 | """ 144 | from .cli import run_app as new_run_app 145 | log.warning("Using legacy run_app(). 
Consider importing from parqv.cli instead.") 146 | new_run_app() 147 | 148 | 149 | if __name__ == "__main__": 150 | run_app() 151 | -------------------------------------------------------------------------------- /src/parqv/views/utils/visualization.py: -------------------------------------------------------------------------------- 1 | """ 2 | Visualization utilities for parqv views. 3 | 4 | Provides text-based data visualization functions like ASCII histograms. 5 | """ 6 | import math 7 | from typing import List, Union, Optional 8 | 9 | TICK_CHARS = [' ', '▂', '▃', '▄', '▅', '▆', '▇', '█'] 10 | 11 | 12 | def create_text_histogram( 13 | data: List[Union[int, float]], 14 | bins: int = 15, 15 | width: int = 60, 16 | height: int = 8, 17 | title: Optional[str] = None 18 | ) -> List[str]: 19 | """ 20 | Create a professional, text-based histogram from numerical data. 21 | 22 | Args: 23 | data: List of numerical values. 24 | bins: The number of bins for the histogram. 25 | width: The total character width of the output histogram. 26 | height: The maximum height of the histogram bars in lines. 27 | title: An optional title for the histogram. 28 | 29 | Returns: 30 | A list of strings representing the histogram, ready for printing. 31 | """ 32 | if not data: 33 | return ["(No data available for histogram)"] 34 | 35 | # 1. Sanitize the input data 36 | clean_data = [float(val) for val in data if isinstance(val, (int, float)) and math.isfinite(val)] 37 | 38 | if not clean_data: 39 | return ["(No valid numerical data to plot)"] 40 | 41 | min_val, max_val = min(clean_data), max(clean_data) 42 | 43 | if min_val == max_val: 44 | return [f"(All values are identical: {_format_number(min_val)})"] 45 | 46 | # 2. Create bins and count frequencies 47 | # Add a small epsilon to the range to ensure max_val falls into the last bin 48 | epsilon = (max_val - min_val) / 1e9 49 | value_range = (max_val - min_val) + epsilon 50 | bin_width = value_range / bins 51 | 52 | bin_counts = [0] * bins 53 | for value in clean_data: 54 | bin_index = int((value - min_val) / bin_width) 55 | bin_counts[bin_index] += 1 56 | 57 | # 3. Render the histogram 58 | return _render_histogram( 59 | bin_counts=bin_counts, 60 | min_val=min_val, 61 | max_val=max_val, 62 | width=width, 63 | height=height, 64 | title=title 65 | ) 66 | 67 | 68 | def _render_histogram( 69 | bin_counts: List[int], 70 | min_val: float, 71 | max_val: float, 72 | width: int, 73 | height: int, 74 | title: Optional[str] 75 | ) -> List[str]: 76 | """ 77 | Internal function to render the histogram components into ASCII art. 78 | """ 79 | lines = [] 80 | if title: 81 | lines.append(title.center(width)) 82 | 83 | max_count = max(bin_counts) if bin_counts else 0 84 | if max_count == 0: 85 | return lines + ["(No data falls within histogram bins)"] 86 | 87 | # --- Layout Calculations --- 88 | y_axis_width = len(str(max_count)) 89 | plot_width = width - y_axis_width - 3 # Reserve space for "| " and axis 90 | if plot_width <= 0: 91 | return ["(Terminal width too narrow to draw histogram)"] 92 | 93 | # Resample the data bins to fit the available plot_width. 94 | # This stretches or shrinks the histogram to match the screen space. 
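    # Worked example of the mapping used below (illustrative numbers, not from the
    # source): with num_data_bins = 15 and plot_width = 45, screen column i = 30
    # reads data bin int(30 * 15 / 45) = 10; if plot_width were only 10, column 9
    # would read bin int(9 * 15 / 10) = 13, so some data bins are skipped when shrinking.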
95 | display_bins = [] 96 | num_data_bins = len(bin_counts) 97 | for i in range(plot_width): 98 | # Find the corresponding data bin for this screen column 99 | data_bin_index = int(i * num_data_bins / plot_width) 100 | display_bins.append(bin_counts[data_bin_index]) 101 | 102 | # --- Y-Axis and Bars (Top to Bottom) --- 103 | for row in range(height, -1, -1): 104 | line = "" 105 | # Y-axis labels 106 | if row == height: 107 | line += f"{max_count:<{y_axis_width}} | " 108 | elif row == 0: 109 | line += f"{0:<{y_axis_width}} +-" 110 | else: 111 | line += " " * y_axis_width + " | " 112 | 113 | # Bars - now iterate over the resampled display_bins 114 | for count in display_bins: 115 | # Scale current count to the available height 116 | scaled_height = (count / max_count) * height 117 | 118 | # Determine character based on height relative to current row 119 | if scaled_height >= row: 120 | line += TICK_CHARS[-1] # Full block for the solid part of the bar 121 | elif scaled_height > row - 1: 122 | # This is the top of the bar, use a partial character 123 | partial_index = int((scaled_height - row + 1) * (len(TICK_CHARS) - 1)) 124 | line += TICK_CHARS[max(0, partial_index)] 125 | elif row == 0: 126 | line += "-" # X-axis line 127 | else: 128 | line += " " # Empty space above the bar 129 | 130 | lines.append(line) 131 | 132 | # --- X-Axis Labels --- 133 | x_axis_labels = _create_x_axis_labels(min_val, max_val, plot_width) 134 | label_line = " " * (y_axis_width + 3) + x_axis_labels 135 | lines.append(label_line) 136 | 137 | return lines 138 | 139 | 140 | def _create_x_axis_labels(min_val: float, max_val: float, plot_width: int) -> str: 141 | """Create a formatted string for the X-axis labels.""" 142 | min_label = _format_number(min_val) 143 | max_label = _format_number(max_val) 144 | 145 | available_width = plot_width - len(min_label) - len(max_label) 146 | 147 | if available_width < 4: 148 | return f"{min_label}{' ' * (plot_width - len(min_label) - len(max_label))}{max_label}" 149 | 150 | mid_val = (min_val + max_val) / 2 151 | mid_label = _format_number(mid_val) 152 | 153 | spacing1 = (plot_width // 2) - len(min_label) - (len(mid_label) // 2) 154 | spacing2 = (plot_width - (plot_width // 2)) - (len(mid_label) - (len(mid_label) // 2)) - len(max_label) 155 | 156 | if spacing1 < 1 or spacing2 < 1: 157 | return f"{min_label}{' ' * (plot_width - len(min_label) - len(max_label))}{max_label}" 158 | 159 | return f"{min_label}{' ' * spacing1}{mid_label}{' ' * spacing2}{max_label}" 160 | 161 | 162 | def _format_number(value: float) -> str: 163 | """Format a number nicely for display on an axis.""" 164 | if abs(value) < 1e-4 and value != 0: 165 | return f"{value:.1e}" 166 | if abs(value) >= 1e5: 167 | return f"{value:.1e}" 168 | if math.isclose(value, int(value)): 169 | return str(int(value)) 170 | if abs(value) < 10: 171 | return f"{value:.2f}" 172 | if abs(value) < 100: 173 | return f"{value:.1f}" 174 | return str(int(value)) 175 | 176 | 177 | def should_show_histogram(data_type: str, distinct_count: int, total_count: int) -> bool: 178 | """ 179 | Determine if a histogram should be shown for this data. 180 | This function uses a set of heuristics to decide if the data is 181 | continuous enough to warrant a histogram visualization. 182 | """ 183 | # 1. Type Check: Histograms are only meaningful for numeric data. 184 | if 'numeric' not in data_type and 'integer' not in data_type and 'float' not in data_type: 185 | return False 186 | 187 | # 2. 
Data Volume Check: Don't render if there's too little data or no variation. 188 | if total_count < 20 or distinct_count <= 1: 189 | return False 190 | 191 | # 3. Categorical Data Filter: If the number of distinct values is very low, 192 | # treat it as categorical data (e.g., ratings from 1-10, months 1-12). 193 | if distinct_count < 15: 194 | return False 195 | 196 | # 4. High Cardinality Filter: If almost every value is unique (like an ID or index), 197 | # a histogram is not useful as most bars would have a height of 1. 198 | distinct_ratio = distinct_count / total_count 199 | if distinct_ratio > 0.95: 200 | return False 201 | 202 | # 5. Pass: If the data passes all the above filters, it is considered 203 | # sufficiently continuous to be visualized with a histogram. 204 | return True 205 | -------------------------------------------------------------------------------- /src/parqv/views/utils/stats_formatters.py: -------------------------------------------------------------------------------- 1 | """ 2 | Statistics formatting utilities for parqv views. 3 | """ 4 | 5 | from typing import Any, Dict, List, Union 6 | 7 | from rich.text import Text 8 | 9 | from .visualization import create_text_histogram, should_show_histogram 10 | 11 | 12 | def format_stats_for_display(stats_data: Dict[str, Any]) -> List[Union[str, Text]]: 13 | """ 14 | Format statistics dictionary for display as lines of rich text. 15 | 16 | Args: 17 | stats_data: Raw statistics dictionary from handler 18 | 19 | Returns: 20 | List of formatted lines ready for display 21 | """ 22 | if not stats_data: 23 | return [Text.from_markup("[red]No statistics data available.[/red]")] 24 | 25 | lines: List[Union[str, Text]] = [] 26 | 27 | # Extract basic column information 28 | col_name = stats_data.get("column", "N/A") 29 | col_type = stats_data.get("type", "Unknown") 30 | nullable_val = stats_data.get("nullable") 31 | 32 | # Format column header 33 | lines.extend(_format_column_header(col_name, col_type, nullable_val)) 34 | 35 | # Handle calculation errors 36 | calc_error = stats_data.get("error") 37 | if calc_error: 38 | lines.extend(_format_error_section(calc_error)) 39 | 40 | # Add informational messages 41 | message = stats_data.get("message") 42 | if message: 43 | lines.extend(_format_message_section(message)) 44 | 45 | # Format calculated statistics 46 | calculated = stats_data.get("calculated") 47 | if calculated: 48 | lines.extend(_format_calculated_stats(calculated, has_error=bool(calc_error))) 49 | 50 | return lines 51 | 52 | 53 | def format_column_info(column_name: str, column_type: str, nullable: Any) -> List[Union[str, Text]]: 54 | """ 55 | Format basic column information for display. 
56 | 57 | Args: 58 | column_name: Name of the column 59 | column_type: Type of the column 60 | nullable: Nullability information 61 | 62 | Returns: 63 | List of formatted lines for column info 64 | """ 65 | return _format_column_header(column_name, column_type, nullable) 66 | 67 | 68 | def _format_column_header(col_name: str, col_type: str, nullable_val: Any) -> List[Union[str, Text]]: 69 | """Format the column header section.""" 70 | # Determine nullability display 71 | if nullable_val is True: 72 | nullable_str = "Nullable" 73 | elif nullable_val is False: 74 | nullable_str = "Required" 75 | else: 76 | nullable_str = "Unknown Nullability" 77 | 78 | lines = [ 79 | Text.assemble(("Column: ", "bold"), f"`{col_name}`"), 80 | Text.assemble(("Type: ", "bold"), f"{col_type} ({nullable_str})"), 81 | "─" * (len(col_name) + len(col_type) + 20) 82 | ] 83 | 84 | return lines 85 | 86 | 87 | def _format_error_section(calc_error: str) -> List[Union[str, Text]]: 88 | """Format the error section.""" 89 | return [ 90 | Text("Calculation Error:", style="bold red"), 91 | f"```\n{calc_error}\n```", 92 | "" 93 | ] 94 | 95 | 96 | def _format_message_section(message: str) -> List[Union[str, Text]]: 97 | """Format the informational message section.""" 98 | return [ 99 | Text(f"Info: {message}", style="italic cyan"), 100 | "" 101 | ] 102 | 103 | 104 | def _format_calculated_stats(calculated: Dict[str, Any], has_error: bool = False) -> List[Union[str, Text]]: 105 | """Format the calculated statistics section.""" 106 | lines = [Text("Calculated Statistics:", style="bold")] 107 | 108 | # Define the order of statistics to display 109 | stats_order = [ 110 | "Total Count", "Valid Count", "Null Count", "Null Percentage", 111 | "Distinct Count", "Distinct Values (Approx)", 112 | "Min", "Max", "Mean", "Median (50%)", "StdDev", "Variance", 113 | "True Count", "False Count", 114 | "Value Counts" 115 | ] 116 | 117 | found_stats = False 118 | 119 | for key in stats_order: 120 | if key in calculated: 121 | found_stats = True 122 | value = calculated[key] 123 | lines.extend(_format_single_stat(key, value)) 124 | 125 | # Add any additional stats not in the predefined order (excluding internal histogram data) 126 | for key, value in calculated.items(): 127 | if key not in stats_order and not key.startswith('_'): # Skip internal fields 128 | found_stats = True 129 | lines.extend(_format_single_stat(key, value)) 130 | 131 | # Handle case where no stats were found 132 | if not found_stats and not has_error: 133 | lines.append(Text(" (No specific stats calculated for this type)", style="dim")) 134 | 135 | # Add histogram visualization for numeric data 136 | if "_histogram_data" in calculated and "_data_type" in calculated: 137 | if calculated["_data_type"] == "numeric": 138 | lines.extend(_format_histogram_visualization(calculated)) 139 | 140 | return lines 141 | 142 | 143 | def _format_single_stat(key: str, value: Any) -> List[Union[str, Text]]: 144 | """Format a single statistic entry.""" 145 | lines = [] 146 | 147 | if key == "Value Counts" and isinstance(value, dict): 148 | lines.append(f" - {key}:") 149 | for sub_key, sub_val in value.items(): 150 | sub_val_str = _format_stat_value(sub_val) 151 | lines.append(f" - {sub_key}: {sub_val_str}") 152 | else: 153 | formatted_value = _format_stat_value(value) 154 | lines.append(f" - {key}: {formatted_value}") 155 | 156 | return lines 157 | 158 | 159 | def _format_stat_value(value: Any) -> str: 160 | """Format a single statistic value.""" 161 | if isinstance(value, (int, float)): 162 | 
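        # Integers are shown with thousands separators (e.g. 1234567 -> "1,234,567");
        # floats are rendered with four decimal places (e.g. 3.14159 -> "3.1416").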
if isinstance(value, int): 163 | return f"{value:,}" 164 | else: 165 | return f"{value:,.4f}" 166 | else: 167 | return str(value) 168 | 169 | 170 | def _format_histogram_visualization(calculated: Dict[str, Any]) -> List[Union[str, Text]]: 171 | """Format histogram visualization for numeric data.""" 172 | lines = [] 173 | 174 | try: 175 | histogram_data = calculated.get("_histogram_data", []) 176 | if not histogram_data: 177 | return lines 178 | 179 | # Check if we should show histogram 180 | distinct_count_str = calculated.get("Distinct Count", "0") 181 | try: 182 | # Remove commas and convert to int 183 | distinct_count = int(distinct_count_str.replace(",", "")) 184 | except (ValueError, AttributeError): 185 | distinct_count = len(set(histogram_data)) 186 | 187 | total_count = len(histogram_data) 188 | 189 | if should_show_histogram("numeric", distinct_count, total_count): 190 | lines.append("") 191 | lines.append(Text("Data Distribution:", style="bold cyan")) 192 | 193 | # Create histogram 194 | histogram_lines = create_text_histogram( 195 | data=histogram_data, 196 | bins=15, 197 | width=50, 198 | height=8, 199 | title=None 200 | ) 201 | 202 | # Add each histogram line 203 | for line in histogram_lines: 204 | if isinstance(line, str): 205 | lines.append(f" {line}") 206 | else: 207 | lines.append(line) 208 | else: 209 | # For discrete data, show a note 210 | if distinct_count < total_count * 0.1: # Less than 10% unique values 211 | lines.append("") 212 | lines.append(Text("Note: Data appears to be discrete/categorical", style="dim italic")) 213 | lines.append(Text("(Histogram not shown for discrete values)", style="dim italic")) 214 | 215 | except Exception as e: 216 | # Don't fail the whole stats display if histogram fails 217 | lines.append("") 218 | lines.append(Text(f"Note: Could not generate histogram: {e}", style="dim red")) 219 | 220 | return lines 221 | -------------------------------------------------------------------------------- /src/parqv/views/schema_view.py: -------------------------------------------------------------------------------- 1 | """ 2 | Schema view for displaying column schema and statistics. 3 | """ 4 | 5 | from typing import Dict, Any, Optional, List 6 | 7 | from rich.text import Text 8 | from textual.app import ComposeResult 9 | from textual.containers import VerticalScroll, Container, Horizontal 10 | from textual.reactive import var 11 | from textual.widgets import Static, ListView, ListItem, Label, LoadingIndicator 12 | 13 | from .base import BaseView 14 | from .utils import format_stats_for_display 15 | 16 | 17 | class ColumnListItem(ListItem): 18 | """A ListItem that stores the column name for schema display.""" 19 | 20 | def __init__(self, column_name: str) -> None: 21 | # Ensure IDs are CSS-safe (replace spaces, etc.) 22 | safe_id_name = "".join(c if c.isalnum() else '_' for c in column_name) 23 | super().__init__(Label(column_name), name=column_name, id=f"col-item-{safe_id_name}") 24 | self.column_name = column_name 25 | 26 | 27 | class SchemaView(BaseView): 28 | """ 29 | View for displaying schema information and column statistics. 30 | 31 | Shows a list of columns on the left and detailed statistics 32 | for the selected column on the right. 33 | """ 34 | 35 | DEFAULT_STATS_MESSAGE = "Select a column from the list to view its statistics." 
36 | 37 | # Reactive variable for loading state 38 | loading = var(False) 39 | 40 | def __init__(self, **kwargs): 41 | super().__init__(**kwargs) 42 | self._columns_data: Optional[List[Dict[str, Any]]] = None 43 | self._current_column: Optional[str] = None 44 | 45 | def compose(self) -> ComposeResult: 46 | """Compose the schema view layout.""" 47 | with Horizontal(): 48 | # Left side: Column list 49 | with Container(id="column-list-container", classes="column-list"): 50 | yield Static("Columns", classes="section-title") 51 | yield ListView(id="column-list-view") 52 | 53 | # Right side: Column statistics 54 | with Container(id="stats-container", classes="column-stats"): 55 | yield Static("Column Statistics", classes="section-title") 56 | with VerticalScroll(id="schema-stats-scroll"): 57 | yield Container(id="schema-stats-content") 58 | yield LoadingIndicator(id="schema-loading-indicator") 59 | 60 | def load_content(self) -> None: 61 | """Load schema content.""" 62 | if not self.check_handler_available(): 63 | return 64 | 65 | try: 66 | # Load column list 67 | self._load_column_list() 68 | 69 | # Display default message in stats area 70 | self._display_default_message() 71 | 72 | self.logger.info("Schema loaded successfully") 73 | 74 | except Exception as e: 75 | self.show_error("Failed to load schema", e) 76 | 77 | def _load_column_list(self) -> None: 78 | """Load the list of columns from the data handler.""" 79 | try: 80 | list_view = self.query_one("#column-list-view", ListView) 81 | list_view.clear() 82 | 83 | # Get schema data from handler 84 | self._columns_data = self.handler.get_schema_data() 85 | self.logger.debug(f"Received schema data: {self._columns_data}") 86 | 87 | if self._columns_data is None: 88 | self._show_list_error("Could not load schema data") 89 | return 90 | 91 | if not self._columns_data: 92 | self._show_list_warning("Schema has no columns") 93 | return 94 | 95 | # Populate column list 96 | column_count = 0 97 | for col_info in self._columns_data: 98 | column_name = col_info.get("name") 99 | if column_name: 100 | list_view.append(ColumnListItem(column_name)) 101 | column_count += 1 102 | else: 103 | self.logger.warning("Found column info without a 'name' key") 104 | 105 | self.logger.info(f"Populated column list with {column_count} columns") 106 | 107 | except Exception as e: 108 | self.logger.exception("Error loading column list") 109 | self._show_list_error(f"Error loading schema: {e}") 110 | 111 | def _show_list_error(self, message: str) -> None: 112 | """Show error message in the column list.""" 113 | try: 114 | list_view = self.query_one("#column-list-view", ListView) 115 | list_view.clear() 116 | list_view.append(ListItem(Label(f"[red]{message}[/red]"))) 117 | except Exception as e: 118 | self.logger.error(f"Failed to show list error: {e}") 119 | 120 | def _show_list_warning(self, message: str) -> None: 121 | """Show warning message in the column list.""" 122 | try: 123 | list_view = self.query_one("#column-list-view", ListView) 124 | list_view.clear() 125 | list_view.append(ListItem(Label(f"[yellow]{message}[/yellow]"))) 126 | except Exception as e: 127 | self.logger.error(f"Failed to show list warning: {e}") 128 | 129 | def _display_default_message(self) -> None: 130 | """Display the initial message in the stats area.""" 131 | try: 132 | stats_container = self.query_one("#schema-stats-content", Container) 133 | stats_container.query("*").remove() 134 | stats_container.mount(Static(self.DEFAULT_STATS_MESSAGE, classes="stats-line")) 135 | except Exception as e: 
136 | self.logger.error(f"Failed to display default stats message: {e}") 137 | 138 | def on_list_view_selected(self, event: ListView.Selected) -> None: 139 | """Handle column selection from the list.""" 140 | if hasattr(event.item, 'column_name'): 141 | column_name = event.item.column_name 142 | self._current_column = column_name 143 | self._load_column_stats(column_name) 144 | else: 145 | self.logger.warning("Selected item does not have column_name attribute") 146 | 147 | def _load_column_stats(self, column_name: str) -> None: 148 | """ 149 | Load and display statistics for the selected column. 150 | 151 | Args: 152 | column_name: Name of the column to analyze 153 | """ 154 | if not self.handler: 155 | self._show_stats_error("Data handler not available") 156 | return 157 | 158 | try: 159 | # Set loading state 160 | self.loading = True 161 | 162 | # Get column statistics 163 | self.logger.debug(f"Loading stats for column: {column_name}") 164 | raw_stats = self.handler.get_column_stats(column_name) 165 | 166 | # Format stats for display 167 | formatted_lines = format_stats_for_display(raw_stats) 168 | 169 | # Display the formatted stats 170 | self._display_column_stats(formatted_lines) 171 | 172 | except Exception as e: 173 | self.logger.exception(f"Error loading stats for column {column_name}") 174 | self._show_stats_error(f"Failed to load statistics: {e}") 175 | finally: 176 | self.loading = False 177 | 178 | def _display_column_stats(self, formatted_lines: List) -> None: 179 | """ 180 | Display formatted column statistics. 181 | 182 | Args: 183 | formatted_lines: List of formatted text lines to display 184 | """ 185 | try: 186 | stats_container = self.query_one("#schema-stats-content", Container) 187 | stats_container.query("*").remove() 188 | 189 | for line in formatted_lines: 190 | if isinstance(line, Text): 191 | stats_container.mount(Static(line, classes="stats-line")) 192 | else: 193 | stats_container.mount(Static(str(line), classes="stats-line")) 194 | 195 | except Exception as e: 196 | self.logger.error(f"Failed to display column stats: {e}") 197 | self._show_stats_error("Failed to display statistics") 198 | 199 | def _show_stats_error(self, message: str) -> None: 200 | """Show error message in the stats area.""" 201 | try: 202 | stats_container = self.query_one("#schema-stats-content", Container) 203 | stats_container.query("*").remove() 204 | stats_container.mount(Static(f"[red]Error: {message}[/red]", classes="error-content")) 205 | except Exception as e: 206 | self.logger.error(f"Failed to show stats error: {e}") 207 | 208 | def watch_loading(self, loading: bool) -> None: 209 | """React to changes in the loading state.""" 210 | try: 211 | loading_indicator = self.query_one("#schema-loading-indicator", LoadingIndicator) 212 | stats_scroll = self.query_one("#schema-stats-scroll", VerticalScroll) 213 | 214 | if loading: 215 | loading_indicator.display = True 216 | stats_scroll.display = False 217 | else: 218 | loading_indicator.display = False 219 | stats_scroll.display = True 220 | 221 | except Exception as e: 222 | self.logger.error(f"Error updating loading state: {e}") 223 | 224 | def refresh_schema(self) -> None: 225 | """Refresh the schema display.""" 226 | self._current_column = None 227 | self.clear_content() 228 | self.load_content() 229 | 230 | def get_current_column(self) -> Optional[str]: 231 | """Get the currently selected column name.""" 232 | return self._current_column 233 | 234 | def get_columns_data(self) -> Optional[List[Dict[str, Any]]]: 235 | """Get the 
current columns data.""" 236 | return self._columns_data 237 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 
61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 
179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. -------------------------------------------------------------------------------- /src/parqv/data_sources/formats/csv.py: -------------------------------------------------------------------------------- 1 | """ 2 | CSV file handler for parqv data sources. 3 | """ 4 | 5 | from pathlib import Path 6 | from typing import Any, Dict, List, Optional 7 | 8 | import pandas as pd 9 | 10 | from ..base import DataHandler, DataHandlerError 11 | 12 | 13 | class CsvHandlerError(DataHandlerError): 14 | """Custom exception for CSV handling errors.""" 15 | pass 16 | 17 | 18 | class CsvHandler(DataHandler): 19 | """ 20 | Handles CSV file interactions using pandas. 21 | 22 | Provides methods to access metadata, schema, data preview, and column statistics 23 | for CSV files using pandas DataFrame operations. 24 | """ 25 | 26 | def __init__(self, file_path: Path): 27 | """ 28 | Initialize the CsvHandler by validating the path and reading the CSV file. 29 | 30 | Args: 31 | file_path: Path to the CSV file. 32 | 33 | Raises: 34 | CsvHandlerError: If the file is not found, not a file, or cannot be read. 
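        Example (illustrative usage; the path is hypothetical):
            handler = CsvHandler(Path("data/example.csv"))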
35 | """ 36 | super().__init__(file_path) 37 | self.df: Optional[pd.DataFrame] = None 38 | self._original_dtypes: Optional[Dict[str, str]] = None 39 | 40 | try: 41 | # Validate file existence 42 | if not self.file_path.is_file(): 43 | raise FileNotFoundError(f"CSV file not found or is not a regular file: {self.file_path}") 44 | 45 | # Read the CSV file with pandas 46 | self._read_csv_file() 47 | 48 | self.logger.info(f"Successfully initialized CsvHandler for: {self.file_path.name}") 49 | 50 | except FileNotFoundError as fnf_e: 51 | self.logger.error(f"File not found during CsvHandler initialization: {fnf_e}") 52 | raise CsvHandlerError(str(fnf_e)) from fnf_e 53 | except pd.errors.EmptyDataError as empty_e: 54 | self.logger.error(f"CSV file is empty: {empty_e}") 55 | raise CsvHandlerError(f"CSV file '{self.file_path.name}' is empty") from empty_e 56 | except pd.errors.ParserError as parse_e: 57 | self.logger.error(f"CSV parsing error: {parse_e}") 58 | raise CsvHandlerError(f"Failed to parse CSV file '{self.file_path.name}': {parse_e}") from parse_e 59 | except Exception as e: 60 | self.logger.exception(f"Unexpected error initializing CsvHandler for {self.file_path.name}") 61 | raise CsvHandlerError(f"Failed to initialize CSV handler '{self.file_path.name}': {e}") from e 62 | 63 | def _read_csv_file(self) -> None: 64 | """Read the CSV file using pandas with appropriate settings.""" 65 | try: 66 | # Read CSV with automatic type inference 67 | self.df = pd.read_csv( 68 | self.file_path, 69 | # Basic settings 70 | encoding='utf-8', 71 | # Handle various separators automatically 72 | sep=None, # Let pandas auto-detect 73 | engine='python', # More flexible parsing 74 | # Preserve original string representation for better type info 75 | dtype=str, # Read everything as string first 76 | na_values=['', 'NULL', 'null', 'None', 'N/A', 'n/a', 'NaN', 'nan'], 77 | keep_default_na=True, 78 | ) 79 | 80 | # Store original dtypes before conversion 81 | self._original_dtypes = {col: 'string' for col in self.df.columns} 82 | 83 | # Try to infer better types 84 | self._infer_types() 85 | 86 | self.logger.debug(f"Successfully read CSV with shape: {self.df.shape}") 87 | 88 | except UnicodeDecodeError: 89 | # Try with different encodings 90 | for encoding in ['latin1', 'cp1252', 'iso-8859-1']: 91 | try: 92 | self.logger.warning(f"Trying encoding: {encoding}") 93 | self.df = pd.read_csv( 94 | self.file_path, 95 | encoding=encoding, 96 | sep=None, 97 | engine='python', 98 | dtype=str, 99 | na_values=['', 'NULL', 'null', 'None', 'N/A', 'n/a', 'NaN', 'nan'], 100 | keep_default_na=True, 101 | ) 102 | self._original_dtypes = {col: 'string' for col in self.df.columns} 103 | self._infer_types() 104 | self.logger.info(f"Successfully read CSV with encoding: {encoding}") 105 | break 106 | except UnicodeDecodeError: 107 | continue 108 | else: 109 | raise CsvHandlerError(f"Could not decode CSV file with any common encoding") 110 | 111 | def _infer_types(self) -> None: 112 | """Infer appropriate data types for columns.""" 113 | if self.df is None: 114 | return 115 | 116 | for col in self.df.columns: 117 | # Try to convert to numeric 118 | numeric_converted = pd.to_numeric(self.df[col], errors='coerce') 119 | if not numeric_converted.isna().all(): 120 | # If most values can be converted to numeric, use numeric type 121 | non_na_original = self.df[col].notna().sum() 122 | non_na_converted = numeric_converted.notna().sum() 123 | 124 | if non_na_converted / max(non_na_original, 1) > 0.8: # 80% conversion success 125 | self.df[col] 
= numeric_converted 126 | if (numeric_converted == numeric_converted.astype('Int64', errors='ignore')).all(): 127 | self._original_dtypes[col] = 'integer' 128 | else: 129 | self._original_dtypes[col] = 'float' 130 | continue 131 | 132 | # Try to convert to datetime 133 | try: 134 | datetime_converted = pd.to_datetime(self.df[col], errors='coerce', infer_datetime_format=True) 135 | if not datetime_converted.isna().all(): 136 | non_na_original = self.df[col].notna().sum() 137 | non_na_converted = datetime_converted.notna().sum() 138 | 139 | if non_na_converted / max(non_na_original, 1) > 0.8: # 80% conversion success 140 | self.df[col] = datetime_converted 141 | self._original_dtypes[col] = 'datetime' 142 | continue 143 | except (ValueError, TypeError): 144 | pass 145 | 146 | # Try to convert to boolean 147 | bool_values = self.df[col].str.lower().isin(['true', 'false', 't', 'f', '1', '0', 'yes', 'no', 'y', 'n']) 148 | if bool_values.sum() / len(self.df[col]) > 0.8: 149 | bool_mapping = { 150 | 'true': True, 'false': False, 't': True, 'f': False, 151 | '1': True, '0': False, 'yes': True, 'no': False, 152 | 'y': True, 'n': False 153 | } 154 | self.df[col] = self.df[col].str.lower().map(bool_mapping) 155 | self._original_dtypes[col] = 'boolean' 156 | continue 157 | 158 | # Keep as string 159 | self._original_dtypes[col] = 'string' 160 | 161 | def close(self) -> None: 162 | """Close and cleanup resources (CSV data is held in memory).""" 163 | if self.df is not None: 164 | self.logger.info(f"Closed CSV handler for: {self.file_path.name}") 165 | self.df = None 166 | self._original_dtypes = None 167 | 168 | def get_metadata_summary(self) -> Dict[str, Any]: 169 | """ 170 | Get a summary dictionary of the CSV file's metadata. 171 | 172 | Returns: 173 | A dictionary containing metadata like file path, format, row count, columns, size. 
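        Example (illustrative shape only; values depend on the file):
            {
                "File Information": {"Path": "...", "Format": "CSV", "Size": "1.2 MB"},
                "Data Structure": {"Total Rows": "891", "Total Columns": "12",
                                   "Memory Usage": "123,456 bytes"},
                "Column Types Summary": {"Text Columns": "5", "Integer Columns": "4"},
            }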
174 | """ 175 | if self.df is None: 176 | return {"error": "CSV data not loaded or handler closed."} 177 | 178 | try: 179 | file_size = self.file_path.stat().st_size 180 | size_str = self.format_size(file_size) 181 | except Exception as e: 182 | self.logger.warning(f"Could not get file size for {self.file_path}: {e}") 183 | size_str = "N/A" 184 | 185 | # Create a well-structured metadata summary 186 | summary = { 187 | "File Information": { 188 | "Path": str(self.file_path), 189 | "Format": "CSV", 190 | "Size": size_str 191 | }, 192 | "Data Structure": { 193 | "Total Rows": f"{len(self.df):,}", 194 | "Total Columns": f"{len(self.df.columns):,}", 195 | "Memory Usage": f"{self.df.memory_usage(deep=True).sum():,} bytes" 196 | }, 197 | "Column Types Summary": self._get_column_types_summary() 198 | } 199 | 200 | return summary 201 | 202 | def _get_column_types_summary(self) -> Dict[str, int]: 203 | """Get a summary of column types in the CSV data.""" 204 | if self.df is None or self._original_dtypes is None: 205 | return {} 206 | 207 | type_counts = {} 208 | for col_type in self._original_dtypes.values(): 209 | type_counts[col_type] = type_counts.get(col_type, 0) + 1 210 | 211 | # Format for better display 212 | formatted_summary = {} 213 | type_labels = { 214 | 'string': 'Text Columns', 215 | 'integer': 'Integer Columns', 216 | 'float': 'Numeric Columns', 217 | 'datetime': 'Date/Time Columns', 218 | 'boolean': 'Boolean Columns' 219 | } 220 | 221 | for type_key, count in type_counts.items(): 222 | label = type_labels.get(type_key, f'{type_key.title()} Columns') 223 | formatted_summary[label] = f"{count:,}" 224 | 225 | return formatted_summary 226 | 227 | def get_schema_data(self) -> Optional[List[Dict[str, Any]]]: 228 | """ 229 | Get the schema of the CSV data. 230 | 231 | Returns: 232 | A list of dictionaries describing columns (name, type, nullable), 233 | or None if schema couldn't be determined. 234 | """ 235 | if self.df is None: 236 | self.logger.warning("DataFrame is not available for schema data") 237 | return None 238 | 239 | schema_list = [] 240 | 241 | for col in self.df.columns: 242 | try: 243 | # Get the inferred type 244 | col_type = self._original_dtypes.get(col, 'string') 245 | 246 | # Check for null values 247 | has_nulls = self.df[col].isna().any() 248 | 249 | schema_list.append({ 250 | "name": str(col), 251 | "type": col_type, 252 | "nullable": bool(has_nulls) 253 | }) 254 | 255 | except Exception as e: 256 | self.logger.error(f"Error processing column '{col}' for schema data: {e}") 257 | schema_list.append({ 258 | "name": str(col), 259 | "type": f"[Error: {e}]", 260 | "nullable": None 261 | }) 262 | 263 | return schema_list 264 | 265 | def get_data_preview(self, num_rows: int = 50) -> Optional[pd.DataFrame]: 266 | """ 267 | Fetch a preview of the data. 268 | 269 | Args: 270 | num_rows: The maximum number of rows to fetch. 271 | 272 | Returns: 273 | A pandas DataFrame with preview data, an empty DataFrame if no data, 274 | or a DataFrame with an 'error' column on failure. 
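        Example (illustrative):
            preview = handler.get_data_preview(num_rows=10)  # first 10 rows as a DataFrame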
275 | """ 276 | if self.df is None: 277 | self.logger.warning("CSV data not available for preview") 278 | return pd.DataFrame({"error": ["CSV data not loaded or handler closed."]}) 279 | 280 | try: 281 | if self.df.empty: 282 | self.logger.info("CSV file has no data rows") 283 | return pd.DataFrame(columns=self.df.columns) 284 | 285 | # Return first num_rows 286 | preview_df = self.df.head(num_rows).copy() 287 | self.logger.info(f"Generated preview of {len(preview_df)} rows for {self.file_path.name}") 288 | return preview_df 289 | 290 | except Exception as e: 291 | self.logger.exception(f"Error generating data preview from CSV file: {self.file_path.name}") 292 | return pd.DataFrame({"error": [f"Failed to generate preview: {e}"]}) 293 | 294 | def get_column_stats(self, column_name: str) -> Dict[str, Any]: 295 | """ 296 | Calculate and return statistics for a specific column. 297 | 298 | Args: 299 | column_name: The name of the column. 300 | 301 | Returns: 302 | A dictionary containing column statistics or error information. 303 | """ 304 | if self.df is None: 305 | return self._create_stats_result( 306 | column_name, "Unknown", {}, error="CSV data not loaded or handler closed." 307 | ) 308 | 309 | if column_name not in self.df.columns: 310 | return self._create_stats_result( 311 | column_name, "Unknown", {}, error=f"Column '{column_name}' not found in CSV data." 312 | ) 313 | 314 | try: 315 | col_series = self.df[column_name] 316 | col_type = self._original_dtypes.get(column_name, 'string') 317 | 318 | # Basic counts 319 | total_count = len(col_series) 320 | null_count = col_series.isna().sum() 321 | valid_count = total_count - null_count 322 | null_percentage = (null_count / total_count * 100) if total_count > 0 else 0 323 | 324 | stats = { 325 | "Total Count": f"{total_count:,}", 326 | "Valid Count": f"{valid_count:,}", 327 | "Null Count": f"{null_count:,}", 328 | "Null Percentage": f"{null_percentage:.2f}%" 329 | } 330 | 331 | # Type-specific statistics 332 | if valid_count > 0: 333 | valid_series = col_series.dropna() 334 | 335 | # Distinct count (always applicable) 336 | distinct_count = valid_series.nunique() 337 | stats["Distinct Count"] = f"{distinct_count:,}" 338 | 339 | if col_type in ['integer', 'float']: 340 | # Numeric statistics 341 | stats.update(self._calculate_numeric_stats_pandas(valid_series)) 342 | elif col_type == 'datetime': 343 | # Datetime statistics 344 | stats.update(self._calculate_datetime_stats_pandas(valid_series)) 345 | elif col_type == 'boolean': 346 | # Boolean statistics 347 | stats.update(self._calculate_boolean_stats_pandas(valid_series)) 348 | elif col_type == 'string': 349 | # String statistics (min/max by alphabetical order) 350 | stats.update(self._calculate_string_stats_pandas(valid_series)) 351 | 352 | return self._create_stats_result(column_name, col_type, stats, nullable=null_count > 0) 353 | 354 | except Exception as e: 355 | self.logger.exception(f"Error calculating stats for column '{column_name}'") 356 | return self._create_stats_result( 357 | column_name, "Unknown", {}, error=f"Failed to calculate statistics: {e}" 358 | ) 359 | 360 | def _calculate_numeric_stats_pandas(self, series: pd.Series) -> Dict[str, Any]: 361 | """Calculate statistics for numeric columns using pandas.""" 362 | stats = {} 363 | try: 364 | stats["Min"] = series.min() 365 | stats["Max"] = series.max() 366 | stats["Mean"] = f"{series.mean():.4f}" 367 | stats["Median (50%)"] = series.median() 368 | stats["StdDev"] = f"{series.std():.4f}" 369 | 370 | # Add histogram data for 
visualization 371 | try: 372 | # Sample data if too large for performance 373 | sample_size = min(10000, len(series)) 374 | if len(series) > sample_size: 375 | sampled_series = series.sample(n=sample_size, random_state=42) 376 | else: 377 | sampled_series = series 378 | 379 | # Convert to list for histogram 380 | clean_data = sampled_series.tolist() 381 | 382 | if len(clean_data) > 10: # Only create histogram if we have enough data 383 | stats["_histogram_data"] = clean_data 384 | stats["_data_type"] = "numeric" 385 | 386 | except Exception as e: 387 | self.logger.warning(f"Failed to prepare histogram data: {e}") 388 | 389 | except Exception as e: 390 | self.logger.warning(f"Error calculating numeric stats: {e}") 391 | stats["Calculation Error"] = str(e) 392 | return stats 393 | 394 | def _calculate_datetime_stats_pandas(self, series: pd.Series) -> Dict[str, Any]: 395 | """Calculate statistics for datetime columns using pandas.""" 396 | stats = {} 397 | try: 398 | stats["Min"] = series.min() 399 | stats["Max"] = series.max() 400 | # Calculate time range 401 | time_range = series.max() - series.min() 402 | stats["Range"] = str(time_range) 403 | except Exception as e: 404 | self.logger.warning(f"Error calculating datetime stats: {e}") 405 | stats["Calculation Error"] = str(e) 406 | return stats 407 | 408 | def _calculate_boolean_stats_pandas(self, series: pd.Series) -> Dict[str, Any]: 409 | """Calculate statistics for boolean columns using pandas.""" 410 | stats = {} 411 | try: 412 | value_counts = series.value_counts() 413 | stats["True Count"] = f"{value_counts.get(True, 0):,}" 414 | stats["False Count"] = f"{value_counts.get(False, 0):,}" 415 | if len(value_counts) > 0: 416 | true_pct = (value_counts.get(True, 0) / len(series) * 100) 417 | stats["True Percentage"] = f"{true_pct:.2f}%" 418 | except Exception as e: 419 | self.logger.warning(f"Error calculating boolean stats: {e}") 420 | stats["Calculation Error"] = str(e) 421 | return stats 422 | 423 | def _calculate_string_stats_pandas(self, series: pd.Series) -> Dict[str, Any]: 424 | """Calculate statistics for string columns using pandas.""" 425 | stats = {} 426 | try: 427 | # Only min/max for strings (alphabetical order) 428 | stats["Min"] = str(series.min()) 429 | stats["Max"] = str(series.max()) 430 | 431 | # Most common values 432 | value_counts = series.value_counts().head(5) 433 | if len(value_counts) > 0: 434 | top_values = {} 435 | for value, count in value_counts.items(): 436 | top_values[str(value)] = f"{count:,}" 437 | stats["Top Values"] = top_values 438 | except Exception as e: 439 | self.logger.warning(f"Error calculating string stats: {e}") 440 | stats["Calculation Error"] = str(e) 441 | return stats 442 | 443 | def _create_stats_result( 444 | self, 445 | column_name: str, 446 | col_type: str, 447 | calculated_stats: Dict[str, Any], 448 | nullable: Optional[bool] = None, 449 | error: Optional[str] = None, 450 | message: Optional[str] = None 451 | ) -> Dict[str, Any]: 452 | """Package the stats results consistently.""" 453 | return { 454 | "column": column_name, 455 | "type": col_type, 456 | "nullable": nullable if nullable is not None else "Unknown", 457 | "calculated": calculated_stats or {}, 458 | "error": error, 459 | "message": message, 460 | } 461 | -------------------------------------------------------------------------------- /src/parqv/data_sources/formats/json.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import Any, Dict, List, 
Optional, Tuple 3 | 4 | import duckdb 5 | import pandas as pd 6 | 7 | from ..base import DataHandler, DataHandlerError 8 | 9 | 10 | class JsonHandlerError(DataHandlerError): 11 | """Custom exception for JSON handling errors.""" 12 | pass 13 | 14 | 15 | class JsonHandler(DataHandler): 16 | """ 17 | Handles JSON file interactions using DuckDB. 18 | 19 | Leverages DuckDB's `read_json_auto` for parsing standard JSON and JSON Lines (ndjson) 20 | and `SUMMARIZE` for efficient statistics calculation. 21 | 22 | Attributes: 23 | file_path (Path): Path to the JSON file. 24 | """ 25 | DEFAULT_VIEW_NAME = "json_data_view" 26 | 27 | def __init__(self, file_path: Path): 28 | """ 29 | Initializes the JsonHandler. 30 | 31 | Args: 32 | file_path: Path to the JSON file. 33 | 34 | Raises: 35 | JsonHandlerError: If the file doesn't exist, isn't a file, or if 36 | initialization fails (e.g., DuckDB connection, view creation). 37 | """ 38 | super().__init__(file_path) 39 | 40 | self.file_path = self._validate_file_path(file_path) 41 | self._db_conn: Optional[duckdb.DuckDBPyConnection] = None 42 | self._view_name: str = self.DEFAULT_VIEW_NAME 43 | self._schema: Optional[List[Dict[str, Any]]] = None 44 | self._row_count: Optional[int] = None 45 | 46 | try: 47 | self._connect_db() 48 | self._create_duckdb_view() 49 | self._load_metadata() 50 | self.logger.info(f"JsonHandler initialized successfully for: {self.file_path}") 51 | except Exception as e: 52 | self.logger.exception(f"Error during JsonHandler initialization for {self.file_path}") 53 | self.close() 54 | if isinstance(e, JsonHandlerError): 55 | raise 56 | raise JsonHandlerError(f"Failed to initialize JSON handler: {e}") from e 57 | 58 | def _validate_file_path(self, file_path: Path) -> Path: 59 | """Checks if the file path is valid.""" 60 | resolved_path = file_path.resolve() 61 | if not resolved_path.is_file(): 62 | raise JsonHandlerError(f"JSON file not found or is not a file: {resolved_path}") 63 | return resolved_path 64 | 65 | def _connect_db(self): 66 | """Establishes a connection to an in-memory DuckDB database.""" 67 | try: 68 | self._db_conn = duckdb.connect(database=':memory:', read_only=False) 69 | self.logger.debug("DuckDB in-memory connection established.") 70 | except Exception as e: 71 | self.logger.exception("Failed to initialize DuckDB connection.") 72 | raise JsonHandlerError(f"DuckDB connection failed: {e}") from e 73 | 74 | def _create_duckdb_view(self): 75 | """Creates a DuckDB view pointing to the JSON file.""" 76 | if not self._db_conn: 77 | raise JsonHandlerError("DuckDB connection not available for view creation.") 78 | 79 | file_path_str = str(self.file_path) 80 | safe_view_name = f'"{self._view_name}"' 81 | load_query = f"CREATE OR REPLACE VIEW {safe_view_name} AS SELECT * FROM read_json_auto('{file_path_str}');" 82 | 83 | try: 84 | self._db_conn.sql(load_query) 85 | self.logger.debug(f"DuckDB view '{self._view_name}' created for file '{file_path_str}'.") 86 | except duckdb.Error as db_err: 87 | self.logger.exception(f"DuckDB Error creating view '{self._view_name}' from '{file_path_str}': {db_err}") 88 | if "Could not open file" in str(db_err): 89 | raise JsonHandlerError( 90 | f"DuckDB could not open file: {file_path_str}. Check permissions or path. Error: {db_err}") from db_err 91 | elif "JSON Error" in str(db_err) or "Parse Error" in str(db_err): 92 | raise JsonHandlerError( 93 | f"DuckDB failed to parse JSON file: {file_path_str}. Check format. 
Error: {db_err}") from db_err 94 | else: 95 | raise JsonHandlerError(f"DuckDB failed create view for JSON file: {db_err}") from db_err 96 | except Exception as e: 97 | self.logger.exception(f"Unexpected error creating DuckDB view '{self._view_name}'.") 98 | raise JsonHandlerError(f"Failed to create DuckDB view: {e}") from e 99 | 100 | def _load_metadata(self): 101 | """Fetches schema and row count from the DuckDB view.""" 102 | if not self._db_conn: 103 | raise JsonHandlerError("Cannot fetch metadata, DuckDB connection not available.") 104 | 105 | try: 106 | # Fetch Schema 107 | describe_query = f"DESCRIBE \"{self._view_name}\";" 108 | schema_result = self._db_conn.sql(describe_query).fetchall() 109 | self._schema = self._parse_schema(schema_result) 110 | self.logger.debug(f"Schema fetched for view '{self._view_name}': {len(self._schema)} columns.") 111 | 112 | # Fetch Row Count 113 | count_query = f"SELECT COUNT(*) FROM \"{self._view_name}\";" 114 | count_result = self._db_conn.sql(count_query).fetchone() 115 | self._row_count = count_result[0] if count_result else 0 116 | self.logger.debug(f"Row count fetched for view '{self._view_name}': {self._row_count}") 117 | 118 | except duckdb.Error as db_err: 119 | self.logger.exception(f"DuckDB Error fetching metadata for view '{self._view_name}': {db_err}") 120 | self._schema = None 121 | self._row_count = None 122 | except Exception as e: 123 | self.logger.exception(f"Unexpected error fetching metadata for view '{self._view_name}'") 124 | self._schema = None 125 | self._row_count = None 126 | 127 | def _parse_schema(self, describe_output: List[Tuple]) -> List[Dict[str, Any]]: 128 | """Parses the output of DuckDB's DESCRIBE query.""" 129 | if not describe_output: 130 | self.logger.warning(f"DESCRIBE query for view '{self._view_name}' returned no schema info.") 131 | return [] 132 | 133 | parsed_schema = [] 134 | for row in describe_output: 135 | # Handle potential variations in DESCRIBE output columns 136 | if len(row) >= 3: 137 | name, type_str, null_str = row[0], row[1], row[2] 138 | is_nullable = None 139 | if isinstance(null_str, str): 140 | is_nullable = null_str.upper() == 'YES' 141 | parsed_schema.append({"name": name, "type": type_str, "nullable": is_nullable}) 142 | else: 143 | self.logger.warning(f"Unexpected format in DESCRIBE output row: {row}") 144 | return parsed_schema 145 | 146 | def get_schema_data(self) -> Optional[List[Dict[str, Any]]]: 147 | """ 148 | Returns the schema of the JSON data. 149 | 150 | Returns: 151 | A list of dictionaries describing columns (name, type, nullable), 152 | or None if schema couldn't be fetched. 153 | """ 154 | if self._schema is None: 155 | self.logger.warning("Schema is unavailable. It might not have been fetched successfully.") 156 | return self._schema 157 | 158 | def get_metadata_summary(self) -> Dict[str, Any]: 159 | """ 160 | Provides a summary dictionary of the JSON file's metadata. 161 | 162 | Returns: 163 | A dictionary containing metadata like file path, format, row count, columns, size. 
164 | """ 165 | if not self._db_conn: 166 | return {"error": "DuckDB connection not initialized or closed."} 167 | 168 | row_count_str = "N/A (Error fetching)" 169 | if self._row_count is not None: 170 | row_count_str = f"{self._row_count:,}" 171 | 172 | columns_str = "N/A (Error fetching)" 173 | if self._schema is not None: 174 | columns_str = f"{len(self._schema):,}" 175 | 176 | summary = { 177 | "File Path": str(self.file_path), 178 | "Format": "JSON/JSONL", 179 | "DuckDB View": self._view_name, 180 | "Total Rows": row_count_str, 181 | "Columns": columns_str, 182 | } 183 | try: 184 | summary["Size"] = f"{self.file_path.stat().st_size:,} bytes" 185 | except Exception as e: 186 | self.logger.warning(f"Could not get file size for {self.file_path}: {e}") 187 | summary["Size"] = "N/A" 188 | 189 | return summary 190 | 191 | def get_data_preview(self, num_rows: int = 50) -> pd.DataFrame: 192 | """ 193 | Fetches a preview of the data. 194 | 195 | Args: 196 | num_rows: The maximum number of rows to preview. 197 | 198 | Returns: 199 | A pandas DataFrame containing the first `num_rows` of data, 200 | an empty DataFrame if the file is empty, or a DataFrame with an 201 | error message if fetching fails. 202 | """ 203 | if not self._db_conn: 204 | self.logger.warning("Data preview unavailable: DuckDB connection is closed or uninitialized.") 205 | return pd.DataFrame({"error": ["DuckDB connection not available."]}) 206 | if self._schema is None: 207 | self.logger.warning("Data preview unavailable: Schema couldn't be determined.") 208 | return pd.DataFrame({"error": ["Schema not available, cannot fetch preview."]}) 209 | if self._row_count == 0: 210 | self.logger.info("Data preview: Source JSON view is empty.") 211 | # Return empty DataFrame with correct columns if possible 212 | if self._schema: 213 | return pd.DataFrame(columns=[col['name'] for col in self._schema]) 214 | else: 215 | return pd.DataFrame() # Fallback 216 | 217 | try: 218 | limit = max(1, num_rows) 219 | preview_query = f"SELECT * FROM \"{self._view_name}\" LIMIT {limit};" 220 | df = self._db_conn.sql(preview_query).df() 221 | return df 222 | except duckdb.Error as db_err: 223 | self.logger.exception(f"DuckDB error getting data preview from '{self._view_name}': {db_err}") 224 | return pd.DataFrame({"error": [f"DuckDB error fetching preview: {db_err}"]}) 225 | except Exception as e: 226 | self.logger.exception(f"Unexpected error getting data preview from '{self._view_name}'") 227 | return pd.DataFrame({"error": [f"Failed to fetch preview: {e}"]}) 228 | 229 | def _get_column_info(self, column_name: str) -> Optional[Dict[str, Any]]: 230 | """Retrieves schema information for a specific column.""" 231 | if self._schema is None: 232 | return None 233 | return next((col for col in self._schema if col["name"] == column_name), None) 234 | 235 | def _is_complex_type(self, dtype_str: str) -> bool: 236 | """Checks if a DuckDB data type string represents a complex type.""" 237 | if not isinstance(dtype_str, str): 238 | return False 239 | dtype_upper = dtype_str.upper() 240 | return any(t in dtype_upper for t in ['STRUCT', 'LIST', 'MAP', 'UNION']) 241 | 242 | def get_column_stats(self, column_name: str) -> Dict[str, Any]: 243 | """ 244 | Calculates statistics for a given column using DuckDB's SUMMARIZE or basic counts. 245 | 246 | Args: 247 | column_name: The name of the column to analyze. 248 | 249 | Returns: 250 | A dictionary containing calculated statistics, type information, and 251 | any errors or messages. 
252 | """ 253 | if not self._db_conn: 254 | return self._create_stats_result(column_name, "Unknown", {}, error="DuckDB connection not available.") 255 | 256 | col_info = self._get_column_info(column_name) 257 | if not col_info: 258 | return self._create_stats_result(column_name, "Unknown", {}, 259 | error=f"Column '{column_name}' not found in schema.") 260 | 261 | col_type = col_info["type"] 262 | col_nullable = col_info["nullable"] # Already boolean or None 263 | is_complex = self._is_complex_type(col_type) 264 | safe_column_name = f'"{column_name}"' # Quote column name for safety 265 | stats: Dict[str, Any] = {} 266 | error_msg: Optional[str] = None 267 | message: Optional[str] = None 268 | 269 | try: 270 | if self._row_count == 0: 271 | message = "Table is empty. No statistics calculated." 272 | return self._create_stats_result(column_name, col_type, stats, nullable=col_nullable, message=message) 273 | 274 | if is_complex: 275 | # Use basic counts for complex types as SUMMARIZE is less informative 276 | self.logger.debug(f"Calculating basic counts for complex type column: {column_name}") 277 | stats = self._get_basic_column_counts(safe_column_name) 278 | message = f"Only basic counts calculated for complex type '{col_type}'." 279 | # Attempt distinct count for complex types (can be slow/error-prone) 280 | try: 281 | distinct_q = f"SELECT COUNT(DISTINCT {safe_column_name}) FROM \"{self._view_name}\" WHERE {safe_column_name} IS NOT NULL;" 282 | distinct_res = self._db_conn.sql(distinct_q).fetchone() 283 | if distinct_res and distinct_res[0] is not None: 284 | stats["Distinct Count"] = f"{distinct_res[0]:,}" 285 | else: 286 | stats["Distinct Count"] = "N/A" # Or 0 if appropriate 287 | except duckdb.Error as distinct_err: 288 | self.logger.warning( 289 | f"Could not calculate distinct count for complex column '{column_name}': {distinct_err}") 290 | stats["Distinct Count"] = "Error" 291 | 292 | else: 293 | # Use SUMMARIZE for non-complex types 294 | self.logger.debug(f"Using SUMMARIZE for simple type column: {column_name}") 295 | summarize_query = f"SUMMARIZE SELECT {safe_column_name} FROM \"{self._view_name}\";" 296 | summarize_df = self._db_conn.sql(summarize_query).df() 297 | 298 | if summarize_df.empty: 299 | message = "SUMMARIZE returned no results (column might be all NULLs or empty)." 
300 | # Get basic counts as fallback if summarize is empty 301 | stats = self._get_basic_column_counts(safe_column_name) 302 | else: 303 | # SUMMARIZE puts results in the first row 304 | stats = self._format_summarize_stats(summarize_df.iloc[0]) 305 | 306 | # Add histogram data for numeric columns 307 | try: 308 | self._add_histogram_data_if_numeric(stats, safe_column_name) 309 | except Exception as hist_e: 310 | self.logger.warning(f"Failed to add histogram data for {column_name}: {hist_e}") 311 | 312 | except duckdb.Error as db_err: 313 | self.logger.exception(f"DuckDB Error calculating statistics for column '{column_name}': {db_err}") 314 | error_msg = f"DuckDB calculation failed: {db_err}" 315 | except Exception as e: 316 | self.logger.exception(f"Unexpected error calculating statistics for column '{column_name}'") 317 | error_msg = f"Calculation failed unexpectedly: {e}" 318 | 319 | return self._create_stats_result( 320 | column_name, col_type, stats, nullable=col_nullable, error=error_msg, message=message 321 | ) 322 | 323 | def _get_basic_column_counts(self, safe_column_name: str) -> Dict[str, Any]: 324 | """Helper to get total, null, and valid counts for a column.""" 325 | stats = {} 326 | if not self._db_conn or self._row_count is None: 327 | return {"error": "Connection or row count unavailable for basic counts"} 328 | 329 | if self._row_count == 0: 330 | stats["Total Count"] = "0" 331 | stats["Valid Count"] = "0" 332 | stats["Null Count"] = "0" 333 | stats["Null Percentage"] = "N/A" 334 | return stats 335 | 336 | try: 337 | q_counts = f""" 338 | SELECT 339 | SUM(CASE WHEN {safe_column_name} IS NULL THEN 1 ELSE 0 END) AS null_count 340 | FROM "{self._view_name}"; 341 | """ 342 | counts_res = self._db_conn.sql(q_counts).fetchone() 343 | 344 | if counts_res: 345 | null_count = counts_res[0] if counts_res[0] is not None else 0 346 | total_count = self._row_count 347 | valid_count = total_count - null_count 348 | stats["Total Count"] = f"{total_count:,}" 349 | stats["Valid Count"] = f"{valid_count:,}" 350 | stats["Null Count"] = f"{null_count:,}" 351 | stats["Null Percentage"] = f"{(null_count / total_count * 100):.2f}%" if total_count > 0 else "N/A" 352 | else: 353 | stats["Total Count"] = f"{self._row_count:,}" 354 | stats["Valid Count"] = "Error" 355 | stats["Null Count"] = "Error" 356 | stats["Null Percentage"] = "Error" 357 | 358 | except duckdb.Error as db_err: 359 | self.logger.warning(f"Failed to get basic counts for {safe_column_name}: {db_err}") 360 | stats["Counts Error"] = str(db_err) 361 | return stats 362 | 363 | def _format_summarize_stats(self, summarize_row: pd.Series) -> Dict[str, Any]: 364 | """Formats the output of DuckDB's SUMMARIZE into a stats dictionary.""" 365 | stats = {} 366 | if 'count' in summarize_row and pd.notna(summarize_row['count']): 367 | total_count = int(summarize_row['count']) 368 | stats["Total Count"] = f"{total_count:,}" 369 | null_count = 0 370 | if 'null_percentage' in summarize_row and pd.notna(summarize_row['null_percentage']): 371 | null_perc = summarize_row['null_percentage'] 372 | null_count = int(round(total_count * (null_perc / 100.0))) 373 | stats["Null Percentage"] = f"{null_perc:.2f}%" 374 | stats["Null Count"] = f"{null_count:,}" 375 | else: 376 | stats["Null Percentage"] = "0.00%" # Assume 0 if missing 377 | stats["Null Count"] = "0" 378 | 379 | stats["Valid Count"] = f"{total_count - null_count:,}" 380 | else: 381 | stats["Total Count"] = "N/A" 382 | stats["Valid Count"] = "N/A" 383 | stats["Null Count"] = "N/A" 384 | 
stats["Null Percentage"] = "N/A" 385 | 386 | # Distinct Count 387 | if 'distinct' in summarize_row and pd.notna(summarize_row['distinct']): 388 | stats["Distinct Count"] = f"{int(summarize_row['distinct']):,}" 389 | 390 | # Numeric Stats 391 | if 'min' in summarize_row and pd.notna(summarize_row['min']): 392 | stats["Min"] = summarize_row['min'] 393 | if 'max' in summarize_row and pd.notna(summarize_row['max']): 394 | stats["Max"] = summarize_row['max'] 395 | if 'mean' in summarize_row and pd.notna(summarize_row['mean']): 396 | try: 397 | stats["Mean"] = f"{float(summarize_row['mean']):.4f}" 398 | except (ValueError, TypeError): 399 | stats["Mean"] = str(summarize_row['mean']) 400 | if 'std' in summarize_row and pd.notna(summarize_row['std']): 401 | try: 402 | stats["StdDev"] = f"{float(summarize_row['std']):.4f}" 403 | except (ValueError, TypeError): 404 | stats["StdDev"] = str(summarize_row['std']) 405 | 406 | # Quantiles (example for median) 407 | if '50%' in summarize_row and pd.notna(summarize_row['50%']): 408 | stats["Median (50%)"] = summarize_row['50%'] 409 | 410 | return stats 411 | 412 | def _add_histogram_data_if_numeric(self, stats: Dict[str, Any], safe_column_name: str) -> None: 413 | """Add histogram data for numeric columns by sampling from DuckDB.""" 414 | # Check if this looks like numeric data (has Mean, Min, Max) 415 | if not all(key in stats for key in ["Mean", "Min", "Max"]): 416 | return 417 | 418 | try: 419 | # Sample data for histogram (limit to 10k samples for performance) 420 | sample_query = f""" 421 | SELECT {safe_column_name} 422 | FROM "{self._view_name}" 423 | WHERE {safe_column_name} IS NOT NULL 424 | USING SAMPLE 10000 425 | """ 426 | 427 | sample_df = self._db_conn.sql(sample_query).df() 428 | 429 | if not sample_df.empty and len(sample_df) > 10: 430 | # Extract the column data 431 | column_data = sample_df.iloc[:, 0].tolist() 432 | 433 | # Filter out any remaining nulls 434 | clean_data = [val for val in column_data if val is not None] 435 | 436 | if len(clean_data) > 10: 437 | stats["_histogram_data"] = clean_data 438 | stats["_data_type"] = "numeric" 439 | 440 | except Exception as e: 441 | self.logger.warning(f"Failed to sample data for histogram: {e}") 442 | 443 | def _create_stats_result( 444 | self, 445 | column_name: str, 446 | col_type: str, 447 | calculated_stats: Dict[str, Any], 448 | nullable: Optional[bool] = None, 449 | error: Optional[str] = None, 450 | message: Optional[str] = None 451 | ) -> Dict[str, Any]: 452 | """Packages the stats results consistently.""" 453 | return { 454 | "column": column_name, 455 | "type": col_type, 456 | "nullable": nullable if nullable is not None else "Unknown", 457 | "calculated": calculated_stats or {}, 458 | "basic_metadata_stats": None, 459 | "metadata_stats_error": None, 460 | "error": error, 461 | "message": message, 462 | } 463 | 464 | def close(self): 465 | """Closes the DuckDB connection if it's open.""" 466 | if self._db_conn: 467 | try: 468 | self._db_conn.close() 469 | self.logger.info(f"DuckDB connection closed for {self.file_path}.") 470 | self._db_conn = None 471 | except Exception as e: 472 | # Log error but don't raise during close typically 473 | self.logger.error(f"Error closing DuckDB connection for {self.file_path}: {e}") 474 | self._db_conn = None # Assume closed even if error occurred 475 | 476 | def __enter__(self): 477 | """Enter context management.""" 478 | return self 479 | 480 | def __exit__(self, exc_type, exc_val, exc_tb): 481 | """Exit context management, ensuring connection 
closure.""" 482 | self.close() 483 | 484 | def __del__(self): 485 | """Ensures connection is closed when object is garbage collected (best effort).""" 486 | self.close() 487 | -------------------------------------------------------------------------------- /src/parqv/data_sources/formats/parquet.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import Any, Dict, List, Tuple, Optional, Union 3 | 4 | import pandas as pd 5 | import pyarrow as pa 6 | import pyarrow.compute as pc 7 | import pyarrow.parquet as pq 8 | 9 | from ..base import DataHandler, DataHandlerError 10 | 11 | 12 | class ParquetHandlerError(DataHandlerError): 13 | """Custom exception for Parquet Handler errors.""" 14 | pass 15 | 16 | 17 | class ParquetHandler(DataHandler): 18 | """ 19 | Handles Parquet file interactions using PyArrow. 20 | 21 | Provides methods to access metadata, schema, data preview, and column statistics. 22 | Manages the Parquet file resource lifecycle. 23 | """ 24 | 25 | def __init__(self, file_path: Path): 26 | """ 27 | Initializes the ParquetHandler by validating the path and opening the Parquet file. 28 | 29 | Args: 30 | file_path: Path to the Parquet file. 31 | 32 | Raises: 33 | ParquetHandlerError: If the file is not found, not a file, or cannot be opened/read. 34 | """ 35 | super().__init__(file_path) 36 | self.pq_file: Optional[pq.ParquetFile] = None 37 | self.schema: Optional[pa.Schema] = None 38 | self.metadata: Optional[pq.FileMetaData] = None 39 | 40 | try: 41 | # Validate file existence using the path stored by the base class 42 | if not self.file_path.is_file(): 43 | raise FileNotFoundError(f"Parquet file not found or is not a file: {self.file_path}") 44 | 45 | # Open the Parquet file 46 | self.pq_file = pq.ParquetFile(self.file_path) 47 | self.schema = self.pq_file.schema_arrow 48 | self.metadata = self.pq_file.metadata 49 | self.logger.info(f"Successfully initialized ParquetHandler for: {self.file_path.name}") 50 | 51 | except FileNotFoundError as fnf_e: 52 | self.logger.error(f"File not found during ParquetHandler initialization: {fnf_e}") 53 | raise ParquetHandlerError(str(fnf_e)) from fnf_e 54 | except pa.lib.ArrowIOError as arrow_io_e: 55 | self.logger.error(f"Arrow IO Error initializing ParquetHandler for {self.file_path.name}: {arrow_io_e}") 56 | raise ParquetHandlerError( 57 | f"Failed to open Parquet file '{self.file_path.name}': {arrow_io_e}") from arrow_io_e 58 | except Exception as e: 59 | self.logger.exception(f"Unexpected error initializing ParquetHandler for {self.file_path.name}") 60 | self.close() 61 | raise ParquetHandlerError(f"Failed to initialize Parquet handler '{self.file_path.name}': {e}") from e 62 | 63 | # Resource Management 64 | def close(self) -> None: 65 | """Closes the Parquet file resource if it's open.""" 66 | if self.pq_file is not None: 67 | try: 68 | # ParquetFile might not have a close method depending on source, check first 69 | if hasattr(self.pq_file, 'close'): 70 | self.pq_file.close() 71 | self.logger.info(f"Closed Parquet file: {self.file_path.name}") 72 | except Exception as e: 73 | # Log error during close but don't raise, as we're cleaning up 74 | self.logger.warning(f"Exception while closing Parquet file {self.file_path.name}: {e}") 75 | finally: 76 | self.pq_file = None 77 | self.schema = None 78 | self.metadata = None 79 | 80 | def __enter__(self): 81 | """Enter the runtime context related to this object.""" 82 | if not self.pq_file: 83 | raise 
ParquetHandlerError("Parquet file is not open or handler was closed.") 84 | return self 85 | 86 | def __exit__(self, exc_type, exc_val, exc_tb): 87 | """Exit the runtime context related to this object, ensuring cleanup.""" 88 | self.close() 89 | 90 | def __del__(self): 91 | """Attempt to close the file when the object is garbage collected (best effort).""" 92 | self.close() 93 | 94 | def get_metadata_summary(self) -> Dict[str, Any]: 95 | """ 96 | Provides a summary dictionary of the Parquet file's metadata. 97 | 98 | Returns: 99 | A dictionary containing key metadata attributes, or an error dictionary. 100 | """ 101 | if not self.metadata or not self.schema: 102 | self.logger.warning(f"Metadata or schema not available for summary: {self.file_path.name}") 103 | return {"error": "Metadata or schema not available"} 104 | 105 | try: 106 | created_by = self._decode_metadata_bytes(self.metadata.created_by) or "N/A" 107 | file_size = self.file_path.stat().st_size 108 | summary = { 109 | "File Path": str(self.file_path.resolve()), 110 | "Format": "Parquet", 111 | "Size": self._format_size(file_size), 112 | "Total Rows": f"{self.metadata.num_rows:,}", 113 | "Row Groups": self.metadata.num_row_groups, 114 | "Columns": self.metadata.num_columns, 115 | "Format Version": self.metadata.format_version, 116 | "Creator": created_by, 117 | "Serialization Library": self._decode_metadata_bytes( 118 | self.metadata.serialized_size > 0 and self.metadata.created_by) or "N/A", 119 | } 120 | kv_meta = self._decode_key_value_metadata(self.metadata.metadata) 121 | if kv_meta: 122 | summary["Key Value Metadata"] = kv_meta 123 | 124 | return summary 125 | except Exception as e: 126 | self.logger.exception(f"Error generating metadata summary for {self.file_path.name}") 127 | return {"error": f"Error getting metadata summary: {e}"} 128 | 129 | def get_schema_data(self) -> Optional[List[Dict[str, Any]]]: 130 | """ 131 | Returns a simplified list representation of the Arrow schema. 132 | 133 | Returns: 134 | A list of dictionaries, each describing a column (name, type string, nullable bool), 135 | or None if the schema is unavailable. 136 | """ 137 | if not self.schema: 138 | self.logger.warning(f"Schema is not available for get_schema_data: {self.file_path.name}") 139 | return None 140 | 141 | schema_list = [] 142 | for field in self.schema: 143 | try: 144 | type_str = self._format_pyarrow_type(field.type) 145 | schema_list.append({ 146 | "name": field.name, 147 | "type": type_str, 148 | "nullable": field.nullable 149 | }) 150 | except Exception as e: 151 | self.logger.error(f"Error processing field '{field.name}' for schema data: {e}", exc_info=True) 152 | schema_list.append({ 153 | "name": field.name, 154 | "type": f"[Error: {e}]", 155 | "nullable": None 156 | }) 157 | return schema_list 158 | 159 | def get_data_preview(self, num_rows: int = 50) -> pd.DataFrame: 160 | """ 161 | Fetches a preview of the data from the Parquet file using efficient batch iteration. 162 | 163 | Args: 164 | num_rows: The maximum number of rows to fetch. 165 | 166 | Returns: 167 | A pandas DataFrame with the preview data, potentially using ArrowDTypes. 168 | Returns an empty DataFrame if the file is empty or no data is read. 169 | Returns a DataFrame with an 'error' column on failure. 
170 | """ 171 | if not self.pq_file: 172 | self.logger.warning(f"ParquetFile handler not available for data preview: {self.file_path.name}") 173 | return pd.DataFrame({"error": ["Parquet handler not initialized or closed."]}) 174 | 175 | if self.metadata and self.metadata.num_rows == 0: 176 | self.logger.info(f"Parquet file is empty based on metadata: {self.file_path.name}") 177 | if self.schema: 178 | return pd.DataFrame(columns=self.schema.names) 179 | else: 180 | return pd.DataFrame() 181 | 182 | try: 183 | # Determine rows to fetch, capped by file total 184 | num_rows_to_fetch = num_rows 185 | if self.metadata: 186 | num_rows_to_fetch = min(num_rows, self.metadata.num_rows) 187 | 188 | # Use iter_batches for memory efficiency 189 | batches = [] 190 | rows_read = 0 191 | internal_batch_size = min(max(num_rows_to_fetch // 2, 1024), 65536) 192 | 193 | for batch in self.pq_file.iter_batches(batch_size=internal_batch_size): 194 | if rows_read >= num_rows_to_fetch: 195 | break 196 | rows_needed_in_batch = num_rows_to_fetch - rows_read 197 | slice_len = min(len(batch), rows_needed_in_batch) 198 | batches.append(batch.slice(0, slice_len)) 199 | rows_read += slice_len 200 | if rows_read >= num_rows_to_fetch: 201 | break 202 | 203 | if not batches: 204 | # Check if file might have rows but reading yielded nothing 205 | if self.metadata and self.metadata.num_rows > 0: 206 | self.logger.warning( 207 | f"No batches read for preview, though metadata indicates {self.metadata.num_rows} rows: {self.file_path.name}") 208 | else: 209 | self.logger.info(f"No data read for preview (file likely empty): {self.file_path.name}") 210 | # Return empty DF with columns if schema available 211 | if self.schema: 212 | return pd.DataFrame(columns=self.schema.names) 213 | else: 214 | return pd.DataFrame() 215 | 216 | # Combine batches and convert to Pandas 217 | preview_table = pa.Table.from_batches(batches) 218 | df = preview_table.to_pandas( 219 | split_blocks=True, 220 | self_destruct=True, 221 | types_mapper=pd.ArrowDtype 222 | ) 223 | self.logger.info(f"Generated preview of {len(df)} rows for {self.file_path.name}") 224 | return df 225 | 226 | except Exception as e: 227 | self.logger.exception(f"Error generating data preview from Parquet file: {self.file_path.name}") 228 | return pd.DataFrame({"error": [f"Failed to fetch preview: {e}"]}) 229 | 230 | def get_column_stats(self, column_name: str) -> Dict[str, Any]: 231 | """ 232 | Calculates statistics for a specific column by reading its data. 233 | 234 | Args: 235 | column_name: The name of the column to analyze. 236 | 237 | Returns: 238 | A dictionary containing calculated statistics, metadata statistics, 239 | and potential error or message keys. 
240 | """ 241 | if not self.pq_file or not self.schema: 242 | self.logger.warning(f"Parquet file/schema unavailable for column stats: {self.file_path.name}") 243 | return self._create_stats_result(column_name, None, error="File or schema not available") 244 | 245 | try: 246 | field = self.schema.field(column_name) 247 | except KeyError: 248 | self.logger.warning(f"Column '{column_name}' not found in schema: {self.file_path.name}") 249 | return self._create_stats_result(column_name, None, error=f"Column '{column_name}' not found in schema") 250 | 251 | calculated_stats: Dict[str, Any] = {} 252 | error_msg: Optional[str] = None 253 | message: Optional[str] = None 254 | metadata_stats: Optional[Dict] = None 255 | metadata_stats_error: Optional[str] = None 256 | 257 | try: 258 | # Data Reading 259 | table = self.pq_file.read(columns=[column_name]) 260 | column_data = table.column(0) 261 | self.logger.debug( 262 | f"Finished reading column '{column_name}'. Rows: {len(column_data)}, Nulls: {column_data.null_count}") 263 | 264 | # Basic Counts 265 | total_count = len(column_data) 266 | if total_count > 0: 267 | null_count = column_data.null_count 268 | valid_count = total_count - null_count 269 | calculated_stats["Total Count"] = f"{total_count:,}" 270 | calculated_stats["Valid Count"] = f"{valid_count:,}" 271 | calculated_stats["Null Count"] = f"{null_count:,}" 272 | calculated_stats["Null Percentage"] = f"{(null_count / total_count * 100):.2f}%" 273 | else: 274 | self.logger.info(f"Column '{column_name}' read resulted in 0 rows.") 275 | message = "Column is empty (0 rows)." 276 | valid_count = 0 # Ensure valid_count is 0 for later checks 277 | 278 | # Type-Specific Calculations 279 | if valid_count > 0: 280 | col_type = field.type 281 | self.logger.debug(f"Calculating stats for type: {self._format_pyarrow_type(col_type)}") 282 | try: 283 | if pa.types.is_floating(col_type) or pa.types.is_integer(col_type): 284 | calculated_stats.update(self._calculate_numeric_stats(column_data)) 285 | elif pa.types.is_temporal(col_type): 286 | calculated_stats.update(self._calculate_temporal_stats(column_data)) 287 | elif pa.types.is_string(col_type) or pa.types.is_large_string(col_type) \ 288 | or pa.types.is_binary(col_type) or pa.types.is_large_binary(col_type): 289 | calculated_stats.update(self._calculate_string_binary_stats(column_data)) 290 | elif pa.types.is_boolean(col_type): 291 | calculated_stats.update(self._calculate_boolean_stats(column_data)) 292 | elif pa.types.is_dictionary(col_type): 293 | calculated_stats.update(self._calculate_dictionary_stats(column_data, col_type)) 294 | message = calculated_stats.pop("message", message) 295 | elif pa.types.is_struct(col_type) or pa.types.is_list(col_type) or pa.types.is_map(col_type) \ 296 | or pa.types.is_fixed_size_list(col_type) or pa.types.is_union(col_type): 297 | calculated_stats.update(self._calculate_complex_type_stats(column_data, col_type)) 298 | message = f"Basic aggregate stats (min/max/mean) not applicable for complex type '{self._format_pyarrow_type(col_type)}'." 299 | else: 300 | self.logger.warning(f"Statistics calculation not fully implemented for type: {col_type}") 301 | message = f"Statistics calculation not implemented for type '{self._format_pyarrow_type(col_type)}'." 
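                        # Note: types without a dedicated branch above (e.g. DECIMAL) fall through to this message;
                        # only the basic counts computed earlier are reported for them.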
302 |
303 |                 except Exception as calc_err:
304 |                     self.logger.exception(
305 |                         f"Error during type-specific calculation for column '{column_name}': {calc_err}")
306 |                     error_msg = f"Calculation error for type {field.type}: {calc_err}"
307 |                     calculated_stats["Calculation Error"] = str(calc_err)  # Add specific error key
308 |
309 |             elif total_count > 0:
310 |                 message = "Column contains only NULL values."
311 |
312 |             # --- Metadata Statistics ---
313 |             metadata_stats, metadata_stats_error = self._get_stats_from_metadata(column_name)
314 |
315 |         except pa.lib.ArrowException as arrow_e:
316 |             self.logger.exception(f"Arrow error during stats processing for column '{column_name}': {arrow_e}")
317 |             error_msg = f"Arrow processing error: {arrow_e}"
318 |         except Exception as e:
319 |             self.logger.exception(f"Unexpected error during stats calculation for column '{column_name}'")
320 |             error_msg = f"Calculation failed unexpectedly: {e}"
321 |
322 |         return self._create_stats_result(
323 |             column_name, field, calculated_stats, metadata_stats, metadata_stats_error, error_msg, message
324 |         )
325 |
326 |     def _decode_metadata_bytes(self, value: Optional[Union[bytes, str]]) -> Optional[str]:
327 |         """Safely decodes bytes metadata values to UTF-8 strings, replacing errors."""
328 |         if isinstance(value, bytes):
329 |             try:
330 |                 return value.decode('utf-8', errors='replace')
331 |             except Exception as e:
332 |                 self.logger.warning(f"Could not decode metadata bytes: {e}. Value: {value!r}")
333 |                 return f"[Decode Error: {value!r}]"
334 |         return str(value) if value is not None else None
335 |
336 |     def _decode_key_value_metadata(self, kv_meta: Optional[Dict[Union[str, bytes], Union[str, bytes]]]) -> Optional[
337 |         Dict[str, str]]:
338 |         """Decodes keys and values of the key-value metadata dictionary."""
339 |         if not kv_meta:
340 |             return None
341 |         decoded_kv = {}
342 |         try:
343 |             for k, v in kv_meta.items():
344 |                 key_str = self._decode_metadata_bytes(k) or "[Invalid Key]"
345 |                 val_str = self._decode_metadata_bytes(v) or "[Invalid Value]"
346 |                 decoded_kv[key_str] = val_str
347 |             return decoded_kv
348 |         except Exception as e:
349 |             self.logger.warning(f"Could not decode key-value metadata: {e}")
350 |             return {"error": f"Error decoding key-value metadata: {e}"}
351 |
352 |     def _format_pyarrow_type(self, field_type: pa.DataType) -> str:
353 |         """Formats a PyArrow DataType into a readable string, including details."""
354 |         if pa.types.is_timestamp(field_type):
355 |             tz_str = f", tz='{field_type.tz}'" if field_type.tz else ""
356 |             return f"TIMESTAMP(unit='{field_type.unit}'{tz_str})"
357 |         if pa.types.is_time32(field_type) or pa.types.is_time64(field_type):
358 |             return f"TIME(unit='{field_type.unit}')"
359 |         if pa.types.is_duration(field_type):
360 |             return f"DURATION(unit='{field_type.unit}')"
361 |         if pa.types.is_decimal128(field_type) or pa.types.is_decimal256(field_type):
362 |             return f"DECIMAL({field_type.precision}, {field_type.scale})"
363 |         if pa.types.is_fixed_size_binary(field_type):
364 |             return f"FIXED_SIZE_BINARY({field_type.byte_width})"
365 |         if pa.types.is_list(field_type) or pa.types.is_large_list(field_type) or pa.types.is_fixed_size_list(
366 |                 field_type):
367 |             prefix = "LIST"
368 |             if pa.types.is_large_list(field_type): prefix = "LARGE_LIST"
369 |             if pa.types.is_fixed_size_list(field_type): prefix = f"FIXED_SIZE_LIST({field_type.list_size})"
370 |             value_type_str = self._format_pyarrow_type(field_type.value_type)
371 |             return f"{prefix}<{value_type_str}>"  # e.g. LIST<INT64>
372 |         if pa.types.is_struct(field_type):
373 |             num_fields_to_show = 3
374 |             field_details = ", ".join(
375 |                 f"{f.name}: {self._format_pyarrow_type(f.type)}" for f in field_type[:num_fields_to_show])
376 |             suffix = "..." if field_type.num_fields > num_fields_to_show else ""
377 |             return f"STRUCT<{field_details}{suffix}>"
378 |         if pa.types.is_map(field_type):
379 |             keys_sorted = getattr(field_type, 'keys_sorted', False)
380 |             sorted_str = ", keys_sorted" if keys_sorted else ""
381 |             key_type_str = self._format_pyarrow_type(field_type.key_type)
382 |             item_type_str = self._format_pyarrow_type(field_type.item_type)
383 |             return f"MAP<{key_type_str}, {item_type_str}{sorted_str}>"
384 |         if pa.types.is_dictionary(field_type):
385 |             index_type_str = self._format_pyarrow_type(field_type.index_type)
386 |             value_type_str = self._format_pyarrow_type(field_type.value_type)
387 |             ordered = getattr(field_type, 'ordered', False)
388 |             return f"DICTIONARY<{index_type_str}, {value_type_str}{', ordered' if ordered else ''}>"
389 |         if pa.types.is_union(field_type):
390 |             type_codes = getattr(field_type, 'type_codes', [])
391 |             mode = getattr(field_type, 'mode', 'sparse')
392 |             field_details = ", ".join(
393 |                 f"{f.name}: {self._format_pyarrow_type(f.type)}" for f in field_type[:3])  # Show first few fields
394 |             suffix = "..." if field_type.num_fields > 3 else ""
395 |             return f"UNION<{field_details}{suffix}> (mode='{mode}', codes={type_codes[:5]}{'...' if len(type_codes) > 5 else ''})"
396 |
397 |         return str(field_type).upper()
398 |
399 |     def _safe_compute(self, func, data, *args, **kwargs) -> Tuple[Optional[Any], Optional[str]]:
400 |         """Helper to safely execute a pyarrow.compute function and handle errors."""
401 |         if data.null_count == len(data):
402 |             return None, "Input data is all NULL"
403 |         try:
404 |             result_scalar = func(data, *args, **kwargs)
405 |             return result_scalar.as_py() if result_scalar.is_valid else None, None
406 |         except pa.lib.ArrowNotImplementedError as nie:
407 |             return None, "Not Implemented"
408 |         except Exception as e:
409 |             return None, f"Compute Error: {e}"
410 |
411 |     def _calculate_numeric_stats(self, column_data: pa.ChunkedArray) -> Dict[str, Any]:
412 |         """Calculates min, max, mean, stddev for numeric columns using _safe_compute."""
413 |         stats: Dict[str, Any] = {}
414 |         min_val, err = self._safe_compute(pc.min, column_data)
415 |         stats["Min"] = min_val if err is None else err
416 |         max_val, err = self._safe_compute(pc.max, column_data)
417 |         stats["Max"] = max_val if err is None else err
418 |         mean_val, err = self._safe_compute(pc.mean, column_data)
419 |         stats["Mean"] = f"{mean_val:.4f}" if mean_val is not None and err is None else (err or "N/A")
420 |         stddev_val, err = self._safe_compute(pc.stddev, column_data, ddof=1)
421 |         stats["StdDev"] = f"{stddev_val:.4f}" if stddev_val is not None and err is None else (err or "N/A")
422 |         if stats["StdDev"] == "Not Implemented":
423 |             variance_val, err_var = self._safe_compute(pc.variance, column_data, ddof=1)
424 |             stats["Variance"] = f"{variance_val:.4f}" if variance_val is not None and err_var is None else (
425 |                 err_var or "N/A")
426 |         distinct_val, err = self._safe_compute(pc.count_distinct, column_data)
427 |         stats["Distinct Count"] = f"{distinct_val:,}" if distinct_val is not None and err is None else (err or "N/A")
428 |
429 |         # Add histogram data for visualization
430 |         try:
431 |             # Convert to Python list for histogram calculation (sample if too large)
432 |             data_length = len(column_data)
433 |             sample_size = min(10000, data_length)  # Limit to 10k samples for performance
434 |
435 |             if data_length > sample_size:
436 |                 # Sample the data
437 |                 import random
438 |                 indices =
sorted(random.sample(range(data_length), sample_size)) 439 | sampled_data = [column_data[i].as_py() for i in indices] 440 | else: 441 | sampled_data = column_data.to_pylist() 442 | 443 | # Filter out None values 444 | clean_data = [val for val in sampled_data if val is not None] 445 | 446 | if len(clean_data) > 10: # Only create histogram if we have enough data 447 | stats["_histogram_data"] = clean_data 448 | stats["_data_type"] = "numeric" 449 | 450 | except Exception as e: 451 | self.logger.warning(f"Failed to prepare histogram data: {e}") 452 | 453 | return stats 454 | 455 | def _calculate_temporal_stats(self, column_data: pa.ChunkedArray) -> Dict[str, Any]: 456 | """Calculates min and max for temporal columns using _safe_compute.""" 457 | stats: Dict[str, Any] = {} 458 | min_val, err = self._safe_compute(pc.min, column_data) 459 | stats["Min"] = min_val if err is None else err # .as_py() handles conversion 460 | max_val, err = self._safe_compute(pc.max, column_data) 461 | stats["Max"] = max_val if err is None else err 462 | return stats 463 | 464 | def _calculate_string_binary_stats(self, column_data: pa.ChunkedArray) -> Dict[str, Any]: 465 | """Calculates distinct count for string/binary columns.""" 466 | stats: Dict[str, Any] = {} 467 | distinct_val, err = self._safe_compute(pc.count_distinct, column_data) 468 | stats["Distinct Count"] = f"{distinct_val:,}" if distinct_val is not None and err is None else (err or "N/A") 469 | return stats 470 | 471 | def _calculate_boolean_stats(self, column_data: pa.ChunkedArray) -> Dict[str, Any]: 472 | """Calculates value counts (True/False) for boolean columns.""" 473 | stats: Dict[str, Any] = {} 474 | try: 475 | if column_data.null_count == len(column_data): 476 | stats["Value Counts"] = "All NULL" 477 | return stats 478 | 479 | # value_counts returns a StructArray [{values: bool, counts: int64}, ...] 
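            # Illustration (hypothetical counts): for a column holding 812 True and 79 False values,
            # the loop below would produce counts_dict == {True: 812, False: 79}.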
480 | value_counts_struct = pc.value_counts(column_data) 481 | counts_dict = {} 482 | if len(value_counts_struct) > 0: 483 | for i in range(len(value_counts_struct)): 484 | value = value_counts_struct.field("values")[i].as_py() 485 | count = value_counts_struct.field("counts")[i].as_py() 486 | counts_dict[value] = count # Keys are True/False 487 | 488 | stats["Value Counts"] = {str(k): f"{v:,}" for k, v in counts_dict.items()} 489 | # Ensure both True and False are present, even if count is 0 490 | if 'True' not in stats["Value Counts"]: stats["Value Counts"]['True'] = "0" 491 | if 'False' not in stats["Value Counts"]: stats["Value Counts"]['False'] = "0" 492 | 493 | except Exception as vc_e: 494 | self.logger.warning(f"Boolean value count calculation error: {vc_e}", exc_info=True) 495 | stats["Value Counts"] = "Error calculating" 496 | return stats 497 | 498 | def _calculate_dictionary_stats(self, column_data: pa.ChunkedArray, col_type: pa.DictionaryType) -> Dict[str, Any]: 499 | """Calculates stats for dictionary type based on its value type.""" 500 | stats: Dict[str, Any] = {"message": "Stats calculated on dictionary values."} # Start with message 501 | try: 502 | unwrapped_data = column_data.dictionary_decode() 503 | value_type = col_type.value_type 504 | self.logger.debug(f"Calculating dictionary stats based on value type: {value_type}") 505 | 506 | # Delegate calculation based on the *value* type 507 | if pa.types.is_floating(value_type) or pa.types.is_integer(value_type): 508 | stats.update(self._calculate_numeric_stats(unwrapped_data)) 509 | elif pa.types.is_temporal(value_type): 510 | stats.update(self._calculate_temporal_stats(unwrapped_data)) 511 | elif pa.types.is_string(value_type) or pa.types.is_large_string(value_type) \ 512 | or pa.types.is_binary(value_type) or pa.types.is_large_binary(value_type): 513 | stats.update(self._calculate_string_binary_stats(unwrapped_data)) 514 | # Add other dictionary value types if necessary (boolean, etc.) 515 | else: 516 | stats[ 517 | "message"] += f" (Stats for value type '{self._format_pyarrow_type(value_type)}' not fully implemented)." 
518 | # Calculate distinct count on the original dictionary array (can be faster) 519 | distinct_val, err = self._safe_compute(pc.count_distinct, column_data) 520 | stats[ 521 | "Distinct Values (Approx)"] = f"{distinct_val:,}" if distinct_val is not None and err is None else ( 522 | err or "N/A") 523 | 524 | except pa.lib.ArrowException as arrow_decode_err: 525 | self.logger.warning(f"Arrow error decoding dictionary type for stats: {arrow_decode_err}") 526 | stats["Dictionary Error"] = f"Decode Error: {arrow_decode_err}" 527 | except Exception as dict_e: 528 | self.logger.warning(f"Could not process dictionary type for stats: {dict_e}") 529 | stats["Dictionary Error"] = f"Processing Error: {dict_e}" 530 | return stats 531 | 532 | def _calculate_complex_type_stats(self, column_data: pa.ChunkedArray, col_type: pa.DataType) -> Dict[str, Any]: 533 | """Calculates basic stats (like distinct count) for complex types.""" 534 | stats: Dict[str, Any] = {} 535 | # Distinct count is often the most feasible stat for complex types 536 | distinct_val, err = self._safe_compute(pc.count_distinct, column_data) 537 | # Note: Distinct count on complex types can be approximate or may error depending on type 538 | stats["Distinct Count (Approx)"] = f"{distinct_val:,}" if distinct_val is not None and err is None else ( 539 | err or "N/A") 540 | return stats 541 | 542 | def _get_stats_from_metadata(self, column_name: str) -> Tuple[Dict[str, Any], Optional[str]]: 543 | """Retrieves statistics stored within the Parquet file metadata per row group.""" 544 | metadata_stats: Dict[str, Any] = {} 545 | error_str: Optional[str] = None 546 | 547 | if not self.metadata or not self.schema: 548 | return {}, "Metadata or Schema not available" 549 | 550 | try: 551 | col_index = self.schema.get_field_index(column_name) 552 | 553 | for i in range(self.metadata.num_row_groups): 554 | group_key = f"RG {i}" 555 | try: 556 | rg_meta = self.metadata.row_group(i) 557 | metadata_stats[group_key] = self._extract_stats_for_single_group(rg_meta, col_index) 558 | except IndexError: 559 | self.logger.warning(f"Column index {col_index} out of bounds for row group {i}.") 560 | metadata_stats[group_key] = "Index Error" 561 | except Exception as e: 562 | self.logger.warning(f"Error processing metadata stats for RG {i}, column '{column_name}': {e}") 563 | metadata_stats[group_key] = f"Read Error: {e}" 564 | 565 | except KeyError: 566 | self.logger.warning(f"Column '{column_name}' not found in schema for metadata stats.") 567 | error_str = f"Column '{column_name}' not found in schema" 568 | except Exception as e: 569 | self.logger.exception(f"Failed to get metadata statistics structure for column '{column_name}'.") 570 | error_str = f"Error accessing metadata structure: {e}" 571 | 572 | return metadata_stats, error_str 573 | 574 | def _extract_stats_for_single_group(self, rg_meta: pq.RowGroupMetaData, col_index: int) -> Union[ 575 | str, Dict[str, Any]]: 576 | """Extracts stats from a column chunk's metadata within a row group.""" 577 | try: 578 | col_chunk_meta = rg_meta.column(col_index) 579 | stats = col_chunk_meta.statistics 580 | if not stats: return "No stats in metadata" 581 | 582 | def _format_stat(value, is_present, is_numeric=True): 583 | if not is_present: return "N/A" 584 | try: 585 | # Attempt to format nicely, fallback to repr for safety 586 | return f"{value:,}" if is_numeric else str(value) 587 | except Exception: 588 | return repr(value) 589 | 590 | return { 591 | "min": _format_stat(stats.min, stats.has_min_max, 
is_numeric=False), 592 | "max": _format_stat(stats.max, stats.has_min_max, is_numeric=False), 593 | "nulls": _format_stat(stats.null_count, stats.has_null_count), 594 | "distinct": _format_stat(stats.distinct_count, stats.has_distinct_count), 595 | "size_comp": _format_stat(col_chunk_meta.total_compressed_size, 596 | col_chunk_meta.total_compressed_size is not None), 597 | "size_uncomp": _format_stat(col_chunk_meta.total_uncompressed_size, 598 | col_chunk_meta.total_uncompressed_size is not None), 599 | } 600 | except IndexError: 601 | self.logger.warning(f"Column index {col_index} out of bounds for row group {rg_meta.num_columns} columns.") 602 | return "Index Error" 603 | except Exception as e: 604 | self.logger.error(f"Error reading column chunk metadata stats for index {col_index}: {e}", exc_info=True) 605 | return f"Metadata Read Error: {e}" 606 | 607 | def _create_stats_result( 608 | self, 609 | column_name: str, 610 | field: Optional[pa.Field], 611 | calculated_stats: Optional[Dict] = None, 612 | metadata_stats: Optional[Dict] = None, 613 | metadata_stats_error: Optional[str] = None, 614 | calculation_error: Optional[str] = None, 615 | message: Optional[str] = None 616 | ) -> Dict[str, Any]: 617 | """Consistently packages the results of column statistics calculation.""" 618 | calculated_stats_dict = calculated_stats if calculated_stats is not None else {} 619 | 620 | col_type_str = "Unknown" 621 | col_nullable = None 622 | if field: 623 | try: 624 | col_type_str = self._format_pyarrow_type(field.type) 625 | col_nullable = field.nullable 626 | except Exception as e: 627 | self.logger.error(f"Error formatting type for column {column_name}: {e}") 628 | col_type_str = f"[Error formatting: {field.type}]" 629 | col_nullable = None 630 | 631 | return { 632 | "column": column_name, 633 | "type": col_type_str, 634 | "nullable": col_nullable, 635 | "calculated": calculated_stats_dict, 636 | "basic_metadata_stats": metadata_stats, 637 | "metadata_stats_error": metadata_stats_error, 638 | "error": calculation_error, 639 | "message": message 640 | } 641 | 642 | def _format_size(self, num_bytes: int) -> str: 643 | """Formats bytes into a human-readable string (KB, MB, GB).""" 644 | if num_bytes < 1024: 645 | return f"{num_bytes} Bytes" 646 | elif num_bytes < 1024 ** 2: 647 | return f"{num_bytes / 1024:.2f} KB" 648 | elif num_bytes < 1024 ** 3: 649 | return f"{num_bytes / 1024 ** 2:.2f} MB" 650 | else: 651 | return f"{num_bytes / 1024 ** 3:.2f} GB" 652 | --------------------------------------------------------------------------------
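Usage sketch (illustrative, not part of the repository sources): the handlers above share the DataHandler interface, so a caller can open a file, inspect its schema and metadata, preview rows, and compute per-column statistics in the same way for Parquet, JSON/NDJSON, and CSV inputs. A minimal example against the bundled sample file, assuming the parqv package is installed and the code runs from the repository root; the column name "Age" is an assumption about the sample data:

from pathlib import Path

from parqv.data_sources.formats.parquet import ParquetHandler

# The handler is a context manager, so the underlying file is released automatically.
with ParquetHandler(Path("src/sample/parquet/titanic.parquet")) as handler:
    print(handler.get_metadata_summary())            # file-level metadata (rows, row groups, size, ...)
    for column in handler.get_schema_data() or []:    # [{"name": ..., "type": ..., "nullable": ...}, ...]
        print(column["name"], column["type"])
    preview = handler.get_data_preview(num_rows=10)   # pandas DataFrame with the first rows
    stats = handler.get_column_stats("Age")           # dict with "calculated", "basic_metadata_stats", ...
    print(stats["calculated"])

Swapping ParquetHandler for JsonHandler or CsvHandler (and the file path accordingly) is enough to inspect .json/.ndjson or .csv files, since all handlers expose the same methods.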