├── .gitignore ├── crates ├── tbl-cli │   ├── src │   │   ├── cli │   │   │   ├── mod.rs │   │   │   ├── subcommands │   │   │   │   ├── mod.rs │   │   │   │   ├── schemas.rs │   │   │   │   ├── ls.rs │   │   │   │   ├── data.rs │   │   │   │   └── schema.rs │   │   │   └── args.rs │   │   ├── main.rs │   │   ├── python.rs │   │   ├── types.rs │   │   ├── styles.rs │   │   ├── summary.rs │   │   ├── output.rs │   │   └── transform.rs │   ├── Cargo.toml │   └── build.rs └── tbl-core │   ├── src │   ├── filesystem │   │   ├── mod.rs │   │   ├── sizes.rs │   │   ├── inputs.rs │   │   ├── manipulate.rs │   │   ├── gather.rs │   │   └── outputs.rs │   ├── parquet │   │   ├── parquet_scan.rs │   │   ├── mod.rs │   │   ├── parquet_merge.rs │   │   ├── parquet_cast.rs │   │   ├── parquet_drop.rs │   │   ├── parquet_summary.rs │   │   └── parquet_insert.rs │   ├── lib.rs │   ├── types.rs │   └── formats.rs │   └── Cargo.toml ├── Cargo.toml ├── LICENSE-MIT ├── LICENSE-APACHE └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | TODO.md 3 | -------------------------------------------------------------------------------- /crates/tbl-cli/src/cli/mod.rs: -------------------------------------------------------------------------------- 1 | mod args; 2 | mod subcommands; 3 | 4 | pub(crate) use args::*; 5 | -------------------------------------------------------------------------------- /crates/tbl-cli/src/cli/subcommands/mod.rs: -------------------------------------------------------------------------------- 1 | mod data; 2 | pub(crate) use data::*; 3 | 4 | mod ls; 5 | pub(crate) use ls::*; 6 | 7 | mod schema; 8 | pub(crate) use schema::*; 9 | 10 | mod schemas; 11 | pub(crate) use schemas::*; 12 | -------------------------------------------------------------------------------- /crates/tbl-cli/src/cli/subcommands/schemas.rs: -------------------------------------------------------------------------------- 1 | use crate::{SchemasArgs, TblCliError}; 2 | 3 | pub(crate) async fn schemas_command(_args: SchemasArgs) -> Result<(), TblCliError> { 4 | println!("[not implemented yet]"); 5 | Ok(()) 6 | } 7 | -------------------------------------------------------------------------------- /crates/tbl-core/src/filesystem/mod.rs: -------------------------------------------------------------------------------- 1 | /// path gathering functions 2 | pub mod gather; 3 | pub use gather::*; 4 | 5 | /// path input functions 6 | pub mod inputs; 7 | pub use inputs::*; 8 | 9 | /// path manipulate functions 10 | pub mod manipulate; 11 | pub use manipulate::*; 12 | 13 | /// path outputs functions 14 | pub mod outputs; 15 | pub use outputs::*; 16 | 17 | /// path size 18 | pub mod sizes; 19 | pub use sizes::*; 20 | -------------------------------------------------------------------------------- /crates/tbl-core/src/parquet/parquet_scan.rs: -------------------------------------------------------------------------------- 1 | use crate::TblError; 2 | use polars::prelude::*; 3 | use std::path::PathBuf; 4 | 5 | /// create lazy frame by scanning input paths 6 | pub fn create_lazyframe(paths: &[PathBuf]) -> Result<LazyFrame, TblError> { 7 | let scan_args = polars::prelude::ScanArgsParquet::default(); 8 | let arc_paths = Arc::from(paths.to_vec().into_boxed_slice()); 9 | Ok(LazyFrame::scan_parquet_files(arc_paths, scan_args)?) 10 | } 11 | -------------------------------------------------------------------------------- /crates/tbl-core/src/lib.rs: -------------------------------------------------------------------------------- 1 | //!
utilities for reading and editing tabular files 2 | 3 | #![allow(dead_code)] 4 | #![warn(missing_docs, unreachable_pub, unused_crate_dependencies)] 5 | #![deny(unused_must_use, rust_2018_idioms)] 6 | #![doc(test( 7 | no_crate_inject, 8 | attr(deny(warnings, rust_2018_idioms), allow(dead_code, unused_variables)) 9 | ))] 10 | 11 | /// filesystem utilities 12 | pub mod filesystem; 13 | 14 | /// parquet utilities 15 | pub mod parquet; 16 | 17 | /// types 18 | pub mod types; 19 | 20 | /// formats 21 | pub mod formats; 22 | 23 | pub use types::*; 24 | -------------------------------------------------------------------------------- /crates/tbl-core/src/parquet/mod.rs: -------------------------------------------------------------------------------- 1 | /// parquet summary functions 2 | pub mod parquet_summary; 3 | pub use parquet_summary::*; 4 | 5 | /// parquet drop functions 6 | pub mod parquet_drop; 7 | pub use parquet_drop::*; 8 | 9 | /// parquet cast functions 10 | pub mod parquet_cast; 11 | pub use parquet_cast::*; 12 | 13 | /// parquet merge functions 14 | pub mod parquet_merge; 15 | pub use parquet_merge::*; 16 | 17 | /// parquet insert functions 18 | pub mod parquet_insert; 19 | pub use parquet_insert::*; 20 | 21 | /// parquet scan functions 22 | pub mod parquet_scan; 23 | pub use parquet_scan::*; 24 | -------------------------------------------------------------------------------- /crates/tbl-core/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "tbl-core" 3 | description = "utilities for reading and modifying tabular files" 4 | version.workspace = true 5 | edition.workspace = true 6 | license.workspace = true 7 | homepage.workspace = true 8 | repository.workspace = true 9 | 10 | [dependencies] 11 | arrow = { workspace = true } 12 | colored = "2.1.0" 13 | futures = "0.3.30" 14 | hex = "0.4.3" 15 | parquet = { version = "52.0.0", features = ["async"] } 16 | polars = { workspace = true } 17 | thiserror = { workspace = true } 18 | tokio = { workspace = true } 19 | 20 | [dev-dependencies] 21 | tempfile = "3.10.1" 22 | -------------------------------------------------------------------------------- /crates/tbl-cli/src/main.rs: -------------------------------------------------------------------------------- 1 | //!
utilities for reading and editing tabular files 2 | 3 | #![allow(dead_code)] 4 | #![warn(missing_docs, unreachable_pub, unused_crate_dependencies)] 5 | #![deny(unused_must_use, rust_2018_idioms)] 6 | #![doc(test( 7 | no_crate_inject, 8 | attr(deny(warnings, rust_2018_idioms), allow(dead_code, unused_variables)) 9 | ))] 10 | 11 | mod cli; 12 | pub(crate) use cli::*; 13 | 14 | pub(crate) mod styles; 15 | 16 | mod types; 17 | use types::*; 18 | 19 | mod python; 20 | 21 | mod summary; 22 | 23 | mod transform; 24 | 25 | mod output; 26 | 27 | #[tokio::main] 28 | async fn main() -> Result<(), TblCliError> { 29 | cli::run_cli().await 30 | } 31 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | 2 | [workspace] 3 | members = ["crates/tbl-core", "crates/tbl-cli"] 4 | resolver = "2" 5 | 6 | [workspace.package] 7 | version = "0.1.1" 8 | edition = "2021" 9 | license = "MIT OR Apache-2.0" 10 | homepage = "https://github.com/paradigmxyz/tbl" 11 | repository = "https://github.com/paradigmxyz/tbl" 12 | exclude = [".github/"] 13 | 14 | [workspace.dependencies] 15 | thiserror = "1.0" 16 | tokio = { version = "1.32.0", features = ["full"] } 17 | arrow = "52.0.0" 18 | polars = { version = "0.41.3", features = ["json", "parquet", "lazy", "csv", "dtype-u8", "dtype-u16", "dtype-decimal", "string_encoding", "binary_encoding", "concat_str", "replace", "strings", "streaming", "timezones"] } 19 | -------------------------------------------------------------------------------- /crates/tbl-core/src/filesystem/sizes.rs: -------------------------------------------------------------------------------- 1 | use futures::stream::{FuturesUnordered, StreamExt}; 2 | use std::path::Path; 3 | use tokio::fs; 4 | 5 | /// get total number of bytes across files 6 | pub async fn get_total_bytes_of_files(file_paths: &[&Path]) -> Result<u64, crate::TblError> { 7 | let futures = file_paths.iter().map(|path| async move { 8 | let metadata = fs::metadata(path).await?; 9 | Ok::<u64, crate::TblError>(if metadata.is_file() { 10 | metadata.len() 11 | } else { 12 | 0 13 | }) 14 | }); 15 | 16 | let mut total: u64 = 0; 17 | let mut futures: FuturesUnordered<_> = futures.collect(); 18 | while let Some(result) = futures.next().await { 19 | total += result?; 20 | } 21 | 22 | Ok(total) 23 | } 24 | -------------------------------------------------------------------------------- /crates/tbl-cli/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "tbl-cli" 3 | description = "tbl is a tool for reading and editing tabular data files" 4 | version.workspace = true 5 | edition.workspace = true 6 | license.workspace = true 7 | homepage.workspace = true 8 | repository.workspace = true 9 | 10 | [[bin]] 11 | name = "tbl" 12 | path = "src/main.rs" 13 | 14 | [dependencies] 15 | clap = { version = "4.4.8", features = ["derive"] } 16 | tokio = { workspace = true } 17 | thiserror = { workspace = true } 18 | tbl-core = { version = "0.1.0", path = "../tbl-core" } 19 | term_size = "0.3.2" 20 | polars = { workspace = true } 21 | toolstr = "0.1.5" 22 | toolstr_colored = "2.1.1" 23 | inquire = "0.7.5" 24 | anstyle = "1.0.7" 25 | color-print = "0.3.6" 26 | chrono = "0.4.38" 27 | hex = "0.4.3" 28 | 29 | [build-dependencies] 30 | built = "0.7" 31 | -------------------------------------------------------------------------------- /LICENSE-MIT: -------------------------------------------------------------------------------- 1 | The
MIT License (MIT) 2 | 3 | Copyright (c) 2024 tbl contributors 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /crates/tbl-core/src/filesystem/inputs.rs: -------------------------------------------------------------------------------- 1 | use crate::TblError; 2 | use std::path::PathBuf; 3 | 4 | /// get file paths 5 | pub fn get_input_paths( 6 | inputs: &Option<Vec<PathBuf>>, 7 | tree: bool, 8 | sort: bool, 9 | ) -> Result<Vec<PathBuf>, TblError> { 10 | // get paths 11 | let raw_paths = match inputs { 12 | Some(raw_paths) => raw_paths.to_vec(), 13 | None => vec![std::env::current_dir()?], 14 | }; 15 | 16 | // expand tree if specified 17 | let mut paths: Vec<PathBuf> = vec![]; 18 | for raw_path in raw_paths.into_iter() { 19 | if raw_path.is_dir() { 20 | let sub_paths = if tree { 21 | super::gather::get_tree_tabular_files(&raw_path)? 22 | } else { 23 | super::gather::get_directory_tabular_files(&raw_path)?
24 | }; 25 | paths.extend(sub_paths); 26 | } else if super::gather::is_tabular_file(&raw_path) { 27 | paths.push(raw_path); 28 | } else { 29 | println!("skipping non-tabular file {:?}", raw_path) 30 | } 31 | } 32 | 33 | // sort 34 | if sort { 35 | paths.sort() 36 | } 37 | 38 | Ok(paths) 39 | } 40 | -------------------------------------------------------------------------------- /crates/tbl-cli/build.rs: -------------------------------------------------------------------------------- 1 | use std::process::Command; 2 | 3 | fn main() { 4 | // Get the most recent tag 5 | let tag_output = Command::new("git") 6 | .args(["describe", "--tags", "--abbrev=0"]) 7 | .output() 8 | .expect("Failed to execute git command for tag"); 9 | 10 | let tag = String::from_utf8(tag_output.stdout) 11 | .expect("Invalid UTF-8 output from git for tag") 12 | .trim() 13 | .to_string(); 14 | 15 | // Get the git description (includes commits since tag, if any) 16 | let desc_output = Command::new("git") 17 | .args(["describe", "--always", "--dirty"]) 18 | .output() 19 | .expect("Failed to execute git command for description"); 20 | 21 | let git_description = String::from_utf8(desc_output.stdout) 22 | .expect("Invalid UTF-8 output from git for description") 23 | .trim() 24 | .to_string(); 25 | 26 | // Combine tag and description 27 | let version_string = if tag == git_description { 28 | // If they're the same, just use one 29 | tag 30 | } else { 31 | format!("{}-{}", tag, git_description) 32 | }; 33 | 34 | println!("cargo:rustc-env=GIT_DESCRIPTION={}", version_string); 35 | 36 | built::write_built_file().expect("Failed to acquire build-time information"); 37 | } 38 | -------------------------------------------------------------------------------- /crates/tbl-core/src/types.rs: -------------------------------------------------------------------------------- 1 | use thiserror::Error; 2 | 3 | /// Tbl Error 4 | #[derive(Error, Debug)] 5 | pub enum TblError { 6 | /// Error wrapper for standard IO errors. 7 | #[error(transparent)] 8 | IOError(#[from] std::io::Error), 9 | 10 | /// Error wrapper for polars errors. 11 | #[error(transparent)] 12 | PolarsError(#[from] polars::prelude::PolarsError), 13 | 14 | /// Error wrapper for parquet errors. 15 | #[error(transparent)] 16 | ParquetError(#[from] parquet::errors::ParquetError), 17 | 18 | /// Error wrapper for tokio errors. 19 | #[error(transparent)] 20 | TokioJoinError(#[from] tokio::task::JoinError), 21 | 22 | /// Error wrapper for path strip-prefix errors. 23 | #[error(transparent)] 24 | StripPrefixError(#[from] std::path::StripPrefixError), 25 | 26 | /// Error wrapper for arrow errors. 27 | #[error(transparent)] 28 | ArrowError(#[from] arrow::error::ArrowError), 29 | 30 | /// Error wrapper for schema errors. 31 | #[error("Schema error: {0}")] 32 | SchemaError(String), 33 | 34 | /// Error wrapper for input errors.
35 | #[error("Input error: {0}")] 36 | InputError(String), 37 | 38 | /// General Error 39 | #[error("Error: {0}")] 40 | Error(String), 41 | 42 | /// Error wrapper for AcquireError 43 | #[error(transparent)] 44 | TokioAcquireError(#[from] tokio::sync::AcquireError), 45 | } 46 | -------------------------------------------------------------------------------- /crates/tbl-cli/src/python.rs: -------------------------------------------------------------------------------- 1 | use crate::TblCliError; 2 | use std::path::PathBuf; 3 | use std::process::Command; 4 | 5 | pub(crate) fn load_df_interactive( 6 | paths: Vec<PathBuf>, 7 | lazy: bool, 8 | executable: Option<String>, 9 | ) -> Result<(), TblCliError> { 10 | let paths: Vec<_> = paths 11 | .iter() 12 | .map(|path| format!("'{}'", path.to_string_lossy())) 13 | .collect(); 14 | let paths_str = paths.join(",\n "); 15 | 16 | let input_word = if paths.len() == 1 { "input" } else { "inputs" }; 17 | 18 | let (pl_function, pl_variable, final_str, final_print) = if lazy { 19 | ("scan", "lf", "\\n# use `df = lf.collect()` to collect", "") 20 | } else { 21 | ("read", "df", "print(df)\\n", "\nprint(df)") 22 | }; 23 | 24 | let python_code = format!( 25 | r#" 26 | import polars as pl 27 | 28 | inputs = [ 29 | {} 30 | ] 31 | 32 | {} = pl.{}_parquet(inputs) 33 | print() 34 | print('import polars as pl') 35 | print() 36 | print('# {}ing ' + str(len(inputs)) + ' {} into {}') 37 | print('inputs = [...]') 38 | print('{} = pl.{}_parquet(inputs)') 39 | print("{}") 40 | {} 41 | "#, 42 | paths_str, 43 | pl_variable, 44 | pl_function, 45 | pl_function, 46 | input_word, 47 | pl_variable, 48 | pl_variable, 49 | pl_function, 50 | final_str, 51 | final_print, 52 | ); 53 | 54 | let executable = if let Some(executable) = executable { 55 | executable 56 | } else { 57 | "ipython".to_string() 58 | }; 59 | 60 | Command::new(executable) 61 | .arg("-i") 62 | .arg("-c") 63 | .arg(python_code) 64 | .spawn()? 65 | .wait()?; 66 | 67 | Ok(()) 68 | } 69 | -------------------------------------------------------------------------------- /crates/tbl-cli/src/types.rs: -------------------------------------------------------------------------------- 1 | use tbl_core::TblError; 2 | use thiserror::Error; 3 | 4 | #[derive(Error, Debug)] 5 | pub(crate) enum TblCliError { 6 | /// Error wrapper for standard IO errors. 7 | #[error(transparent)] 8 | IO(#[from] std::io::Error), 9 | 10 | /// Error wrapper for tbl-core errors. 11 | #[error(transparent)] 12 | Tbl(#[from] TblError), 13 | 14 | /// Error caused by arguments 15 | #[error("Argument error: {0}")] 16 | Arg(String), 17 | 18 | /// Error wrapper for path strip-prefix errors. 19 | #[error(transparent)] 20 | StripPrefix(#[from] std::path::StripPrefixError), 21 | 22 | /// Error wrapper for toolstr errors. 23 | #[error(transparent)] 24 | ToolstrError(#[from] toolstr::FormatError), 25 | 26 | /// Error wrapper for polars errors.
27 | #[error(transparent)] 28 | PolarsError(#[from] polars::prelude::PolarsError), 29 | 30 | /// Error caused by missing schema 31 | #[error("Missing schema error: {0}")] 32 | MissingSchemaError(String), 33 | 34 | /// Error parsing an int 35 | #[error(transparent)] 36 | ParseIntError(#[from] std::num::ParseIntError), 37 | 38 | /// General Error 39 | #[error("Error: {0}")] 40 | Error(String), 41 | } 42 | 43 | pub(crate) enum OutputMode { 44 | PrintToStdout, 45 | SaveToSingleFile, 46 | ModifyInplace, 47 | SaveToDirectory, 48 | Partition, 49 | InteractiveLf, 50 | InteractiveDf, 51 | } 52 | 53 | impl OutputMode { 54 | pub(crate) fn writes_to_disk(&self) -> bool { 55 | matches!( 56 | self, 57 | OutputMode::SaveToSingleFile 58 | | OutputMode::SaveToDirectory 59 | | OutputMode::ModifyInplace 60 | | OutputMode::Partition 61 | ) 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /crates/tbl-core/src/filesystem/manipulate.rs: -------------------------------------------------------------------------------- 1 | use crate::TblError; 2 | use std::path::{Component, Path, PathBuf}; 3 | 4 | /// get common prefix of paths 5 | pub fn get_common_prefix(paths: &[PathBuf]) -> Result<PathBuf, TblError> { 6 | if paths.is_empty() { 7 | return Err(TblError::InputError("no paths given".to_string())); 8 | } 9 | 10 | let mut components_iter = paths.iter().map(|p| p.components()); 11 | let mut common_components: Vec<Component<'_>> = components_iter 12 | .next() 13 | .ok_or(TblError::Error( 14 | "cannot parse common path components".to_string(), 15 | ))? 16 | .collect(); 17 | 18 | for components in components_iter { 19 | common_components = common_components 20 | .iter() 21 | .zip(components) 22 | .take_while(|(a, b)| a == &b) 23 | .map(|(a, _)| *a) 24 | .collect(); 25 | } 26 | 27 | Ok(common_components.iter().collect()) 28 | } 29 | 30 | /// convert file path to new input 31 | pub fn convert_file_path( 32 | input: &Path, 33 | output_dir: &Option<PathBuf>, 34 | file_prefix: &Option<String>, 35 | file_postfix: &Option<String>, 36 | ) -> Result<PathBuf, TblError> { 37 | // change output directory 38 | let output = match output_dir.as_ref() { 39 | Some(output_dir) => { 40 | let file_name = input 41 | .file_name() 42 | .ok_or_else(|| TblError::Error("Invalid input path".to_string()))?; 43 | output_dir.join(file_name) 44 | } 45 | None => input.to_path_buf(), 46 | }; 47 | 48 | if file_prefix.is_some() || file_postfix.is_some() { 49 | let stem = output 50 | .file_stem() 51 | .ok_or_else(|| TblError::Error("Invalid output path".to_string()))?; 52 | let extension = output.extension(); 53 | 54 | let new_filename = format!( 55 | "{}{}{}{}", 56 | file_prefix.as_deref().unwrap_or(""), 57 | stem.to_string_lossy(), 58 | file_postfix.as_deref().unwrap_or(""), 59 | extension.map_or_else(String::new, |ext| format!(".{}", ext.to_string_lossy())) 60 | ); 61 | 62 | Ok(output.with_file_name(new_filename)) 63 | } else { 64 | Ok(output) 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /crates/tbl-core/src/filesystem/gather.rs: -------------------------------------------------------------------------------- 1 | use crate::TblError; 2 | use futures::stream::StreamExt; 3 | use std::path::{Path, PathBuf}; 4 | 5 | /// return tabular file paths within directory 6 | pub fn get_directory_tabular_files(dir_path: &Path) -> Result<Vec<PathBuf>, TblError> { 7 | let mut tabular_files = Vec::new(); 8 | 9 | for entry in std::fs::read_dir(dir_path)?
{ 10 | let entry = entry?; 11 | let path = entry.path(); 12 | 13 | if path.is_file() && is_tabular_file(&path) { 14 | tabular_files.push(path); 15 | } 16 | } 17 | 18 | Ok(tabular_files) 19 | } 20 | 21 | /// get tabular files inside directory tree 22 | pub fn get_tree_tabular_files(dir_path: &std::path::Path) -> Result<Vec<PathBuf>, TblError> { 23 | let mut tabular_files = Vec::new(); 24 | for entry in std::fs::read_dir(dir_path)? { 25 | let entry = entry?; 26 | let path = entry.path(); 27 | if path.is_file() && is_tabular_file(&path) { 28 | tabular_files.push(path); 29 | } else if path.is_dir() { 30 | let sub_dir_files = get_tree_tabular_files(&path)?; 31 | tabular_files.extend(sub_dir_files); 32 | } 33 | } 34 | Ok(tabular_files) 35 | } 36 | 37 | /// return true if file_path has a tabular extension 38 | pub fn is_tabular_file(file_path: &std::path::Path) -> bool { 39 | // let tabular_extensions = ["parquet", "csv"]; 40 | let tabular_extensions = ["parquet"]; 41 | 42 | if let Some(extension) = file_path.extension() { 43 | let extension = extension.to_string_lossy().to_string(); 44 | tabular_extensions.contains(&extension.as_str()) 45 | } else { 46 | false 47 | } 48 | } 49 | 50 | /// count number of existing files 51 | pub async fn count_existing_files(paths: &[PathBuf]) -> usize { 52 | const CONCURRENT_LIMIT: usize = 1000; // Adjust based on your system's capabilities 53 | 54 | futures::stream::iter(paths) 55 | .map(tokio::fs::metadata) 56 | .buffer_unordered(CONCURRENT_LIMIT) 57 | .filter_map(|result| async move { 58 | match result { 59 | Ok(metadata) => Some(metadata.is_file()), 60 | Err(_) => None, 61 | } 62 | }) 63 | .fold(0, |acc, is_file| async move { 64 | if is_file { 65 | acc + 1 66 | } else { 67 | acc 68 | } 69 | }) 70 | .await 71 | } 72 | -------------------------------------------------------------------------------- /crates/tbl-core/src/parquet/parquet_merge.rs: -------------------------------------------------------------------------------- 1 | use crate::TblError; 2 | use futures::StreamExt; 3 | use parquet::arrow::arrow_writer::ArrowWriter; 4 | use parquet::arrow::async_reader::ParquetRecordBatchStreamBuilder; 5 | use parquet::file::properties::WriterProperties; 6 | use std::io::BufWriter as StdBufWriter; 7 | use std::path::PathBuf; 8 | use tokio::fs::File; 9 | use tokio::io::AsyncWriteExt; 10 | 11 | /// merge parquet files into one 12 | pub async fn merge_parquets( 13 | input_paths: &Vec<PathBuf>, 14 | output_path: &PathBuf, 15 | batch_size: usize, 16 | ) -> Result<(), crate::TblError> { 17 | if input_paths.is_empty() { 18 | return Err(crate::TblError::Error( 19 | "No input files provided".to_string(), 20 | )); 21 | } 22 | 23 | let tmp_output_path = super::parquet_drop::create_tmp_target(output_path.as_path()); 24 | let mut output_file = File::create(&tmp_output_path).await?; 25 | let mut buffer = Vec::new(); 26 | 27 | // Read the schema from the first file 28 | let first_file = File::open(&input_paths[0]).await?; 29 | let builder = ParquetRecordBatchStreamBuilder::new(first_file) 30 | .await? 31 | .with_batch_size(batch_size); 32 | let schema = builder.schema().clone(); 33 | 34 | let writer_props = WriterProperties::builder().build(); 35 | let mut arrow_writer = ArrowWriter::try_new( 36 | StdBufWriter::new(&mut buffer), 37 | schema.clone(), 38 | Some(writer_props), 39 | )?; 40 | 41 | for input_path in input_paths { 42 | let input_file = File::open(input_path).await?; 43 | let builder = ParquetRecordBatchStreamBuilder::new(input_file) 44 | .await?
45 | .with_batch_size(batch_size); 46 | let mut reader_stream = builder.build()?; 47 | 48 | // Verify that the schema matches 49 | if reader_stream.schema() != &schema { 50 | println!("SCHEMA OF {}:", input_paths[0].to_string_lossy()); 51 | println!("{:?}", schema); 52 | println!(); 53 | println!("SCHEMA OF {}:", input_path.to_string_lossy()); 54 | println!("{:?}", reader_stream.schema()); 55 | return Err(TblError::SchemaError( 56 | "schemas of files are not equal".to_string(), 57 | )); 58 | } 59 | 60 | while let Some(batch) = reader_stream.next().await { 61 | let batch = batch?; 62 | arrow_writer.write(&batch)?; 63 | } 64 | } 65 | 66 | arrow_writer.close()?; 67 | output_file.write_all(&buffer).await?; 68 | output_file.flush().await?; 69 | tokio::fs::rename(tmp_output_path, output_path).await?; 70 | 71 | Ok(()) 72 | } 73 | -------------------------------------------------------------------------------- /crates/tbl-core/src/parquet/parquet_cast.rs: -------------------------------------------------------------------------------- 1 | use polars::prelude::*; 2 | use std::collections::HashMap; 3 | use std::path::PathBuf; 4 | // use arrow::datatypes::Schema as ArrowSchema; 5 | // use arrow::record_batch::RecordBatch; 6 | // use parquet::arrow::arrow_writer::ArrowWriter; 7 | // use parquet::file::properties::WriterProperties; 8 | // use std::io::BufWriter; 9 | // use std::sync::Arc; 10 | // use tokio::fs::File; 11 | // use tokio::io::AsyncWriteExt; 12 | use crate::types::TblError; 13 | 14 | /// cast columns of parquet file to new type 15 | pub async fn cast_parquet_columns( 16 | _input_path: PathBuf, 17 | _output_path: PathBuf, 18 | _columns_to_cast: HashMap<String, DataType>, 19 | _batch_size: usize, 20 | ) -> Result<(), crate::TblError> { 21 | Err(TblError::Error("not implemented".to_string())) 22 | // // Create a LazyFrame from the input Parquet file 23 | // let lf = LazyFrame::scan_parquet( 24 | // input_path.to_str().ok_or_else(|| crate::TblError::Error("Invalid input path".to_string()))?, 25 | // ScanArgsParquet::default() 26 | // )?; 27 | 28 | // // Apply the casts 29 | // let casted_lf = lf.with_columns( 30 | // columns_to_cast.iter().map(|(col_name, new_type)| { 31 | // col(col_name).cast(new_type.clone()) 32 | // }).collect::<Vec<_>>() 33 | // ); 34 | 35 | // // Collect the schema 36 | // let schema = casted_lf.schema().map_err(|e| crate::TblError::PolarsError(e))?; 37 | 38 | // // Create temporary output path 39 | // let tmp_output_path = super::parquet_drop::create_tmp_target(output_path.as_path()); 40 | 41 | // // Open output file 42 | // let mut output_file = File::create(&tmp_output_path).await?; 43 | 44 | // // Convert Polars schema to Arrow schema 45 | // let arrow_schema: Arc<ArrowSchema> = Arc::new(schema.to_arrow(true)); 46 | 47 | // // Set up Arrow writer 48 | // let writer_props = WriterProperties::builder().build(); 49 | // let mut buffer = Vec::new(); 50 | // let mut arrow_writer = ArrowWriter::try_new( 51 | // BufWriter::new(&mut buffer), 52 | // arrow_schema.clone(), 53 | // Some(writer_props), 54 | // )?; 55 | 56 | // // Process data in batches 57 | // let df = casted_lf.collect()?; 58 | // for batch in df.iter_chunks(false) { 59 | // let arrow_batch = RecordBatch::try_from_iter( 60 | // arrow_schema.fields().iter().zip(batch.iter()).map(|(field, array)| { 61 | // Ok((field.name().to_string(), array.clone() as Arc<dyn Array>)) 62 | // }) 63 | // )?; 64 | // arrow_writer.write(&arrow_batch)?; 65 | // } 66 | 67 | // // Finish writing 68 | // arrow_writer.close()?; 69 | // output_file.write_all(&buffer).await?; 70 | //
output_file.flush().await?; 71 | 72 | // // Rename temporary file to final output file 73 | // std::fs::rename(tmp_output_path, output_path)?; 74 | 75 | // Ok(()) 76 | } 77 | -------------------------------------------------------------------------------- /crates/tbl-cli/src/cli/subcommands/ls.rs: -------------------------------------------------------------------------------- 1 | use crate::{LsArgs, TblCliError}; 2 | use toolstr::Colorize; 3 | 4 | pub(crate) async fn ls_command(ls_args: LsArgs) -> Result<(), TblCliError> { 5 | // get paths 6 | let paths = tbl_core::filesystem::get_input_paths(&ls_args.paths, ls_args.tree, true)?; 7 | 8 | if paths.is_empty() { 9 | println!("[no tabular paths]"); 10 | return Ok(()); 11 | } 12 | 13 | // print file names 14 | print_file_names(&paths, ls_args.n, ls_args.absolute)?; 15 | 16 | // print stats 17 | print_stats(&paths).await?; 18 | 19 | Ok(()) 20 | } 21 | 22 | fn print_file_names( 23 | paths: &[std::path::PathBuf], 24 | n: Option<usize>, 25 | absolute: bool, 26 | ) -> Result<(), TblCliError> { 27 | // clear common prefix 28 | let display_paths = if absolute || (paths.len() == 1) { 29 | paths.to_vec() 30 | } else { 31 | let common_prefix = tbl_core::filesystem::get_common_prefix(paths)?; 32 | let mut new_paths = Vec::new(); 33 | for path in paths.iter() { 34 | new_paths.push(path.strip_prefix(&common_prefix)?.to_owned()) 35 | } 36 | new_paths 37 | }; 38 | 39 | // decide number of files to print 40 | let n_print = match n { 41 | Some(n) => n, 42 | None => { 43 | if let Some((_, height)) = term_size::dimensions() { 44 | if height >= 5 { 45 | height - 4 46 | } else { 47 | 1 48 | } 49 | } else { 50 | 100 51 | } 52 | } 53 | }; 54 | 55 | // print out file names or paths 56 | for path in display_paths.iter().take(n_print) { 57 | println!("{}", path.to_string_lossy().purple()) 58 | } 59 | if n_print < paths.len() { 60 | println!( 61 | "{}", 62 | format!( 63 | "...
{} files not shown", 64 | tbl_core::formats::format_with_commas((paths.len() - n_print) as u64).bold() 65 | ) 66 | .truecolor(150, 150, 150) 67 | ); 68 | } 69 | 70 | Ok(()) 71 | } 72 | 73 | async fn print_stats(paths: &[std::path::PathBuf]) -> Result<(), TblCliError> { 74 | // get total file size 75 | let mut total_size: u64 = 0; 76 | for path in paths.iter() { 77 | let metadata = std::fs::metadata(path)?; 78 | total_size += metadata.len(); 79 | } 80 | 81 | // get row counts 82 | let path_refs: Vec<&std::path::Path> = 83 | paths.iter().map(|path_buf| path_buf.as_path()).collect(); 84 | let row_counts = tbl_core::parquet::get_parquet_row_counts(&path_refs).await?; 85 | 86 | // print total summary 87 | println!( 88 | "{} rows stored in {} across {} tabular files", 89 | tbl_core::formats::format_with_commas(row_counts.iter().sum()) 90 | .green() 91 | .bold(), 92 | tbl_core::formats::format_bytes(total_size).green().bold(), 93 | tbl_core::formats::format_with_commas(paths.len() as u64) 94 | .green() 95 | .bold() 96 | ); 97 | 98 | Ok(()) 99 | } 100 | -------------------------------------------------------------------------------- /crates/tbl-cli/src/styles.rs: -------------------------------------------------------------------------------- 1 | use toolstr::Colorize; 2 | use toolstr_colored::ColoredString; 3 | 4 | pub(crate) fn get_styles() -> clap::builder::Styles { 5 | let white = anstyle::Color::Rgb(anstyle::RgbColor(255, 255, 255)); 6 | let green = anstyle::Color::Rgb(anstyle::RgbColor(0, 225, 0)); 7 | let grey = anstyle::Color::Rgb(anstyle::RgbColor(170, 170, 170)); 8 | let title = anstyle::Style::new().bold().fg_color(Some(green)); 9 | let arg = anstyle::Style::new().bold().fg_color(Some(white)); 10 | let comment = anstyle::Style::new().fg_color(Some(grey)); 11 | clap::builder::Styles::styled() 12 | .header(title) 13 | .error(comment) 14 | .usage(title) 15 | .literal(arg) 16 | .placeholder(comment) 17 | .valid(title) 18 | .invalid(comment) 19 | } 20 | 21 | pub(crate) trait FontStyle { 22 | fn colorize_background(self) -> ColoredString; 23 | fn colorize_title(self) -> ColoredString; 24 | fn colorize_comment(self) -> ColoredString; 25 | fn colorize_string(self) -> ColoredString; 26 | fn colorize_constant(self) -> ColoredString; 27 | fn colorize_function(self) -> ColoredString; 28 | fn colorize_variable(self) -> ColoredString; 29 | } 30 | 31 | impl FontStyle for &str { 32 | fn colorize_background(self) -> ColoredString { 33 | self.truecolor(40, 42, 54) 34 | } 35 | 36 | fn colorize_title(self) -> ColoredString { 37 | self.truecolor(206, 147, 249).bold() 38 | } 39 | 40 | fn colorize_comment(self) -> ColoredString { 41 | self.truecolor(98, 114, 164) 42 | } 43 | 44 | fn colorize_string(self) -> ColoredString { 45 | self.truecolor(241, 250, 140) 46 | } 47 | 48 | fn colorize_constant(self) -> ColoredString { 49 | self.truecolor(185, 242, 159) 50 | } 51 | 52 | fn colorize_function(self) -> ColoredString { 53 | self.truecolor(139, 233, 253) 54 | } 55 | 56 | fn colorize_variable(self) -> ColoredString { 57 | self.truecolor(100, 170, 170) 58 | } 59 | } 60 | 61 | use inquire::ui::{Attributes, Color, IndexPrefix, RenderConfig, StyleSheet, Styled}; 62 | 63 | pub(crate) fn get_render_config() -> RenderConfig<'static> { 64 | let highlight_color = Color::DarkGreen; 65 | 66 | let mut render_config = RenderConfig::default(); 67 | render_config.prompt = StyleSheet::new().with_attr(Attributes::BOLD); 68 | render_config.prompt_prefix = Styled::new("").with_fg(Color::LightRed); 69 | 
render_config.answered_prompt_prefix = Styled::new("").with_fg(Color::LightRed); 70 | render_config.placeholder = StyleSheet::new().with_fg(Color::LightRed); 71 | render_config.selected_option = Some(StyleSheet::new().with_fg(highlight_color)); 72 | render_config.highlighted_option_prefix = Styled::new("→").with_fg(highlight_color); 73 | render_config.selected_checkbox = Styled::new("☑").with_fg(highlight_color); 74 | render_config.scroll_up_prefix = Styled::new("⇞"); 75 | render_config.scroll_down_prefix = Styled::new("⇟"); 76 | render_config.unselected_checkbox = Styled::new("☐"); 77 | render_config.option_index_prefix = IndexPrefix::Simple; 78 | render_config.error_message = render_config 79 | .error_message 80 | .with_prefix(Styled::new("❌").with_fg(Color::LightRed)); 81 | render_config.answer = StyleSheet::new() 82 | .with_attr(Attributes::BOLD) 83 | .with_fg(highlight_color); 84 | let grey = Color::Rgb { 85 | r: 100, 86 | g: 100, 87 | b: 100, 88 | }; 89 | render_config.help_message = StyleSheet::new() 90 | .with_fg(grey) 91 | .with_attr(Attributes::ITALIC); 92 | 93 | render_config 94 | } 95 | -------------------------------------------------------------------------------- /crates/tbl-core/src/formats.rs: -------------------------------------------------------------------------------- 1 | use colored::Colorize; 2 | 3 | /// format bytes 4 | pub fn format_bytes(bytes: u64) -> String { 5 | let units = ["B", "KB", "MB", "GB", "TB", "PB", "EB"]; 6 | let mut size = bytes as f64; 7 | let mut unit = 0; 8 | 9 | while size >= 1024.0 && unit < units.len() - 1 { 10 | size /= 1024.0; 11 | unit += 1; 12 | } 13 | 14 | format!("{:.2} {}", size, units[unit]) 15 | } 16 | 17 | /// format number with commas 18 | pub fn format_with_commas(number: u64) -> String { 19 | let num_str = number.to_string(); 20 | let mut result = String::new(); 21 | let mut count = 0; 22 | 23 | for c in num_str.chars().rev() { 24 | if count == 3 { 25 | result.push(','); 26 | count = 0; 27 | } 28 | result.push(c); 29 | count += 1; 30 | } 31 | 32 | result.chars().rev().collect() 33 | } 34 | 35 | const TITLE_R: u8 = 0; 36 | const TITLE_G: u8 = 225; 37 | const TITLE_B: u8 = 0; 38 | const ERROR_R: u8 = 225; 39 | const ERROR_G: u8 = 0; 40 | const ERROR_B: u8 = 0; 41 | 42 | /// print header 43 | pub fn print_header<A: AsRef<str>>(header: A) { 44 | let header_str = header.as_ref().white().bold(); 45 | let underline = "─" 46 | .repeat(header_str.len()) 47 | .truecolor(TITLE_R, TITLE_G, TITLE_B); 48 | println!("{}", header_str); 49 | println!("{}", underline); 50 | } 51 | 52 | /// print header error 53 | pub fn print_header_error<A: AsRef<str>>(header: A) { 54 | let header_str = header.as_ref().white().bold(); 55 | let underline = "─" 56 | .repeat(header_str.len()) 57 | .truecolor(ERROR_R, ERROR_G, ERROR_B); 58 | println!("{}", header_str); 59 | println!("{}", underline); 60 | } 61 | 62 | /// print bullet as `- key` 63 | pub fn print_bullet_key<A: AsRef<str>>(key: A) { 64 | let bullet_str = "- ".truecolor(TITLE_R, TITLE_G, TITLE_B); 65 | let key_str = key.as_ref().white().bold(); 66 | println!("{}{}", bullet_str, key_str); 67 | } 68 | 69 | /// print bullet as `- key: value` 70 | pub fn print_bullet<A: AsRef<str>, B: AsRef<str>>(key: A, value: B) { 71 | let bullet_str = "- ".truecolor(TITLE_R, TITLE_G, TITLE_B); 72 | let key_str = key.as_ref().white().bold(); 73 | let value_str = value.as_ref().truecolor(170, 170, 170); 74 | let colon_str = ": ".truecolor(TITLE_R, TITLE_G, TITLE_B); 75 | println!("{}{}{}{}", bullet_str, key_str, colon_str, value_str); 76 | } 77 | 78 | /// print bullet as `- key
(value)` 79 | pub fn print_bullet_parenthetical<A: AsRef<str>, B: AsRef<str>>(key: A, value: B) { 80 | let bullet_str = "- ".truecolor(TITLE_R, TITLE_G, TITLE_B); 81 | let key_str = key.as_ref().white().bold(); 82 | let value_str = value.as_ref().truecolor(170, 170, 170); 83 | println!("{}{} ({})", bullet_str, key_str, value_str); 84 | } 85 | 86 | /// print bullet as `  - key: value` 87 | pub fn print_bullet_indent<A: AsRef<str>, B: AsRef<str>>(key: A, value: B, indent: usize) { 88 | let bullet_str = "- ".truecolor(TITLE_R, TITLE_G, TITLE_B); 89 | let key_str = key.as_ref().white().bold(); 90 | let value_str = value.as_ref().truecolor(170, 170, 170); 91 | let colon_str = ": ".truecolor(TITLE_R, TITLE_G, TITLE_B); 92 | println!( 93 | "{}{}{}{}{}", 94 | " ".repeat(indent), 95 | bullet_str, 96 | key_str, 97 | colon_str, 98 | value_str 99 | ); 100 | } 101 | 102 | /// print bullet as `  - key` 103 | pub fn print_bullet_key_indent<A: AsRef<str>>(key: A, indent: usize) { 104 | let bullet_str = "- ".truecolor(TITLE_R, TITLE_G, TITLE_B); 105 | let key_str = key.as_ref().white().bold(); 106 | println!("{}{}{}", " ".repeat(indent), bullet_str, key_str,); 107 | } 108 | -------------------------------------------------------------------------------- /crates/tbl-core/src/parquet/parquet_drop.rs: -------------------------------------------------------------------------------- 1 | use arrow::datatypes::Schema; 2 | use arrow::record_batch::RecordBatch; 3 | use futures::stream::StreamExt; 4 | use parquet::arrow::arrow_writer::ArrowWriter; 5 | use parquet::arrow::async_reader::ParquetRecordBatchStreamBuilder; 6 | use parquet::file::properties::WriterProperties; 7 | use std::io::BufWriter; 8 | use std::path::PathBuf; 9 | use std::sync::Arc; 10 | use tokio::fs::File; 11 | use tokio::io::AsyncWriteExt; 12 | 13 | /// create temporary path target similar to the final target path 14 | pub fn create_tmp_target(path: &std::path::Path) -> PathBuf { 15 | let mut new_path = path.to_path_buf(); 16 | let suffix = "_tmp"; 17 | if let Some(stem) = path.file_stem() { 18 | let mut new_stem = stem.to_string_lossy().into_owned(); 19 | new_stem.push_str(suffix); 20 | if let Some(extension) = path.extension() { 21 | new_stem.push('.'); 22 | new_stem.push_str(&extension.to_string_lossy()); 23 | } 24 | new_path.set_file_name(new_stem); 25 | } 26 | 27 | new_path 28 | } 29 | 30 | /// drop columns from parquet file 31 | pub async fn drop_parquet_columns( 32 | input_path: PathBuf, 33 | output_path: PathBuf, 34 | columns_to_drop: Vec<String>, 35 | batch_size: usize, 36 | ) -> Result<(), crate::TblError> { 37 | let input_file = File::open(input_path).await?; 38 | let tmp_output_path = create_tmp_target(output_path.as_path()); 39 | let mut output_file = File::create(&tmp_output_path).await?; 40 | let builder = ParquetRecordBatchStreamBuilder::new(input_file) 41 | .await?
42 | .with_batch_size(batch_size); 43 | let mut reader_stream = builder.build()?; 44 | let original_schema = reader_stream.schema().clone(); 45 | 46 | // Create new schema without dropped columns 47 | let new_schema = Arc::new(Schema::new( 48 | original_schema 49 | .fields() 50 | .iter() 51 | .filter_map(|field| { 52 | if !columns_to_drop.contains(field.name()) { 53 | Some(field.clone()) 54 | } else { 55 | None 56 | } 57 | }) 58 | .collect::<Vec<_>>(), 59 | )); 60 | 61 | let writer_props = WriterProperties::builder().build(); 62 | let mut buffer = Vec::new(); 63 | let mut arrow_writer = ArrowWriter::try_new( 64 | BufWriter::new(&mut buffer), 65 | new_schema.clone(), 66 | Some(writer_props), 67 | )?; 68 | 69 | while let Some(batch) = reader_stream.next().await { 70 | let batch = batch?; 71 | let new_columns = batch 72 | .columns() 73 | .iter() 74 | .enumerate() 75 | .filter_map(|(i, col)| { 76 | if !columns_to_drop.contains(original_schema.field(i).name()) { 77 | Some(col.clone()) 78 | } else { 79 | None 80 | } 81 | }) 82 | .collect::<Vec<_>>(); 83 | 84 | let new_batch = RecordBatch::try_new(new_schema.clone(), new_columns)?; 85 | arrow_writer.write(&new_batch)?; 86 | } 87 | 88 | arrow_writer.close()?; 89 | output_file.write_all(&buffer).await?; 90 | output_file.flush().await?; 91 | 92 | std::fs::rename(tmp_output_path, output_path)?; 93 | 94 | Ok(()) 95 | } 96 | 97 | /// drop columns from multiple parquet files 98 | pub async fn drop_parquets_columns( 99 | input_output_paths: Vec<(PathBuf, PathBuf)>, 100 | columns_to_drop: Vec<String>, 101 | batch_size: usize, 102 | max_concurrent: usize, 103 | ) -> Result<(), crate::TblError> { 104 | let semaphore = Arc::new(tokio::sync::Semaphore::new(max_concurrent)); 105 | 106 | let results = futures::stream::iter(input_output_paths) 107 | .map(|(input, output)| { 108 | let columns_to_drop = columns_to_drop.clone(); 109 | let sem = Arc::clone(&semaphore); 110 | async move { 111 | let _permit = sem.acquire().await?; 112 | drop_parquet_columns(input, output, columns_to_drop, batch_size).await 113 | } 114 | }) 115 | .buffer_unordered(max_concurrent) 116 | .collect::<Vec<_>>() 117 | .await; 118 | 119 | // Check if any operations failed 120 | for result in results { 121 | result?; 122 | } 123 | 124 | Ok(()) 125 | } 126 | -------------------------------------------------------------------------------- /crates/tbl-cli/src/summary.rs: -------------------------------------------------------------------------------- 1 | use crate::{DataArgs, OutputMode, TblCliError}; 2 | use std::path::{Path, PathBuf}; 3 | use tbl_core::formats::{print_bullet, print_header}; 4 | 5 | pub(crate) async fn print_summary( 6 | inputs_and_outputs: &[(Vec<PathBuf>, Option<PathBuf>)], 7 | output_mode: &OutputMode, 8 | args: &DataArgs, 9 | ) -> Result<(), TblCliError> { 10 | let mut n_input_files = 0; 11 | let mut all_input_files = Vec::new(); 12 | let mut _n_output_files = 0; 13 | for (input_files, output_file) in inputs_and_outputs.iter() { 14 | n_input_files += input_files.len(); 15 | all_input_files.extend(input_files.iter().map(|p| p.as_path())); 16 | if output_file.is_some() { 17 | _n_output_files += 1; 18 | } 19 | } 20 | 21 | // compute total size of input files 22 | let n_input_bytes = tbl_core::filesystem::get_total_bytes_of_files(&all_input_files).await?; 23 | 24 | print_input_summary(n_input_files, &all_input_files, n_input_bytes, args); 25 | println!(); 26 | println!(); 27 | print_transform_summary(args); 28 | println!(); 29 | println!(); 30 | print_output_mode_summary(n_input_files, output_mode, args); 31 | Ok(()) 32 | }
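// [editor's note] A minimal usage sketch, not part of the original source: `print_summary` is the entry point of this module, and cli/subcommands/data.rs invokes it before any processing, roughly as `let io = gather_inputs_and_outputs(&output_mode, &args)?; crate::summary::print_summary(&io, &output_mode, &args).await?;` — see the real call site in data.rs below.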
33 | 34 | fn print_input_summary( 35 | n_input_files: usize, 36 | input_files: &[&Path], 37 | n_input_bytes: u64, 38 | _args: &DataArgs, 39 | ) { 40 | print_header("Inputs"); 41 | print_bullet( 42 | "n_input_bytes", 43 | tbl_core::formats::format_bytes(n_input_bytes), 44 | ); 45 | print_bullet( 46 | "n_input_files", 47 | tbl_core::formats::format_with_commas(n_input_files as u64), 48 | ); 49 | 50 | let n_show_files = 10; 51 | for path in input_files.iter().take(n_show_files) { 52 | let path: String = path.to_string_lossy().to_string(); 53 | tbl_core::formats::print_bullet_key_indent(path, 4); 54 | } 55 | if input_files.len() > n_show_files { 56 | tbl_core::formats::print_bullet_key_indent("...", 4); 57 | } 58 | } 59 | 60 | fn print_transform_summary(args: &DataArgs) { 61 | print_header("Transformations"); 62 | let mut transforming = false; 63 | if let Some(with_columns) = &args.with_columns { 64 | print_bullet("adding columns", format!("{:?}", with_columns)); 65 | transforming = true; 66 | } 67 | if let Some(filter) = &args.filter { 68 | print_bullet("filtering rows", format!("{:?}", filter)); 69 | transforming = true; 70 | } 71 | if let Some(drop) = &args.drop { 72 | print_bullet("dropping columns", format!("{:?}", drop)); 73 | transforming = true; 74 | } 75 | if let Some(cast) = &args.cast { 76 | print_bullet("casting types", format!("{:?}", cast)); 77 | transforming = true; 78 | } 79 | if !transforming { 80 | println!("[no transformations]"); 81 | } 82 | } 83 | 84 | fn print_output_mode_summary(n_input_files: usize, output_mode: &OutputMode, args: &DataArgs) { 85 | print_header("Outputs"); 86 | match output_mode { 87 | OutputMode::PrintToStdout => { 88 | print_bullet("output_mode", "PRINT_TO_STDOUT"); 89 | let summary = format!("loading {} files and printing to stdout", n_input_files); 90 | print_bullet("summary", summary); 91 | } 92 | OutputMode::SaveToSingleFile => { 93 | print_bullet("output_mode", "SAVE_TO_ONE_FILE"); 94 | let summary = format!( 95 | "loading {} files and merging result into 1 output file", 96 | n_input_files 97 | ); 98 | print_bullet("summary", summary); 99 | if let Some(output_file) = &args.output_file { 100 | print_bullet("output_file", output_file.to_string_lossy()); 101 | } 102 | } 103 | OutputMode::SaveToDirectory => { 104 | print_bullet("output_mode", "SAVE_TO_NEW_DIR"); 105 | let summary = format!( 106 | "loading {} files and saving results to new directory", 107 | n_input_files 108 | ); 109 | print_bullet("summary", summary); 110 | if let Some(output_dir) = &args.output_dir { 111 | print_bullet("output_dir", output_dir.to_string_lossy()); 112 | } 113 | } 114 | OutputMode::ModifyInplace => { 115 | print_bullet("output_mode", "MODIFY_INPLACE"); 116 | let summary = format!("modifying {} files in-place", n_input_files); 117 | print_bullet("summary", summary); 118 | } 119 | OutputMode::Partition => { 120 | print_bullet("output_mode", "REPARTITION"); 121 | let summary = format!("repartitioning {} files", n_input_files); 122 | print_bullet("summary", summary); 123 | } 124 | OutputMode::InteractiveLf => { 125 | print_bullet("output_mode", "INTERACTIVE"); 126 | let summary = format!( 127 | "starting interactive session, loading {} files into LazyFrame", 128 | n_input_files 129 | ); 130 | print_bullet("summary", summary); 131 | } 132 | OutputMode::InteractiveDf => { 133 | print_bullet("output_mode", "INTERACTIVE"); 134 | let summary = format!( 135 | "starting interactive session, loading {} files into DataFrame", 136 | n_input_files 137 | ); 138 |
print_bullet("summary", summary); 139 | } 140 | } 141 | } 142 | -------------------------------------------------------------------------------- /crates/tbl-cli/src/cli/subcommands/data.rs: -------------------------------------------------------------------------------- 1 | use crate::{DataArgs, OutputMode, TblCliError}; 2 | use std::path::PathBuf; 3 | use tbl_core::filesystem::{get_input_paths, get_output_paths, OutputPathSpec}; 4 | 5 | pub(crate) async fn data_command(args: DataArgs) -> Result<(), TblCliError> { 6 | inquire::set_global_render_config(crate::styles::get_render_config()); 7 | 8 | // decide output mode 9 | let output_mode = decide_output_mode(&args)?; 10 | 11 | // create input output pairs 12 | let io = gather_inputs_and_outputs(&output_mode, &args)?; 13 | 14 | // print data summary 15 | if !args.no_summary { 16 | crate::summary::print_summary(&io, &output_mode, &args).await?; 17 | } 18 | 19 | // exit early as needed 20 | exit_early_if_needed(args.dry, args.confirm, !args.no_summary, &output_mode, &io); 21 | 22 | // process each input output pair 23 | for (input_paths, output_path) in io.into_iter() { 24 | process_io(input_paths, output_path, &output_mode, &args)? 25 | } 26 | 27 | Ok(()) 28 | } 29 | 30 | fn decide_output_mode(args: &DataArgs) -> Result { 31 | match ( 32 | args.inplace, 33 | &args.output_file, 34 | &args.output_dir, 35 | &args.partition, 36 | args.df, 37 | args.lf, 38 | ) { 39 | (false, None, None, None, false, false) => Ok(OutputMode::PrintToStdout), 40 | (true, None, None, None, false, false) => Ok(OutputMode::ModifyInplace), 41 | (false, Some(_), None, None, false, false) => Ok(OutputMode::SaveToSingleFile), 42 | (false, None, Some(_), None, false, false) => Ok(OutputMode::SaveToDirectory), 43 | (false, None, _, Some(_), false, false) => Ok(OutputMode::Partition), 44 | (false, None, None, None, true, false) => Ok(OutputMode::InteractiveDf), 45 | (false, None, None, None, false, true) => Ok(OutputMode::InteractiveLf), 46 | _ => Err(TblCliError::Error( 47 | "can only specify one output mode".to_string(), 48 | )), 49 | } 50 | } 51 | 52 | #[allow(clippy::type_complexity)] 53 | fn gather_inputs_and_outputs( 54 | output_mode: &OutputMode, 55 | args: &DataArgs, 56 | ) -> Result, Option)>, TblCliError> { 57 | // parse input output pairs 58 | let mut io = Vec::new(); 59 | match output_mode { 60 | OutputMode::PrintToStdout 61 | | OutputMode::Partition 62 | | OutputMode::InteractiveLf 63 | | OutputMode::InteractiveDf => { 64 | let input_paths = get_input_paths(&args.paths, args.tree, true)?; 65 | io.push((input_paths, None)) 66 | } 67 | OutputMode::SaveToSingleFile => { 68 | let input_paths = get_input_paths(&args.paths, args.tree, true)?; 69 | io.push((input_paths, args.output_file.clone())) 70 | } 71 | OutputMode::ModifyInplace => { 72 | let input_paths = get_input_paths(&args.paths, args.tree, true)?; 73 | for input_path in input_paths.into_iter() { 74 | io.push(([input_path.clone()].to_vec(), Some(input_path))) 75 | } 76 | } 77 | OutputMode::SaveToDirectory => { 78 | if let Some(output_dir) = args.output_dir.clone() { 79 | let _ = std::fs::create_dir(output_dir); 80 | }; 81 | let output_spec = OutputPathSpec { 82 | inputs: args.paths.clone(), 83 | output_dir: args.output_dir.clone(), 84 | tree: args.tree, 85 | file_prefix: args.output_prefix.clone(), 86 | file_postfix: args.output_postfix.clone(), 87 | sort: true, 88 | }; 89 | let (input_paths, output_paths) = get_output_paths(output_spec)?; 90 | for (input_path, output_path) in 
input_paths.into_iter().zip(output_paths) { 91 | io.push(([input_path].to_vec(), Some(output_path))) 92 | } 93 | } 94 | }; 95 | 96 | // filter empty io pairs 97 | let io = io 98 | .into_iter() 99 | .filter(|(inputs, _)| !inputs.is_empty()) 100 | .collect(); 101 | 102 | Ok(io) 103 | } 104 | 105 | fn exit_early_if_needed( 106 | dry: bool, 107 | confirm: bool, 108 | summary: bool, 109 | output_mode: &OutputMode, 110 | io: &[(Vec<PathBuf>, Option<PathBuf>)], 111 | ) { 112 | // exit if performing dry run 113 | if dry { 114 | if summary { 115 | println!(); 116 | println!(); 117 | tbl_core::formats::print_header("Data") 118 | } 119 | println!("[dry run, exiting]"); 120 | std::process::exit(0); 121 | } 122 | 123 | // exit if no files selected 124 | if io.is_empty() { 125 | if summary { 126 | println!(); 127 | println!(); 128 | tbl_core::formats::print_header("Data") 129 | } 130 | println!("[no tabular files selected]"); 131 | std::process::exit(0) 132 | }; 133 | 134 | // exit if user does not confirm write operations 135 | if output_mode.writes_to_disk() & !confirm { 136 | if summary { 137 | println!(); 138 | println!(); 139 | } 140 | let prompt = "continue? "; 141 | if let Ok(true) = inquire::Confirm::new(prompt).with_default(false).prompt() { 142 | } else { 143 | println!("[exiting]"); 144 | std::process::exit(0) 145 | } 146 | } 147 | } 148 | 149 | fn process_io( 150 | input_paths: Vec<PathBuf>, 151 | output_path: Option<PathBuf>, 152 | output_mode: &OutputMode, 153 | args: &DataArgs, 154 | ) -> Result<(), TblCliError> { 155 | // create lazy frame 156 | let lf = tbl_core::parquet::create_lazyframe(&input_paths)?; 157 | 158 | // transform into output frames 159 | let lf = crate::transform::apply_transformations(lf, args)?; 160 | 161 | // output data 162 | crate::output::output_lazyframe(lf, input_paths, output_path, output_mode, args) 163 | } 164 | -------------------------------------------------------------------------------- /crates/tbl-cli/src/output.rs: -------------------------------------------------------------------------------- 1 | use crate::styles::FontStyle; 2 | use crate::{DataArgs, OutputMode, TblCliError}; 3 | use color_print::cstr; 4 | use polars::prelude::*; 5 | use std::io::stdout; 6 | use std::path::PathBuf; 7 | use toolstr::Colorize; 8 | 9 | pub(crate) fn output_lazyframe( 10 | lf: LazyFrame, 11 | input_paths: Vec<PathBuf>, 12 | output_path: Option<PathBuf>, 13 | output_mode: &OutputMode, 14 | args: &DataArgs, 15 | ) -> Result<(), TblCliError> { 16 | match output_mode { 17 | OutputMode::PrintToStdout => print_lazyframe(lf, args), 18 | OutputMode::SaveToSingleFile => save_lf_to_disk(lf, output_path, args), 19 | OutputMode::SaveToDirectory => save_lf_to_disk(lf, output_path, args), 20 | OutputMode::ModifyInplace => save_lf_to_disk(lf, output_path, args), 21 | OutputMode::Partition => partition_data(lf, input_paths, args), 22 | OutputMode::InteractiveLf => enter_interactive_session(lf, input_paths, args), 23 | OutputMode::InteractiveDf => enter_interactive_session(lf, input_paths, args), 24 | } 25 | } 26 | 27 | fn print_lazyframe(lf: LazyFrame, args: &DataArgs) -> Result<(), TblCliError> { 28 | let df = lf.collect()?; 29 | 30 | let mut df = match args.hex { 31 | true => binary_to_hex(&mut df.clone())?, 32 | false => df, 33 | }; 34 | 35 | if !args.no_summary { 36 | println!(); 37 | println!(); 38 | tbl_core::formats::print_header("Data"); 39 | }; 40 | 41 | let n_show = match &args.n { 42 | Some(n) if n == "all" => df.height(), 43 | Some(n) => n.parse::<usize>()?, 44 | None => 20, 45 | }; 46 | let n_missing = if df.height() >= n_show {
df.height() - n_show 48 | } else { 49 | 0 50 | }; 51 | 52 | if args.csv { 53 | let df = binary_to_hex(&mut df)?; 54 | print_dataframe_as_csv(&df, n_show)?; 55 | } else if args.json | args.jsonl { 56 | let df = binary_to_hex(&mut df)?; 57 | print_dataframe_as_json(&df, n_show, args.jsonl)?; 58 | } else { 59 | let df = df.head(Some(n_show)); 60 | println!("{}", df); 61 | }; 62 | 63 | if n_missing > 0 { 64 | println!( 65 | "{} rows omitted, use {} to show all rows", 66 | n_missing.to_string().colorize_constant().bold(), 67 | cstr!("-n all") 68 | ); 69 | } 70 | 71 | Ok(()) 72 | } 73 | 74 | fn print_dataframe_as_csv(df: &DataFrame, n: usize) -> Result<(), PolarsError> { 75 | let mut writer = CsvWriter::new(stdout()); 76 | let df: DataFrame = df.head(Some(n)); 77 | writer.finish(&mut df.clone()) 78 | } 79 | 80 | fn print_dataframe_as_json(df: &DataFrame, n: usize, jsonl: bool) -> Result<(), PolarsError> { 81 | let mut writer = JsonWriter::new(stdout()); 82 | 83 | if !jsonl { 84 | writer = writer.with_json_format(polars::prelude::JsonFormat::Json); 85 | }; 86 | 87 | let df: DataFrame = df.head(Some(n)); 88 | let result = writer.finish(&mut df.clone()); 89 | 90 | if !jsonl { 91 | println!() 92 | }; 93 | 94 | result 95 | } 96 | 97 | fn binary_to_hex(df: &mut DataFrame) -> Result<DataFrame, PolarsError> { 98 | let mut df = df.clone(); 99 | 100 | let binary_columns: Vec<String> = df 101 | .get_columns() 102 | .iter() 103 | .filter_map(|s| { 104 | if matches!(s.dtype(), DataType::Binary) { 105 | Some(s.name().to_string()) 106 | } else { 107 | None 108 | } 109 | }) 110 | .collect(); 111 | 112 | for col_name in binary_columns { 113 | let hex_col_with_prefix = df 114 | .clone() 115 | .lazy() 116 | .select(&[ 117 | concat_str([lit("0x"), col(&col_name).binary().hex_encode()], "", true) 118 | .alias(&col_name), 119 | ]) 120 | .collect()? 121 | .column(&col_name)? 122 | .clone(); 123 | 124 | df = df.with_column(hex_col_with_prefix)?.clone(); 125 | } 126 | 127 | Ok(df) 128 | } 129 | 130 | fn save_lf_to_disk( 131 | lf: LazyFrame, 132 | output_path: Option<PathBuf>, 133 | args: &DataArgs, 134 | ) -> Result<(), TblCliError> { 135 | let output_path = match output_path { 136 | Some(output_path) => output_path, 137 | None => return Err(TblCliError::Error("no output path specified".to_string())), 138 | }; 139 | 140 | // Create a temporary path by appending "_tmp" to the original path 141 | let tmp_path = output_path.with_file_name(format!( 142 | "{}_tmp", 143 | output_path 144 | .file_name() 145 | .ok_or_else(|| TblCliError::Error("File name is missing".to_string()))? 146 | .to_str() 147 | .ok_or_else(|| TblCliError::Error("File name is not valid UTF-8".to_string()))?
148 | )); 149 | 150 | // Write to the temporary file 151 | if output_path.extension().map_or(false, |ext| ext == "csv") || args.csv { 152 | let options = CsvWriterOptions::default(); 153 | lf.sink_csv(&tmp_path, options)?; 154 | } else if output_path.extension().map_or(false, |ext| ext == "json") || args.json { 155 | let options = JsonWriterOptions::default(); 156 | lf.sink_json(&tmp_path, options)?; 157 | } else { 158 | let options = ParquetWriteOptions::default(); 159 | let result = lf.clone().sink_parquet(&tmp_path, options); 160 | if result.is_err() { 161 | // sink_parquet() is still missing some options, so if it fails use backup 162 | let file = std::fs::File::create(&tmp_path)?; 163 | let writer = ParquetWriter::new(file) 164 | .with_compression(ParquetCompression::Snappy) 165 | .with_statistics(StatisticsOptions { 166 | min_value: true, 167 | max_value: true, 168 | distinct_count: true, 169 | null_count: true, 170 | }); 171 | writer.finish(&mut lf.clone().collect()?)?; 172 | } 173 | }; 174 | 175 | // Move the temporary file to the final output path 176 | std::fs::rename(&tmp_path, &output_path).map_err(|e| TblCliError::Error(e.to_string()))?; 177 | 178 | Ok(()) 179 | } 180 | 181 | fn partition_data( 182 | _lf: LazyFrame, 183 | _input_paths: Vec<PathBuf>, 184 | _args: &DataArgs, 185 | ) -> Result<(), TblCliError> { 186 | Err(TblCliError::Error( 187 | "partition functionality not implemented".to_string(), 188 | )) 189 | } 190 | 191 | fn enter_interactive_session( 192 | _lf: LazyFrame, 193 | input_paths: Vec<PathBuf>, 194 | args: &DataArgs, 195 | ) -> Result<(), TblCliError> { 196 | crate::python::load_df_interactive(input_paths, args.lf, args.executable.clone()) 197 | } 198 | -------------------------------------------------------------------------------- /crates/tbl-core/src/parquet/parquet_summary.rs: -------------------------------------------------------------------------------- 1 | use crate::TblError; 2 | use futures::stream::{self, StreamExt}; 3 | use parquet::arrow::async_reader::ParquetRecordBatchStreamBuilder; 4 | use polars::prelude::*; 5 | use std::collections::HashMap; 6 | 7 | /// get the number of rows in a parquet file 8 | pub async fn get_parquet_row_count(path: &std::path::Path) -> Result<u64, TblError> { 9 | let file = tokio::fs::File::open(path).await?; 10 | let builder = ParquetRecordBatchStreamBuilder::new(file) 11 | .await? 12 | .with_batch_size(1); 13 | let file_metadata = builder.metadata().file_metadata(); 14 | Ok(file_metadata.num_rows() as u64) 15 | } 16 | 17 | /// get the number of rows in multiple parquet files 18 | pub async fn get_parquet_row_counts(paths: &[&std::path::Path]) -> Result<Vec<u64>, TblError> { 19 | let row_counts = stream::iter(paths) 20 | .map(|path| get_parquet_row_count(path)) 21 | .buffered(10) 22 | .collect::<Vec<Result<u64, TblError>>>() 23 | .await; 24 | 25 | row_counts 26 | .into_iter() 27 | .collect::<Result<Vec<u64>, TblError>>() 28 | } 29 | 30 | /// get parquet schema 31 | pub async fn get_parquet_schema(path: &std::path::Path) -> Result<Arc<Schema>, TblError> { 32 | let path = path.to_path_buf(); 33 | tokio::task::spawn_blocking(move || { 34 | let scan_args = ScanArgsParquet::default(); 35 | let mut lf = LazyFrame::scan_parquet(path, scan_args)?; 36 | let schema = lf.schema()?; 37 | Ok(schema) 38 | }) 39 | .await?
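// [editor's note] `LazyFrame::scan_parquet` and `schema()` are blocking polars calls, so they run on tokio's blocking thread pool via the `spawn_blocking` above; the awaited `JoinError` converts into `TblError::TokioJoinError` through its `#[from]` impl.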
40 | } 41 | 42 | /// get parquet schemas 43 | pub async fn get_parquet_schemas( 44 | paths: &[std::path::PathBuf], 45 | ) -> Result<Vec<Arc<Schema>>, TblError> { 46 | let schemas = stream::iter(paths) 47 | .map(|path| get_parquet_schema(path)) 48 | .buffered(10) 49 | .collect::<Vec<Result<Arc<Schema>, TblError>>>() 50 | .await; 51 | 52 | schemas 53 | .into_iter() 54 | .collect::<Result<Vec<Arc<Schema>>, TblError>>() 55 | } 56 | 57 | /// TabularSummary 58 | #[derive(Clone, Default)] 59 | pub struct TabularSummary { 60 | /// n_files 61 | pub n_files: u64, 62 | /// n_bytes_compressed 63 | pub n_bytes_compressed: u64, 64 | /// n_bytes_uncompressed 65 | pub n_bytes_uncompressed: u64, 66 | /// n_rows 67 | pub n_rows: u64, 68 | /// schema 69 | pub schema: Arc<Schema>, 70 | /// columns 71 | pub columns: Vec<TabularColumnSummary>, 72 | } 73 | 74 | /// TabularColumnSummary 75 | #[derive(Default, Clone, Debug)] 76 | pub struct TabularColumnSummary { 77 | /// n_bytes_compressed 78 | pub n_bytes_compressed: u64, 79 | /// n_bytes_uncompressed 80 | pub n_bytes_uncompressed: u64, 81 | // /// n_null 82 | // pub n_null: u64, 83 | // /// n_unique 84 | // pub n_unique: u64, 85 | // pub min_value 86 | // pub max_value 87 | } 88 | 89 | /// get summary of parquet file 90 | pub async fn get_parquet_summary(path: &std::path::Path) -> Result<TabularSummary, TblError> { 91 | let metadata = std::fs::metadata(path)?; 92 | let n_bytes_compressed = metadata.len(); 93 | let n_rows = get_parquet_row_count(path).await?; 94 | let schema = get_parquet_schema(path).await?; 95 | 96 | let parquet_metadata = get_parquet_metadata(path).await?; 97 | let columns = get_parquet_column_summaries(parquet_metadata.clone()).await?; 98 | let n_bytes_uncompressed = get_parquet_n_bytes_uncompressed(parquet_metadata); 99 | 100 | Ok(TabularSummary { 101 | n_files: 1, 102 | n_bytes_compressed, 103 | n_bytes_uncompressed, 104 | n_rows, 105 | schema, 106 | columns, 107 | }) 108 | } 109 | 110 | /// get parquet file metadata 111 | pub async fn get_parquet_metadata( 112 | path: &std::path::Path, 113 | ) -> Result<Arc<parquet::file::metadata::ParquetMetaData>, TblError> { 114 | let file = tokio::fs::File::open(path).await?; 115 | let builder = ParquetRecordBatchStreamBuilder::new(file) 116 | .await?
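// the batch size is inert in this function: the builder is used only to read the file footer, and no record batches are ever decoded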
117 | .with_batch_size(1); 118 | Ok(builder.metadata().clone()) 119 | } 120 | 121 | /// get parquet uncompressed bytes 122 | pub fn get_parquet_n_bytes_uncompressed( 123 | metadata: Arc<parquet::file::metadata::ParquetMetaData>, 124 | ) -> u64 { 125 | metadata 126 | .row_groups() 127 | .iter() 128 | .map(|rg| rg.total_byte_size() as u64) 129 | .sum::<u64>() 130 | } 131 | 132 | /// get column summaries for parquet file 133 | pub async fn get_parquet_column_summaries( 134 | metadata: Arc<parquet::file::metadata::ParquetMetaData>, 135 | ) -> Result<Vec<TabularColumnSummary>, TblError> { 136 | let n_columns = metadata 137 | .row_groups() 138 | .first() 139 | .map(|rg| rg.columns().len()) 140 | .unwrap_or(0); 141 | let mut columns: Vec<TabularColumnSummary> = vec![TabularColumnSummary::default(); n_columns]; 142 | for rg in metadata.row_groups() { 143 | for (column, column_metadata) in columns.iter_mut().zip(rg.columns()) { 144 | column.n_bytes_compressed += column_metadata.compressed_size() as u64; 145 | column.n_bytes_uncompressed += column_metadata.uncompressed_size() as u64; 146 | } 147 | } 148 | Ok(columns) 149 | } 150 | 151 | /// get parquet summaries 152 | pub async fn get_parquet_summaries( 153 | paths: &[std::path::PathBuf], 154 | ) -> Result<Vec<TabularSummary>, TblError> { 155 | let schemas = stream::iter(paths) 156 | .map(|path| get_parquet_summary(path)) 157 | .buffered(10) 158 | .collect::<Vec<Result<TabularSummary, TblError>>>() 159 | .await; 160 | 161 | schemas 162 | .into_iter() 163 | .collect::<Result<Vec<TabularSummary>, TblError>>() 164 | } 165 | 166 | /// combine tabular summaries 167 | pub fn combine_tabular_summaries( 168 | summaries: &[&TabularSummary], 169 | include_columns: bool, 170 | ) -> Result<TabularSummary, TblError> { 171 | let mut total_summary = TabularSummary::default(); 172 | for (s, summary) in summaries.iter().enumerate() { 173 | if s == 0 { 174 | total_summary.schema = summary.schema.clone(); 175 | } 176 | total_summary.n_files += summary.n_files; 177 | total_summary.n_bytes_compressed += summary.n_bytes_compressed; 178 | total_summary.n_bytes_uncompressed += summary.n_bytes_uncompressed; 179 | total_summary.n_rows += summary.n_rows; 180 | if include_columns { 181 | total_summary.columns = combine_tabular_columns_summaries( 182 | total_summary.columns.as_slice(), 183 | summary.columns.as_slice(), 184 | )?; 185 | } 186 | } 187 | Ok(total_summary) 188 | } 189 | 190 | fn combine_tabular_columns_summaries( 191 | lhs: &[TabularColumnSummary], 192 | rhs: &[TabularColumnSummary], 193 | ) -> Result<Vec<TabularColumnSummary>, TblError> { 194 | if lhs.is_empty() { 195 | Ok(rhs.to_vec()) 196 | } else if rhs.is_empty() { 197 | Ok(lhs.to_vec()) 198 | } else if lhs.len() != rhs.len() { 199 | Err(TblError::SchemaError( 200 | "different number of columns".to_string(), 201 | )) 202 | } else { 203 | Ok(lhs 204 | .iter() 205 | .zip(rhs.iter()) 206 | .map(|(lhs, rhs)| combine_tabular_column_summary(lhs, rhs)) 207 | .collect()) 208 | } 209 | } 210 | 211 | fn combine_tabular_column_summary( 212 | lhs: &TabularColumnSummary, 213 | rhs: &TabularColumnSummary, 214 | ) -> TabularColumnSummary { 215 | TabularColumnSummary { 216 | n_bytes_compressed: lhs.n_bytes_compressed + rhs.n_bytes_compressed, 217 | n_bytes_uncompressed: lhs.n_bytes_uncompressed + rhs.n_bytes_uncompressed, 218 | } 219 | } 220 | 221 | /// summarize by schema 222 | pub fn summarize_by_schema( 223 | summaries: &[&TabularSummary], 224 | ) -> Result<HashMap<Arc<Schema>, TabularSummary>, TblError> { 225 | let mut by_schema: HashMap<Arc<Schema>, Vec<&TabularSummary>> = HashMap::new(); 226 | for summary in summaries.iter() { 227 | by_schema 228 | .entry(summary.schema.clone()) 229 | .or_default() 230 | .push(summary) 231 | } 232 | by_schema 233 | .into_iter() 234 | .map(|(k, v)| combine_tabular_summaries(v.as_slice(),
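// `true` = include_columns: merge the per-column byte statistics as well as the file-level totals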
true).map(|combined| (k, combined))) 235 | .collect() 236 | } 237 | -------------------------------------------------------------------------------- /crates/tbl-core/src/parquet/parquet_insert.rs: -------------------------------------------------------------------------------- 1 | use crate::TblError; 2 | use arrow::array::{ArrayRef, StringArray}; 3 | use arrow::array::{BinaryArray, BooleanArray, UInt32Array, UInt64Array}; 4 | use arrow::datatypes::{DataType, Field, Schema}; 5 | use arrow::record_batch::RecordBatch; 6 | use futures::stream::{self}; 7 | use futures::StreamExt; 8 | use hex; 9 | use parquet::arrow::arrow_writer::ArrowWriter; 10 | use parquet::arrow::async_reader::ParquetRecordBatchStreamBuilder; 11 | use parquet::file::properties::WriterProperties; 12 | use std::io::BufWriter as StdBufWriter; 13 | use std::path::{Path, PathBuf}; 14 | use std::sync::Arc; 15 | use tokio::fs::File; 16 | use tokio::io::AsyncWriteExt; 17 | use tokio::sync::Semaphore; 18 | 19 | /// insert columns into multiple parquet files 20 | #[allow(clippy::too_many_arguments)] 21 | pub async fn insert_parquets_columns( 22 | inputs: &[PathBuf], 23 | outputs: &[PathBuf], 24 | column_names: Vec<String>, 25 | column_dtypes: Vec<DataType>, 26 | default_values: Option<Vec<String>>, 27 | index: Option<Vec<usize>>, 28 | batch_size: usize, 29 | max_concurrent: usize, 30 | ) -> Result<(), TblError> { 31 | if inputs.len() != outputs.len() { 32 | return Err(TblError::Error( 33 | "Number of inputs must match number of outputs".to_string(), 34 | )); 35 | } 36 | 37 | let semaphore = Arc::new(Semaphore::new(max_concurrent)); 38 | 39 | let results = stream::iter(inputs.iter().zip(outputs.iter())) 40 | .map(|(input, output)| { 41 | let sem_clone = semaphore.clone(); 42 | let column_names = column_names.clone(); 43 | let column_dtypes = column_dtypes.clone(); 44 | let default_values = default_values.clone(); 45 | let index = index.clone(); 46 | 47 | async move { 48 | let _permit = sem_clone 49 | .acquire() 50 | .await 51 | .map_err(|e| TblError::Error(e.to_string()))?; 52 | 53 | insert_parquet_columns( 54 | input, 55 | output, 56 | column_names, 57 | column_dtypes, 58 | default_values, 59 | index, 60 | batch_size, 61 | ) 62 | .await 63 | } 64 | }) 65 | .buffer_unordered(max_concurrent) 66 | .collect::<Vec<_>>() 67 | .await; 68 | 69 | // Check if any of the operations resulted in an error 70 | for result in results { 71 | result?; 72 | } 73 | 74 | Ok(()) 75 | } 76 | 77 | /// Insert columns into a parquet file 78 | pub async fn insert_parquet_columns( 79 | input: &Path, 80 | output: &Path, 81 | column_names: Vec<String>, 82 | column_dtypes: Vec<DataType>, 83 | default_values: Option<Vec<String>>, 84 | index: Option<Vec<usize>>, 85 | batch_size: usize, 86 | ) -> Result<(), TblError> { 87 | if column_names.len() != column_dtypes.len() { 88 | return Err(TblError::Error( 89 | "Column names and dtypes must have the same length".to_string(), 90 | )); 91 | } 92 | 93 | if let Some(ref default_values) = default_values { 94 | if default_values.len() != column_names.len() { 95 | return Err(TblError::Error( 96 | "Default values must have the same length as column names and dtypes".to_string(), 97 | )); 98 | } 99 | } 100 | 101 | if let Some(ref index_values) = index { 102 | if index_values.len() != column_names.len() { 103 | return Err(TblError::Error( 104 | "Index values must have the same length as column names and dtypes".to_string(), 105 | )); 106 | } 107 | } 108 | 109 | let input_file = File::open(&input).await?; 110 | let builder = ParquetRecordBatchStreamBuilder::new(input_file) 111 | .await?
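// read the input as a stream of record batches of `batch_size` rows instead of materializing the whole table at once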
112 | .with_batch_size(batch_size); 113 | let mut reader_stream = builder.build()?; 114 | let original_schema = reader_stream.schema(); 115 | 116 | // Create new schema with inserted columns 117 | let mut new_fields = original_schema.fields().to_vec(); 118 | let insert_positions = index.unwrap_or_else(|| { 119 | (0..column_names.len()) 120 | .map(|i| new_fields.len() + i) 121 | .collect() 122 | }); 123 | for (i, (name, dtype)) in column_names.iter().zip(column_dtypes.iter()).enumerate() { 124 | let pos = insert_positions[i]; 125 | new_fields.insert(pos, Arc::new(Field::new(name, dtype.clone(), true))); 126 | } 127 | let new_schema = Arc::new(Schema::new(new_fields)); 128 | 129 | let tmp_output_path = super::parquet_drop::create_tmp_target(output); 130 | let mut output_file = File::create(&tmp_output_path).await?; 131 | let mut buffer = Vec::new(); 132 | 133 | let writer_props = WriterProperties::builder().build(); 134 | let mut arrow_writer = ArrowWriter::try_new( 135 | StdBufWriter::new(&mut buffer), 136 | new_schema.clone(), 137 | Some(writer_props), 138 | )?; 139 | 140 | while let Some(batch) = reader_stream.next().await { 141 | let batch = batch?; 142 | let mut new_columns = batch.columns().to_vec(); 143 | 144 | for (i, dtype) in column_dtypes.iter().enumerate() { 145 | let pos = insert_positions[i]; 146 | let default_value = default_values.as_ref().map(|values| values[i].as_str()); 147 | let new_column = create_new_column(batch.num_rows(), dtype, default_value)?; 148 | new_columns.insert(pos, new_column); 149 | } 150 | 151 | let new_batch = RecordBatch::try_new(new_schema.clone(), new_columns)?; 152 | arrow_writer.write(&new_batch)?; 153 | } 154 | 155 | arrow_writer.close()?; 156 | output_file.write_all(&buffer).await?; 157 | output_file.flush().await?; 158 | tokio::fs::rename(tmp_output_path, output).await?; 159 | 160 | Ok(()) 161 | } 162 | 163 | fn create_new_column( 164 | len: usize, 165 | dtype: &DataType, 166 | default_value: Option<&str>, 167 | ) -> Result<ArrayRef, TblError> { 168 | match dtype { 169 | DataType::Int32 => { 170 | let value = default_value 171 | .map(|v| v.parse::<i32>().map_err(|e| TblError::Error(e.to_string()))) 172 | .transpose()?; 173 | Ok(Arc::new(arrow::array::Int32Array::from(vec![value; len]))) 174 | } 175 | DataType::Int64 => { 176 | let value = default_value 177 | .map(|v| v.parse::<i64>().map_err(|e| TblError::Error(e.to_string()))) 178 | .transpose()?; 179 | Ok(Arc::new(arrow::array::Int64Array::from(vec![value; len]))) 180 | } 181 | DataType::UInt32 => { 182 | let value = default_value 183 | .map(|v| v.parse::<u32>().map_err(|e| TblError::Error(e.to_string()))) 184 | .transpose()?; 185 | Ok(Arc::new(UInt32Array::from(vec![value; len]))) 186 | } 187 | DataType::UInt64 => { 188 | let value = default_value 189 | .map(|v| v.parse::<u64>().map_err(|e| TblError::Error(e.to_string()))) 190 | .transpose()?; 191 | Ok(Arc::new(UInt64Array::from(vec![value; len]))) 192 | } 193 | DataType::Float32 => { 194 | let value = default_value 195 | .map(|v| v.parse::<f32>().map_err(|e| TblError::Error(e.to_string()))) 196 | .transpose()?; 197 | Ok(Arc::new(arrow::array::Float32Array::from(vec![value; len]))) 198 | } 199 | DataType::Float64 => { 200 | let value = default_value 201 | .map(|v| v.parse::<f64>().map_err(|e| TblError::Error(e.to_string()))) 202 | .transpose()?; 203 | Ok(Arc::new(arrow::array::Float64Array::from(vec![value; len]))) 204 | } 205 | DataType::Utf8 => { 206 | let value = default_value.unwrap_or(""); 207 | Ok(Arc::new(StringArray::from(vec![value; len]))) 208 | } 209 | DataType::Binary => { 210 |
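// binary defaults are expected as "0x"-prefixed hex strings; they are decoded to raw bytes below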
let value = default_value 211 | .map(|v| { 212 | if let Some(stripped) = v.strip_prefix("0x") { 213 | hex::decode(stripped).map_err(|e| TblError::Error(e.to_string())) 214 | } else { 215 | Err(TblError::Error( 216 | "Binary default value must start with '0x'".to_string(), 217 | )) 218 | } 219 | }) 220 | .transpose()? 221 | .unwrap_or_else(Vec::new); 222 | Ok(Arc::new(BinaryArray::from(vec![ 223 | Some(value.as_slice()); 224 | len 225 | ]))) 226 | } 227 | DataType::Boolean => { 228 | let value = default_value 229 | .map(|v| { 230 | v.parse::<bool>() 231 | .map_err(|e| TblError::Error(e.to_string())) 232 | }) 233 | .transpose()?; 234 | Ok(Arc::new(BooleanArray::from(vec![value; len]))) 235 | } 236 | // Add more data types as needed 237 | _ => Err(TblError::Error(format!( 238 | "Unsupported data type: {:?}", 239 | dtype 240 | ))), 241 | } 242 | } 243 | -------------------------------------------------------------------------------- /crates/tbl-cli/src/cli/subcommands/schema.rs: -------------------------------------------------------------------------------- 1 | use crate::styles::FontStyle; 2 | use crate::{SchemaArgs, TblCliError}; 3 | use polars::prelude::*; 4 | use std::collections::HashMap; 5 | use std::path::PathBuf; 6 | use std::sync::Arc; 7 | use tbl_core::formats::{format_bytes, format_with_commas}; 8 | use tbl_core::parquet::{combine_tabular_summaries, summarize_by_schema, TabularSummary}; 9 | use toolstr::Colorize; 10 | 11 | pub(crate) async fn schema_command(args: SchemaArgs) -> Result<(), TblCliError> { 12 | // get schemas 13 | let paths = tbl_core::filesystem::get_input_paths(&args.paths, args.tree, true)?; 14 | let summaries = tbl_core::parquet::get_parquet_summaries(&paths).await?; 15 | let ref_summaries: Vec<&tbl_core::parquet::TabularSummary> = summaries.iter().collect(); 16 | let by_schema = summarize_by_schema(ref_summaries.as_slice())?; 17 | 18 | // summarize entire set 19 | let total_summary = combine_tabular_summaries(&ref_summaries, false)?; 20 | 21 | // clear common prefix 22 | let paths = if args.absolute { 23 | paths 24 | } else { 25 | let common_prefix = tbl_core::filesystem::get_common_prefix(&paths)?; 26 | let mut new_paths = Vec::new(); 27 | for path in paths { 28 | new_paths.push(path.strip_prefix(&common_prefix)?.to_owned()) 29 | } 30 | new_paths 31 | }; 32 | 33 | // collect example paths for each schema 34 | let n_example_paths = 3; 35 | let example_paths = if args.examples { 36 | let mut example_paths = HashMap::<Arc<Schema>, Vec<PathBuf>>::new(); 37 | for (path, summary) in paths.iter().zip(summaries.iter()) { 38 | example_paths 39 | .entry(Arc::clone(&summary.schema)) 40 | .or_default() 41 | .push(path.clone()); 42 | } 43 | Some(example_paths) 44 | } else { 45 | None 46 | }; 47 | 48 | // decide how many schemas to show 49 | let n_to_show = std::cmp::min(args.n.unwrap_or(3), by_schema.len()); 50 | 51 | // decide what to sort by 52 | let sort_by = match args.sort.as_str() { 53 | "rows" => SortSchemasBy::Rows, 54 | "bytes" => SortSchemasBy::Bytes, 55 | "files" => SortSchemasBy::Files, 56 | _ => { 57 | return Err(TblCliError::Arg( 58 | "must sort by rows, bytes, or files".to_string(), 59 | )) 60 | } 61 | }; 62 | 63 | // print output 64 | print_schemas( 65 | by_schema, 66 | total_summary, 67 | n_to_show, 68 | sort_by, 69 | n_example_paths, 70 | example_paths, 71 | )?; 72 | 73 | Ok(()) 74 | } 75 | 76 | fn count_unique_schemas(schemas: &Vec<&Arc<Schema>>) -> HashMap<Arc<Schema>, usize> { 77 | let mut schema_counts = HashMap::new(); 78 | 79 | for schema in schemas { 80 | let counter =
schema_counts.entry(Arc::clone(schema)).or_insert(0); 81 | *counter += 1; 82 | } 83 | 84 | schema_counts 85 | } 86 | 87 | pub(crate) enum SortSchemasBy { 88 | Files, 89 | Bytes, 90 | Rows, 91 | } 92 | 93 | impl std::fmt::Display for SortSchemasBy { 94 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 95 | let s = match self { 96 | SortSchemasBy::Files => "files", 97 | SortSchemasBy::Bytes => "bytes", 98 | SortSchemasBy::Rows => "rows", 99 | }; 100 | write!(f, "{}", s) 101 | } 102 | } 103 | 104 | fn top_n_schemas( 105 | schema_summaries: HashMap<Arc<Schema>, TabularSummary>, 106 | n_to_show: usize, 107 | sort_by: SortSchemasBy, 108 | ) -> Vec<TabularSummary> { 109 | let mut summaries: Vec<_> = schema_summaries.values().cloned().collect(); 110 | match sort_by { 111 | SortSchemasBy::Rows => summaries.sort_by(|a, b| b.n_rows.cmp(&a.n_rows)), 112 | SortSchemasBy::Files => summaries.sort_by(|a, b| b.n_files.cmp(&a.n_files)), 113 | SortSchemasBy::Bytes => { 114 | summaries.sort_by(|a, b| b.n_bytes_compressed.cmp(&a.n_bytes_compressed)) 115 | } 116 | } 117 | summaries.into_iter().take(n_to_show).collect() 118 | } 119 | 120 | fn print_schemas( 121 | schema_summaries: HashMap<Arc<Schema>, TabularSummary>, 122 | total_summary: TabularSummary, 123 | n_to_show: usize, 124 | sort_by: SortSchemasBy, 125 | n_example_paths: usize, 126 | example_paths: Option<HashMap<Arc<Schema>, Vec<PathBuf>>>, 127 | ) -> Result<(), TblCliError> { 128 | let n_schemas = schema_summaries.len(); 129 | 130 | // print summary 131 | let schema_word = if n_schemas == 1 { "schema" } else { "schemas" }; 132 | println!( 133 | "{} unique {}, {} rows, {} files, {}", 134 | format_with_commas(n_schemas as u64).green().bold(), 135 | schema_word, 136 | format_with_commas(total_summary.n_rows).green().bold(), 137 | format_with_commas(total_summary.n_files).green().bold(), 138 | format_bytes(total_summary.n_bytes_compressed) 139 | .green() 140 | .bold(), 141 | ); 142 | println!(); 143 | if n_schemas > 1 { 144 | println!( 145 | "showing top {} schemas by number of {}:", 146 | format!("{}", n_to_show).green().bold(), 147 | sort_by, 148 | ); 149 | println!(); 150 | if example_paths.is_some() { 151 | println!(); 152 | }; 153 | } 154 | 155 | // print top schemas 156 | let format = toolstr::NumberFormat::new().percentage().precision(2); 157 | let top_n = top_n_schemas(schema_summaries, n_to_show, sort_by); 158 | for (i, summary) in top_n.into_iter().enumerate() { 159 | let file_percent = (summary.n_files as f64) / (total_summary.n_files as f64); 160 | let file_percent = format.format(file_percent)?; 161 | 162 | let row_percent = if total_summary.n_rows == 0 { 163 | 0.0 164 | } else { 165 | (summary.n_rows as f64) / (total_summary.n_rows as f64) 166 | }; 167 | let row_percent = format.format(row_percent)?; 168 | 169 | let byte_percent = if total_summary.n_bytes_compressed == 0 { 170 | 0.0 171 | } else { 172 | (summary.n_bytes_compressed as f64) / (total_summary.n_bytes_compressed as f64) 173 | }; 174 | let byte_percent = format.format(byte_percent)?; 175 | 176 | if n_schemas > 1 { 177 | println!( 178 | "{} {}{} {} rows ({}), {} files ({}), {} ({})", 179 | "Schema".colorize_title(), 180 | format!("{}", i + 1).green().bold(), 181 | ":".colorize_title(), 182 | format_with_commas(summary.n_rows).green().bold(), 183 | row_percent.green().bold(), 184 | format_with_commas(summary.n_files).green().bold(), 185 | file_percent.green().bold(), 186 | format_bytes(summary.n_bytes_compressed).green().bold(), 187 | byte_percent.green().bold(), 188 | ); 189 | println!(); 190 | } 191 |
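// render the column table (name, dtype, sizes) for this schema group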
print_schema(summary.schema.clone(), &summary)?; 192 | 193 | if let Some(example_paths) = example_paths.as_ref() { 194 | if let Some(paths_vec) = example_paths.get(&summary.schema) { 195 | println!(); 196 | if n_example_paths == 1 { 197 | println!("{}", "Example path".colorize_title()); 198 | } else { 199 | println!("{}", "Example paths".colorize_title()); 200 | }; 201 | for (i, path) in paths_vec.iter().take(n_example_paths).enumerate() { 202 | println!( 203 | "{} {}", 204 | format!("{}.", i + 1).colorize_variable(), 205 | path.to_string_lossy().colorize_comment() 206 | ); 207 | } 208 | } 209 | } 210 | 211 | if i < n_to_show - 1 { 212 | println!(); 213 | println!(); 214 | } 215 | } 216 | if n_to_show < n_schemas { 217 | println!(); 218 | println!( 219 | "{} more schemas not shown", 220 | format!("{}", n_schemas - n_to_show).bold().green() 221 | ) 222 | } 223 | 224 | Ok(()) 225 | } 226 | 227 | fn print_schema(schema: Arc<Schema>, summary: &TabularSummary) -> Result<(), TblCliError> { 228 | // gather data 229 | let names: Vec<String> = schema.iter_names().map(|x| x.to_string()).collect(); 230 | let dtypes: Vec<String> = schema.iter_dtypes().map(|x| x.to_string()).collect(); 231 | let uncompressed: Vec<_> = summary 232 | .columns 233 | .iter() 234 | .map(|x| format_bytes(x.n_bytes_uncompressed)) 235 | .collect(); 236 | let compressed: Vec<_> = summary 237 | .columns 238 | .iter() 239 | .map(|x| format_bytes(x.n_bytes_compressed)) 240 | .collect(); 241 | 242 | let total_disk_bytes: u64 = summary.columns.iter().map(|x| x.n_bytes_compressed).sum(); 243 | let percent_disk: Vec<_> = summary 244 | .columns 245 | .iter() 246 | .map(|x| { 247 | format!( 248 | "{:.2}%", 249 | 100.0 * (x.n_bytes_compressed as f64) / (total_disk_bytes as f64) 250 | ) 251 | }) 252 | .collect(); 253 | 254 | // build table 255 | let mut table = toolstr::Table::new(); 256 | table.add_column("column name", names)?; 257 | table.add_column("dtype", dtypes)?; 258 | table.add_column("full size", uncompressed)?; 259 | table.add_column("disk size", compressed)?; 260 | table.add_column("disk %", percent_disk)?; 261 | 262 | // create format 263 | let mut name_column = toolstr::ColumnFormatShorthand::default().name("column name"); 264 | let mut dtype_column = toolstr::ColumnFormatShorthand::default().name("dtype"); 265 | let mut uncompressed_column = toolstr::ColumnFormatShorthand::default().name("full size"); 266 | let mut compressed_column = toolstr::ColumnFormatShorthand::default().name("disk size"); 267 | let mut disk_percent_column = toolstr::ColumnFormatShorthand::default().name("disk %"); 268 | name_column.font_style = Some("".colorize_function().into()); 269 | dtype_column.font_style = Some("".colorize_variable().into()); 270 | uncompressed_column.font_style = Some("".colorize_constant().into()); 271 | compressed_column.font_style = Some("".colorize_constant().into()); 272 | disk_percent_column.font_style = Some("".colorize_constant().into()); 273 | 274 | let mut format = toolstr::TableFormat { 275 | // indent: 4, 276 | label_font_style: Some("".colorize_title().into()), 277 | border_font_style: Some("".colorize_comment().into()), 278 | ..Default::default() 279 | }; 280 | format.add_column(name_column); 281 | format.add_column(dtype_column); 282 | format.add_column(compressed_column); 283 | format.add_column(uncompressed_column); 284 | format.add_column(disk_percent_column); 285 | 286 | // print table 287 | format.print(table)?; 288 | 289 | Ok(()) 290 | } 291 | --------------------------------------------------------------------------------
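A quick way to exercise the `schema` subcommand implemented above (an illustrative invocation only; the glob is hypothetical, and the flags are those documented in the help text further below):

```bash
# group files by schema, sort the groups by row count,
# and print up to 5 groups with example paths for each
tbl schema data/*.parquet --sort rows --n 5 --examples
```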
/LICENSE-APACHE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. 
Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 
179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # tbl ┳━┳ 3 | 4 | `tbl` is a cli tool for reading and editing parquet files 5 | 6 | #### Goals of `tbl`: 7 | - be a swiss army knife for reading/editing parquet (kind of like [`jq`](https://github.com/jqlang/jq) is for JSON) 8 | - make it effortless to manage multi-file multi-schema parquet datasets 9 | - use a cli-native version of [polars](https://github.com/pola-rs/polars) syntax, so if you know python polars you already mostly know `tbl` 10 | 11 | #### Example use cases: 12 | - quickly look up schemas, row counts, and per-column storage usage 13 | - migrate from one schema to another, like add/remove/rename a column 14 | - perform these operations on multiple files in parallel 15 | 16 | 17 | To discuss `tbl`, check out the [Paradigm Data Tools](https://t.me/paradigm_data) telegram group. 18 | 19 | 20 | ## Contents 21 | 1. [Installation](#installation) 22 | 2. [Example Usage](#example-usage) 23 | 1. [Listing files](#listing-files) 24 | 2. [Looking up schemas](#looking-up-schemas) 25 | 3. [Selecting input files](#selecting-input-files) 26 | 4. [Performing edits](#performing-edits) 27 | 5. [Selecting output mode](#selecting-output-mode) 28 | 4. [API Reference](#api-reference) 29 | 1. [`tbl`](#tbl) 30 | 2. [`tbl ls`](#tbl-ls) 31 | 3. [`tbl schema`](#tbl-schema) 32 | 6. [FAQ](#faq) 33 | 1. [What is parquet?](#what-is-parquet) 34 | 2. [What other parquet cli tools exist?](#what-other-parquet-cli-tools-exist) 35 | 3. [Why use `tbl` when `duckdb` has a cli?](#why-use-tbl-when-duckdb-has-a-cli) 36 | 4. [What is the plan for `tbl`?](#what-is-the-plan-for-tbl) 37 | 38 | ## Installation 39 | 40 | ##### Install from crates.io 41 | ```bash 42 | cargo install tbl-cli 43 | ``` 44 | 45 | ##### Install from source 46 | ```bash 47 | git clone https://github.com/paradigmxyz/tbl 48 | cd tbl 49 | cargo install --path crates/tbl-cli 50 | ``` 51 | 52 | ## Example Usage 53 | 54 | ### Listing files 55 | 56 | `tbl` can list files and display their statistics, similar to the `ls` cli command. 
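For a recursive listing with the summary sorted by row count, the flags documented in the reference below can be combined (an illustrative invocation):

```bash
tbl ls --tree --sort rows
```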
57 | 58 | The command `tbl ls` produces output: 59 | 60 | ``` 61 | blocks__00000000_to_00000999.parquet 62 | blocks__00001000_to_00001999.parquet 63 | blocks__00002000_to_00002999.parquet 64 | blocks__00003000_to_00003999.parquet 65 | blocks__00004000_to_00004999.parquet 66 | blocks__00005000_to_00005999.parquet 67 | blocks__00006000_to_00006999.parquet 68 | blocks__00007000_to_00007999.parquet 69 | blocks__00008000_to_00008999.parquet 70 | blocks__00009000_to_00009999.parquet 71 | ... 19,660 files not shown 72 | 19,041,325 rows stored in 1.05 GB across 19,708 tabular files 73 | ``` 74 | 75 | See full list of `tbl ls` options [below](#tbl-ls). 76 | 77 | ### Looking up schemas 78 | 79 | `tbl` can display the schemas of parquet files. 80 | 81 | The command `tbl schema` produces output: 82 | 83 | ``` 84 | 1 unique schema, 19,041,325 rows, 19,708 files, 1.05 GB 85 | 86 | column name │ dtype │ disk size │ full size │ disk % 87 | ──────────────────┼──────────┼─────────────┼─────────────┼──────── 88 | block_hash │ binary │ 649.97 MB │ 657.93 MB │ 63.78% 89 | author │ binary │ 40.52 MB │ 40.59 MB │ 3.98% 90 | block_number │ u32 │ 76.06 MB │ 75.75 MB │ 7.46% 91 | gas_used │ u64 │ 84.23 MB │ 133.29 MB │ 8.26% 92 | extra_data │ binary │ 46.66 MB │ 76.91 MB │ 4.58% 93 | timestamp │ u32 │ 76.06 MB │ 75.75 MB │ 7.46% 94 | base_fee_per_gas │ u64 │ 41.85 MB │ 49.58 MB │ 4.11% 95 | chain_id │ u64 │ 3.74 MB │ 3.70 MB │ 0.37% 96 | ``` 97 | 98 | See full list of `tbl schema` options [below](#tbl-schema). 99 | 100 | ### Selecting input files 101 | 102 | `tbl` can operate on one file, or many files across multiple directories. 103 | 104 | These input selection options can be used with each `tbl` subcommand: 105 | 106 | | input selection | command | 107 | | --- | --- | 108 | | Select all tabular files in current directory | `tbl` (default behavior) | 109 | | Select a single file | `tbl /path/to/file.parquet` | 110 | | Select files using a glob | `tbl *.parquet` | 111 | | Select files from multiple directories | `tbl /path/to/dir1 /path/to/dir2` | 112 | | Select files recursively | `tbl /path/to/dir --tree` | 113 | 114 | ### Performing edits 115 | 116 | `tbl` can perform many different operations on the selected files: 117 | 118 | | operation | command | 119 | | --- | --- | 120 | | Rename a column | `tbl --rename old_name=new_name` | 121 | | Cast to a new type | `tbl --cast col1=u64 col2=String` | 122 | | Add new columns | `tbl --with-columns name:String date:Date=2024-01-01` | 123 | | Drop columns | `tbl --drop col1 col2 col3` | 124 | | Filter rows | `tbl --filter col1=val1`
`tbl --filter col1!=val1`
`tbl --filter "col1>val1"`
`tbl --filter "col1 `tbl --filter "col1>=val1"`
`tbl --filter "col1<=val1"` | 125 | | Sort rows | `tbl --sort col1 col2:desc` | 126 | | Select columns | `tbl --select col1 col2 col3` | 127 | 128 | See full list of transformation operations [below](#tbl). 129 | 130 | ### Selecting output mode 131 | 132 | `tbl` can output its results in many different modes: 133 | 134 | | output mode | description | command | 135 | | --- | --- | --- | 136 | | Single File | output all results to single file | `tbl --output-file /path/to/file.parquet` | 137 | | Inplace | modify each file inplace | `tbl --inplace` | 138 | | New Directory | create equivalent files in a new directory | `tbl --output-dir /path/to/dir` | 139 | | Interactive | load dataframe in interactive python session | `tbl --df` | 140 | | Stdout | output data to stdout | `tbl` (default behavior) | 141 | 142 | See full list of output options [below](#tbl). 143 | 144 | ## API Reference 145 | 146 | #### `tbl` 147 | ##### Output of `tbl -h`: 148 | 149 | ```markdown 150 | tbl is a tool for reading and editing tabular data files 151 | 152 | Usage: tbl has two modes 153 | 1. Summary mode: tbl [ls | schema] [SUMMARY_OPTIONS] 154 | 2. Data mode: tbl [DATA_OPTIONS] 155 | 156 | Get help with SUMMARY_OPTIONS using tbl [ls | schema] -h 157 | 158 | Data mode is the default mode. DATA_OPTIONS are documented below 159 | 160 | Optional Subcommands: 161 | ls Display list of tabular files, similar to the cli `ls` command 162 | schema Display table representation of each schema in the selected files 163 | 164 | General Options: 165 | -h, --help display help message 166 | -V, --version display version 167 | 168 | Input Options: 169 | [PATHS]... input path(s) to use 170 | -t, --tree recursively use all files in tree as inputs 171 | 172 | Transform Options: 173 | -c, --columns ... select only these columns [alias --select] 174 | --drop ... drop column(s) 175 | --with-columns ... insert columns, syntax NAME:TYPE [alias --with] 176 | --rename ... rename column(s), syntax OLD_NAME=NEW_NAME 177 | --cast ... change column type(s), syntax COLUMN=TYPE 178 | --set ... set column values, syntax COLUMN=VALUE 179 | --nullify ... set column values to null 180 | --filter ... filter rows by values, syntax COLUMN=VALUE 181 | --sort ... sort rows, syntax COLUMN[:desc] 182 | --head keep only the first n rows [alias --limit] 183 | --tail keep only the last n rows 184 | --offset skip the first n rows of table 185 | --value-counts compute value counts of column(s) 186 | 187 | Output Options: 188 | --no-summary skip printing a summary 189 | -n, --n number of rows to print in stdout, all for all 190 | --csv output data as csv 191 | --json output data as json 192 | --jsonl output data as json lines 193 | --hex encode binary columns as hex for output 194 | --inplace modify files in place 195 | --output-file write all data to a single new file 196 | --output-dir rewrite all files into this output directory 197 | --output-prefix prefix to add to output filenames 198 | --output-postfix postfix to add to output filenames 199 | --df load as DataFrame in interactive python session 200 | --lf load as LazyFrame in interactive python session 201 | --executable python executable to use with --df or --lf 202 | --confirm confirm that files should be edited 203 | --dry dry run without editing files 204 | 205 | Output Modes: 206 | 1. output results in single file --output-file /path/to/file.parquet 207 | 2. modify each file inplace --inplace 208 | 3. copy files into a new dir --output-dir /path/to/dir 209 | 4. 
load as interactive python --df | --lf 210 | 5. output data to stdout (default behavior) 211 | ``` 212 | 213 | #### `tbl ls` 214 | ##### Output of `tbl ls -h`: 215 | 216 | ```markdown 217 | Display list of tabular files, similar to the cli `ls` command 218 | 219 | Usage: tbl ls [OPTIONS] [PATHS]... 220 | 221 | Arguments: 222 | [PATHS]... input path(s) to use 223 | 224 | Options: 225 | -t, --tree recursively list all files in tree 226 | --absolute show absolute paths instead of relative 227 | --n number of file names to print 228 | --sort sort by number of rows, files, or bytes [default: bytes] 229 | 230 | General Options: 231 | -h, --help display help message 232 | ``` 233 | 234 | #### `tbl schema` 235 | ##### Output of `tbl schema -h`: 236 | 237 | ```markdown 238 | Display table representation of each schema in the selected files 239 | 240 | Usage: tbl schema [OPTIONS] [PATHS]... 241 | 242 | Arguments: 243 | [PATHS]... input path(s) to use 244 | 245 | Options: 246 | -t, --tree recursively list all files in tree 247 | --columns columns to print 248 | --n number of schemas to print 249 | --examples show examples 250 | --absolute show absolute paths in examples 251 | --sort sort by number of rows, files, or bytes [default: bytes] 252 | 253 | General Options: 254 | -h, --help display help message 255 | ``` 256 | 257 | ## FAQ 258 | 259 | ### What is parquet? 260 | 261 | [Parquet](https://en.wikipedia.org/wiki/Apache_Parquet) is a file format for storing tabular datasets. In many cases parquet is a simpler and faster alternative to using an actual database. Parquet has become an industry standard and its ecosystem of tools is growing rapidly. 262 | 263 | ### What other parquet cli tools exist? 264 | 265 | The most common tools are [`duckdb`](https://duckdb.org/docs/api/cli/overview), [`pqrs`](https://github.com/manojkarthick/pqrs), and [`parquet-cli`](https://github.com/apache/parquet-java/blob/master/parquet-cli/README.md). 266 | 267 | ### Why use `tbl` when `duckdb` has a cli? 268 | 269 | `duckdb` is an incredible tool. We recommend checking it out, especially when you're running complex workloads. However, there are 3 reasons you might prefer `tbl` as a cli tool: 270 | 1. **CLI-Native:** Compared to `duckdb`'s SQL, `tbl` has a cli-native syntax. This makes `tbl` simpler to use with fewer keystrokes: 271 | 1. `duckdb "DESCRIBE read_parquet('test.parquet')"` vs `tbl schema test.parquet` 272 | 2. `duckdb "SELECT * FROM read_parquet('test.parquet')"` vs `tbl test.parquet` 273 | 3. `duckdb "SELECT * FROM read_parquet('test.parquet') ORDER BY col1"` vs `tbl test.parquet --sort col1` 274 | 2. **High Level vs Low Level:** Sometimes SQL can be a very low-level language. `tbl` and `polars` let you operate on a higher level of abstraction, which reduces cognitive load: 275 | 1. `duckdb`: `duckdb "SELECT col1, COUNT(col1) FROM read_parquet('test.parquet') GROUP BY col1"` 276 | 2. `tbl`: `tbl test.parquet --value-counts col1` 277 | 3. **Operational QoL:** `tbl` is built specifically for making it easy to manage large parquet archives. Features like `--tree`, `--inplace`, and multi-schema commands make life easier for archive management. 278 | 279 | ### What is the plan for `tbl`? 280 | 281 | There are a few features that we are currently exploring: 282 | 1. **S3 and cloud buckets**: ability to read and write cloud bucket parquet files using the same operations that can be performed on local files 283 | 2. 
**Re-partitioning**: ability to change how a set of parquet files are partitioned, such as changing the partition key or partition size 284 | 3. **Direct python syntax**: ability to directly use python polars syntax to perform complex operations like `group_by()`, `join()`, and more 285 | 4. **Idempotent Workflows**: ability to interrupt and re-run commands arbitrarily, making migrations more robust 286 | -------------------------------------------------------------------------------- /crates/tbl-cli/src/cli/args.rs: -------------------------------------------------------------------------------- 1 | use super::subcommands::*; 2 | use crate::TblCliError; 3 | use clap::{Parser, Subcommand}; 4 | use color_print::cstr; 5 | use std::path::PathBuf; 6 | 7 | pub(crate) async fn run_cli() -> Result<(), TblCliError> { 8 | let args = Cli::parse(); 9 | 10 | if args.version { 11 | let version = env!("GIT_DESCRIPTION"); 12 | if version.is_empty() { 13 | println!(env!("CARGO_PKG_VERSION")); 14 | } else { 15 | println!("{}", version); 16 | } 17 | std::process::exit(0); 18 | } 19 | 20 | match args.command { 21 | Some(Subcommands::Ls(args)) => ls_command(args).await, 22 | Some(Subcommands::Schema(args)) => schema_command(args).await, 23 | Some(Subcommands::Schemas(args)) => schemas_command(args).await, 24 | _ => data_command(args.data_args).await, 25 | } 26 | } 27 | 28 | /// command line arguments for the tbl cli 29 | #[derive(Clone, Parser)] 30 | #[clap( 31 | author, 32 | about = cstr!("tbl is a tool for reading and editing tabular data files"), 33 | override_usage = cstr!("tbl has two modes 34 | 1. Summary mode: tbl [ls | schema] [SUMMARY_OPTIONS] 35 | 2. Data mode: tbl [DATA_OPTIONS] 36 | 37 | Get help with SUMMARY_OPTIONS using tbl [ls | schema] -h 38 | 39 | Data mode is the default mode. DATA_OPTIONS are documented below 40 | "), 41 | after_help = cstr!("Output Modes: 42 | 1. output results in single file --output-file /path/to/file.parquet 43 | 2. modify each file inplace --inplace 44 | 3. copy files into a new dir --output-dir /path/to/dir 45 | 4. load as interactive python --df | --lf 46 | 5. 
output data to stdout (default behavior)"), 47 | long_about = None, 48 | disable_help_subcommand = true, 49 | disable_help_flag = true, 50 | disable_version_flag = true, 51 | args_conflicts_with_subcommands = true, 52 | subcommand_help_heading = "Optional Subcommands", 53 | styles=crate::styles::get_styles() 54 | )] 55 | pub(crate) struct Cli { 56 | #[clap(subcommand)] 57 | pub(crate) command: Option<Subcommands>, 58 | 59 | /// display help message 60 | #[clap(short, long, verbatim_doc_comment, action = clap::ArgAction::HelpLong, help_heading = "General Options")] 61 | help: Option<bool>, 62 | 63 | /// display version 64 | #[clap( 65 | short = 'V', 66 | long, 67 | verbatim_doc_comment, 68 | help_heading = "General Options" 69 | )] 70 | version: bool, 71 | 72 | #[clap(flatten)] 73 | data_args: DataArgs, 74 | } 75 | 76 | /// subcommands of the tbl cli 77 | #[derive(Clone, Subcommand)] 78 | #[command()] 79 | pub(crate) enum Subcommands { 80 | /// Display list of tabular files, similar to the cli `ls` command 81 | Ls(LsArgs), 82 | 83 | /// Display table representation of each schema in the selected files 84 | Schema(SchemaArgs), 85 | 86 | /// Display single summary of all schemas 87 | #[command(hide = true)] 88 | Schemas(SchemasArgs), 89 | 90 | /// Load, transform, and output file data [default subcommand] 91 | #[command(hide = true)] 92 | Data, 93 | } 94 | 95 | /// Arguments for the `ls` subcommand 96 | #[derive(Clone, Parser)] 97 | pub(crate) struct LsArgs { 98 | /// display help message 99 | #[clap(short, long, action = clap::ArgAction::HelpLong, help_heading = "General Options")] 100 | help: Option<bool>, 101 | 102 | /// input path(s) to use 103 | #[clap()] 104 | pub(crate) paths: Option<Vec<PathBuf>>, 105 | 106 | /// recursively list all files in tree 107 | #[clap(short, long)] 108 | pub(crate) tree: bool, 109 | 110 | /// show absolute paths instead of relative 111 | #[clap(long)] 112 | pub(crate) absolute: bool, 113 | 114 | /// display bytes stats 115 | #[clap(long, hide = true)] 116 | pub(crate) bytes: bool, 117 | 118 | /// display stats of each schema group 119 | #[clap(long, hide = true)] 120 | pub(crate) stats: bool, 121 | 122 | /// number of file names to print 123 | #[clap(long)] 124 | pub(crate) n: Option<usize>, 125 | 126 | /// sort by number of rows, files, or bytes 127 | #[clap(long, default_value = "bytes")] 128 | pub(crate) sort: String, 129 | } 130 | 131 | /// Arguments for the `schema` subcommand 132 | #[derive(Clone, Parser)] 133 | pub(crate) struct SchemaArgs { 134 | /// display help message 135 | #[clap(short, long, action = clap::ArgAction::HelpLong, help_heading = "General Options")] 136 | help: Option<bool>, 137 | 138 | /// input path(s) to use 139 | #[clap()] 140 | pub(crate) paths: Option<Vec<PathBuf>>, 141 | 142 | /// recursively list all files in tree 143 | #[clap(short, long)] 144 | pub(crate) tree: bool, 145 | 146 | /// display bytes stats 147 | #[clap(long, hide = true)] 148 | pub(crate) bytes: bool, 149 | 150 | /// display stats of each schema group 151 | #[clap(long, hide = true)] 152 | pub(crate) stats: bool, 153 | 154 | /// columns to print 155 | #[clap(long)] 156 | pub(crate) columns: Option<Vec<String>>, 157 | 158 | /// number of schemas to print 159 | #[clap(long)] 160 | pub(crate) n: Option<usize>, 161 | 162 | /// show examples 163 | #[clap(long)] 164 | pub(crate) examples: bool, 165 | 166 | /// show absolute paths in examples 167 | #[clap(long)] 168 | pub(crate) absolute: bool, 169 | 170 | /// sort by number of rows, files, or bytes 171 | #[clap(long, default_value = "bytes")] 172 | pub(crate) sort: String, 173 | } 174 | 
175 | /// Arguments for the `schemas` subcommand 176 | #[derive(Clone, Parser)] 177 | pub(crate) struct SchemasArgs { 178 | /// display help message 179 | #[clap(short, long, action = clap::ArgAction::HelpLong, help_heading = "General Options")] 180 | help: Option<bool>, 181 | 182 | /// input path(s) to use 183 | #[clap()] 184 | pub(crate) paths: Option<Vec<PathBuf>>, 185 | 186 | /// recursively list all files in tree 187 | #[clap(short, long)] 188 | pub(crate) tree: bool, 189 | 190 | /// sort by number of rows, files, or bytes 191 | #[clap(long, default_value = "bytes")] 192 | pub(crate) sort: String, 193 | } 194 | 195 | /// Arguments for the `data` subcommand 196 | #[derive(Clone, Parser)] 197 | pub(crate) struct DataArgs { 198 | // 199 | // // input options 200 | // 201 | /// input path(s) to use 202 | #[clap( 203 | verbatim_doc_comment, 204 | help_heading = "Input Options", 205 | display_order = 1 206 | )] 207 | pub(crate) paths: Option<Vec<PathBuf>>, 208 | 209 | /// recursively use all files in tree as inputs 210 | #[clap(short, long, verbatim_doc_comment, help_heading = "Input Options")] 211 | pub(crate) tree: bool, 212 | 213 | // 214 | // // transform options 215 | // 216 | /// select only these columns [alias --select] 217 | #[clap( 218 | short, 219 | long, 220 | help = cstr!("select only these columns [alias --select]"), 221 | help_heading = "Transform Options", 222 | aliases = ["select"], 223 | value_name="COLUMN", 224 | num_args(1..) 225 | )] 226 | pub(crate) columns: Option<Vec<String>>, 227 | 228 | /// drop column(s) 229 | #[clap(short, long, help_heading = "Transform Options", num_args(1..))] 230 | pub(crate) drop: Option<Vec<String>>, 231 | 232 | /// add new columns, syntax NAME:TYPE [alias --with] 233 | #[clap( 234 | long, 235 | help = cstr!("insert columns, syntax NAME:TYPE [alias --with]"), 236 | help_heading = "Transform Options", 237 | value_name="NEW_COL", 238 | num_args(1..), 239 | aliases = ["with"] 240 | )] 241 | pub(crate) with_columns: Option<Vec<String>>, 242 | 243 | /// rename column(s), syntax OLD_NAME=NEW_NAME 244 | #[clap( 245 | short, 246 | long, 247 | help = cstr!("rename column(s), syntax OLD_NAME=NEW_NAME"), 248 | help_heading = "Transform Options", 249 | num_args(1..) 250 | )] 251 | pub(crate) rename: Option<Vec<String>>, 252 | 253 | /// change column type(s), syntax COLUMN=TYPE 254 | #[clap( 255 | long, 256 | help = cstr!("change column type(s), syntax COLUMN=TYPE"), 257 | help_heading = "Transform Options", 258 | num_args(1..) 259 | )] 260 | pub(crate) cast: Option<Vec<String>>, 261 | 262 | /// set column values 263 | #[clap( 264 | long, 265 | help = cstr!("set column value, syntax COLUMN=VALUE"), 266 | help_heading = "Transform Options", 267 | value_name="COLUMN", 268 | num_args(1..) 269 | )] 270 | pub(crate) set: Option<Vec<String>>, 271 | 272 | /// set column values to null 273 | #[clap( 274 | long, 275 | help_heading = "Transform Options", 276 | value_name="COLUMN", 277 | num_args(1..) 278 | )] 279 | pub(crate) nullify: Option<Vec<String>>, 280 | 281 | /// replace values of a column 282 | #[clap( 283 | long, 284 | help = cstr!("replace values, syntax COLUMN.OLD_VALUE=NEW_VALUE"), 285 | help_heading = "Transform Options", 286 | value_name="VALUE", 287 | num_args(1..) 288 | )] 289 | pub(crate) replace: Option<Vec<String>>, 290 | 291 | /// filter rows by values, syntax COLUMN=VALUE 292 | #[clap( 293 | short, 294 | long, 295 | help = cstr!("filter rows by values, syntax COLUMN=VALUE 296 | or COLUMN.is_null or COLUMN.is_not_null"), 297 | help_heading = "Transform Options", 298 | num_args(1..) 
299 | )] 300 | pub(crate) filter: Option<Vec<String>>, 301 | 302 | /// sort rows, syntax COLUMN[:desc] 303 | #[clap( 304 | short, 305 | long, 306 | help = cstr!("sort rows, syntax COLUMN[:desc]"), 307 | help_heading = "Transform Options", 308 | num_args(1..) 309 | )] 310 | pub(crate) sort: Option<Vec<String>>, 311 | 312 | /// keep only the first n rows [alias --limit] 313 | #[clap( 314 | long, 315 | help = cstr!("keep only the first n rows [alias --limit]"), 316 | help_heading = "Transform Options", 317 | aliases = ["limit"] 318 | )] 319 | pub(crate) head: Option<usize>, 320 | 321 | /// keep only the last n rows 322 | #[clap(long, help_heading = "Transform Options")] 323 | pub(crate) tail: Option<usize>, 324 | 325 | /// skip the first n rows of table 326 | #[clap(long, help_heading = "Transform Options")] 327 | pub(crate) offset: Option<usize>, 328 | 329 | /// compute value counts of column(s) 330 | #[clap(long, help_heading = "Transform Options", value_name = "COLUMN")] 331 | pub(crate) value_counts: Option<String>, 332 | 333 | // 334 | // // output options 335 | // 336 | /// skip printing a summary 337 | #[clap(long, help_heading = "Output Options")] 338 | pub(crate) no_summary: bool, 339 | 340 | /// number of rows to print in stdout, all for all 341 | #[clap( 342 | short, 343 | long, 344 | help = cstr!("number of rows to print in stdout, all for all"), 345 | help_heading = "Output Options" 346 | )] 347 | pub(crate) n: Option<String>, 348 | 349 | /// output data as csv 350 | #[clap(long, help_heading = "Output Options")] 351 | pub(crate) csv: bool, 352 | 353 | /// output data as json 354 | #[clap(long, help_heading = "Output Options")] 355 | pub(crate) json: bool, 356 | 357 | /// output data as json lines 358 | #[clap(long, help_heading = "Output Options")] 359 | pub(crate) jsonl: bool, 360 | 361 | /// encode binary columns as hex for output 362 | #[clap(long, help_heading = "Output Options")] 363 | pub(crate) hex: bool, 364 | 365 | /// modify files in place 366 | #[clap(long, help_heading = "Output Options")] 367 | pub(crate) inplace: bool, 368 | 369 | /// write all data to a single new file 370 | #[clap(long, help_heading = "Output Options", value_name = "FILE_PATH")] 371 | pub(crate) output_file: Option<PathBuf>, 372 | 373 | /// rewrite all files into this output directory 374 | #[clap(long, help_heading = "Output Options", value_name = "DIR_PATH")] 375 | pub(crate) output_dir: Option<PathBuf>, 376 | 377 | /// prefix to add to output filenames 378 | #[clap(long, help_heading = "Output Options", value_name = "PRE-FIX")] 379 | pub(crate) output_prefix: Option<String>, 380 | 381 | /// postfix to add to output filenames 382 | #[clap(long, help_heading = "Output Options", value_name = "POST-FIX")] 383 | pub(crate) output_postfix: Option<String>, 384 | 385 | /// partition output over this column 386 | #[clap( 387 | long, 388 | help_heading = "Output Options", 389 | value_name = "COLUMN", 390 | hide = true 391 | )] 392 | pub(crate) partition: Option<String>, 393 | 394 | /// partition mode, by range of values per partition 395 | #[clap( 396 | long, 397 | help_heading = "Output Options", 398 | value_name = "SIZE", 399 | hide = true 400 | )] 401 | pub(crate) partition_by_value: Option<String>, 402 | 403 | /// partition mode, by max bytes per partition 404 | #[clap( 405 | long, 406 | help_heading = "Output Options", 407 | value_name = "BYTES", 408 | hide = true 409 | )] 410 | pub(crate) partition_by_bytes: Option<u64>, 411 | 412 | /// partition mode, by max rows per partition 413 | #[clap( 414 | long, 415 | help_heading = "Output Options", 416 | value_name = "ROWS", 417 | hide = true 418 | )] 419 | pub(crate) 
420 | 
421 |     /// load as DataFrame in interactive python session
422 |     #[clap(long, help_heading = "Output Options")]
423 |     pub(crate) df: bool,
424 | 
425 |     /// load as LazyFrame in interactive python session
426 |     #[clap(long, help_heading = "Output Options")]
427 |     pub(crate) lf: bool,
428 | 
429 |     /// python executable to use with --df or --lf
430 |     #[clap(
431 |         long,
432 |         help = cstr!("python executable to use with --df or --lf"),
433 |         help_heading = "Output Options"
434 |     )]
435 |     pub(crate) executable: Option<String>,
436 | 
437 |     /// confirm that files should be edited
438 |     #[clap(long, help_heading = "Output Options")]
439 |     pub(crate) confirm: bool,
440 | 
441 |     /// dry run without editing files
442 |     #[clap(long, help_heading = "Output Options")]
443 |     pub(crate) dry: bool,
444 | }
445 | 
--------------------------------------------------------------------------------
/crates/tbl-cli/src/transform.rs:
--------------------------------------------------------------------------------
1 | use crate::{DataArgs, TblCliError};
2 | use polars::prelude::*;
3 | use std::str::FromStr;
4 | 
5 | pub(crate) fn apply_transformations(
6 |     lf: LazyFrame,
7 |     args: &DataArgs,
8 | ) -> Result<LazyFrame, TblCliError> {
9 |     let lf = apply_with_columns(lf, args.with_columns.as_deref())?;
10 |     let lf = apply_filter(lf, args.filter.as_deref())?;
11 |     let lf = apply_drop(lf, args.drop.as_deref())?;
12 |     let lf = apply_cast(lf, args.cast.as_deref())?;
13 |     let lf = apply_set(lf, args.set.as_deref())?;
14 |     let lf = apply_nullify(lf, args.nullify.as_deref())?;
15 |     let lf = apply_replace(lf, args.replace.as_deref())?;
16 |     let lf = apply_select(lf, args.columns.as_deref())?;
17 |     let lf = apply_offset(lf, args.offset)?;
18 |     let lf = apply_head(lf, args.head)?;
19 |     let lf = apply_tail(lf, args.tail)?;
20 |     let lf = apply_value_counts(lf, args.value_counts.as_deref())?;
21 |     let lf = apply_sort(lf, args.sort.as_deref())?;
22 |     let lf = apply_rename(lf, args.rename.as_deref())?;
23 |     Ok(lf)
24 | }
25 | 
26 | pub(crate) fn apply_with_columns(
27 |     lf: LazyFrame,
28 |     columns: Option<&[String]>,
29 | ) -> Result<LazyFrame, TblCliError> {
30 |     match columns {
31 |         None => Ok(lf),
32 |         Some(columns) => {
33 |             let mut new_lf = lf;
34 |             for col_spec in columns {
35 |                 new_lf = new_lf.with_column(parse_new_column_expr(col_spec)?);
36 |             }
37 |             Ok(new_lf)
38 |         }
39 |     }
40 | }
41 | 
42 | fn parse_new_column_expr(col_spec: &str) -> Result<Expr, TblCliError> {
43 |     let parts: Vec<&str> = col_spec.splitn(3, ':').collect();
44 |     if parts.len() < 2 || parts.len() > 3 {
45 |         return Err(TblCliError::Error(
46 |             "invalid format for with_column".to_string(),
47 |         ));
48 |     }
49 |     let (name, type_str) = (parts[0], parts[1]);
50 |     let value_str = parts.get(2).and_then(|s| s.split('=').nth(1));
51 |     let dtype = parse_dtype(type_str)?;
52 |     let expr = if let Some(value) = value_str {
53 |         create_value_expr(value, &dtype)?
54 |     } else {
55 |         lit(NULL).cast(dtype)
56 |     };
57 |     let expr = expr.alias(name);
58 |     Ok(expr)
59 | }
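
For concreteness, a sketch of what the `NAME:TYPE[:...=VALUE]` spec above produces; the specs here are invented for illustration, not taken from the repo:

    // "age:i64"            -> lit(NULL).cast(DataType::Int64).alias("age")   (all-null column)
    // "score:f64:init=1.5" -> lit(1.5_f64).alias("score")                    (constant column)
    // the optional third part is split on '=' and the text after the first
    // '=' is handed to create_value_expr below
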
60 | 
61 | fn parse_dtype(type_str: &str) -> Result<DataType, TblCliError> {
62 |     match type_str.to_lowercase().as_str() {
63 |         "i8" => Ok(DataType::Int8),
64 |         "i16" => Ok(DataType::Int16),
65 |         "i32" => Ok(DataType::Int32),
66 |         "i64" => Ok(DataType::Int64),
67 |         "u8" => Ok(DataType::UInt8),
68 |         "u16" => Ok(DataType::UInt16),
69 |         "u32" => Ok(DataType::UInt32),
70 |         "u64" => Ok(DataType::UInt64),
71 |         "f32" => Ok(DataType::Float32),
72 |         "f64" => Ok(DataType::Float64),
73 |         "bool" => Ok(DataType::Boolean),
74 |         "str" => Ok(DataType::String),
75 |         "date" => Ok(DataType::Date),
76 |         "datetime" => Ok(DataType::Datetime(TimeUnit::Microseconds, None)),
77 |         _ => Err(TblCliError::Error("invalid data type".to_string())),
78 |     }
79 | }
80 | 
81 | fn create_value_expr(value: &str, dtype: &DataType) -> Result<Expr, TblCliError> {
82 |     match dtype {
83 |         DataType::Int8 => Ok(lit(
84 |             i8::from_str(value).map_err(|_| TblCliError::Error(value.to_string()))?,
85 |         )),
86 |         DataType::Int16 => Ok(lit(
87 |             i16::from_str(value).map_err(|_| TblCliError::Error(value.to_string()))?,
88 |         )),
89 |         DataType::Int32 => Ok(lit(
90 |             i32::from_str(value).map_err(|_| TblCliError::Error(value.to_string()))?,
91 |         )),
92 |         DataType::Int64 => Ok(lit(
93 |             i64::from_str(value).map_err(|_| TblCliError::Error(value.to_string()))?,
94 |         )),
95 |         DataType::UInt8 => Ok(lit(
96 |             u8::from_str(value).map_err(|_| TblCliError::Error(value.to_string()))?,
97 |         )),
98 |         DataType::UInt16 => Ok(lit(
99 |             u16::from_str(value).map_err(|_| TblCliError::Error(value.to_string()))?,
100 |         )),
101 |         DataType::UInt32 => Ok(lit(
102 |             u32::from_str(value).map_err(|_| TblCliError::Error(value.to_string()))?,
103 |         )),
104 |         DataType::UInt64 => Ok(lit(
105 |             u64::from_str(value).map_err(|_| TblCliError::Error(value.to_string()))?,
106 |         )),
107 |         DataType::Float32 => Ok(lit(
108 |             f32::from_str(value).map_err(|_| TblCliError::Error(value.to_string()))?,
109 |         )),
110 |         DataType::Float64 => Ok(lit(
111 |             f64::from_str(value).map_err(|_| TblCliError::Error(value.to_string()))?,
112 |         )),
113 |         DataType::Boolean => Ok(lit(
114 |             bool::from_str(value).map_err(|_| TblCliError::Error(value.to_string()))?,
115 |         )),
116 |         DataType::String => Ok(lit(value.to_string())),
117 |         DataType::Date => {
118 |             let naive_date =
119 |                 chrono::NaiveDate::parse_from_str(value, "%Y-%m-%d").map_err(|_| {
120 |                     TblCliError::Error("Invalid date format. Use YYYY-MM-DD".to_string())
121 |                 })?;
122 |             Ok(lit(naive_date
123 |                 .and_hms_opt(0, 0, 0)
124 |                 .ok_or_else(|| TblCliError::Error("Failed to create NaiveDateTime".to_string()))?
125 |                 .and_utc()
126 |                 .timestamp_millis()))
127 |         }
128 |         DataType::Datetime(_, _) => {
129 |             let naive_datetime = chrono::NaiveDateTime::parse_from_str(value, "%Y-%m-%d %H:%M:%S")
130 |                 .map_err(|_| {
131 |                     TblCliError::Error("Invalid datetime format. Use YYYY-MM-DD HH:MM:SS".to_string())
132 |                 })?;
133 |             Ok(lit(naive_datetime.and_utc().timestamp_millis()))
134 |         }
135 |         _ => Err(TblCliError::Error("Unsupported dtype".to_string())),
136 |     }
137 | }
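
Worth noting: the Date and Datetime arms do not construct date literals directly; they parse with chrono and emit an i64 millisecond-epoch literal, which callers then cast to the target dtype. A quick check of the arithmetic, with an invented date:

    // create_value_expr("2024-01-02", &DataType::Date)
    //   -> lit(1_704_153_600_000_i64)   // ms for 2024-01-02 00:00:00 UTC
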
138 | 
139 | pub(crate) fn apply_filter(
140 |     lf: LazyFrame,
141 |     filters: Option<&[String]>,
142 | ) -> Result<LazyFrame, TblCliError> {
143 |     let schema = lf
144 |         .clone()
145 |         .schema()
146 |         .map_err(|e| TblCliError::Error(e.to_string()))?;
147 | 
148 |     match filters {
149 |         None => Ok(lf),
150 |         Some(filters) => {
151 |             let mut new_lf = lf;
152 |             for filter in filters {
153 |                 new_lf = apply_single_filter(new_lf, filter, &schema)?;
154 |             }
155 |             Ok(new_lf)
156 |         }
157 |     }
158 | }
159 | 
160 | fn apply_single_filter(
161 |     lf: LazyFrame,
162 |     filter: &str,
163 |     schema: &Schema,
164 | ) -> Result<LazyFrame, TblCliError> {
165 |     if filter.contains("!=") {
166 |         apply_comparison_filter(lf, filter, schema, "!=")
167 |     } else if filter.contains(">=") {
168 |         apply_comparison_filter(lf, filter, schema, ">=")
169 |     } else if filter.contains("<=") {
170 |         apply_comparison_filter(lf, filter, schema, "<=")
171 |     } else if filter.contains('=') {
172 |         apply_comparison_filter(lf, filter, schema, "=")
173 |     } else if filter.contains('>') {
174 |         apply_comparison_filter(lf, filter, schema, ">")
175 |     } else if filter.contains('<') {
176 |         apply_comparison_filter(lf, filter, schema, "<")
177 |     } else if filter.ends_with(".is_null") {
178 |         apply_null_filter(lf, filter, schema, true)
179 |     } else if filter.ends_with(".is_not_null") {
180 |         apply_null_filter(lf, filter, schema, false)
181 |     } else {
182 |         Err(TblCliError::Error("Invalid filter format".to_string()))
183 |     }
184 | }
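
The probe order above is load-bearing: `!=`, `>=`, and `<=` are tested before the bare `=`, `>`, and `<` they contain, so a filter such as `n>=5` cannot be misread as a plain `>` comparison. A few hypothetical filter strings and how they dispatch:

    // "price>=100"   -> apply_comparison_filter(.., ">=") -> col("price").gt_eq(lit(100))
    // "name=alice"   -> apply_comparison_filter(.., "=")  -> col("name").eq(lit("alice"))
    // "note.is_null" -> apply_null_filter(.., true)       -> col("note").is_null()
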
185 | 
186 | fn apply_comparison_filter(
187 |     lf: LazyFrame,
188 |     filter: &str,
189 |     schema: &Schema,
190 |     operator: &str,
191 | ) -> Result<LazyFrame, TblCliError> {
192 |     let parts: Vec<&str> = if operator == "=" {
193 |         filter.split('=').collect()
194 |     } else if operator == "!=" {
195 |         filter.split("!=").collect()
196 |     } else if operator == ">" {
197 |         filter.split('>').collect()
198 |     } else if operator == "<" {
199 |         filter.split('<').collect()
200 |     } else if operator == ">=" {
201 |         filter.split(">=").collect()
202 |     } else if operator == "<=" {
203 |         filter.split("<=").collect()
204 |     } else {
205 |         return Err(TblCliError::Error(format!(
206 |             "Invalid filter operator: {}",
207 |             operator
208 |         )));
209 |     };
210 | 
211 |     if parts.len() != 2 {
212 |         return Err(TblCliError::Error("Invalid filter format".to_string()));
213 |     }
214 | 
215 |     let (column, value) = (parts[0], parts[1]);
216 |     let column_type = schema
217 |         .get(column)
218 |         .ok_or_else(|| TblCliError::Error(format!("Column '{}' not found", column)))?;
219 | 
220 |     let filter_expr = match column_type {
221 |         DataType::Binary => {
222 |             if let Some(hex_value) = value.strip_prefix("0x") {
223 |                 let binary_value = hex::decode(hex_value)
224 |                     .map_err(|e| TblCliError::Error(format!("Invalid hex value: {}", e)))?;
225 |                 if operator == "=" {
226 |                     col(column).eq(lit(binary_value))
227 |                 } else if operator == "!=" {
228 |                     col(column).neq(lit(binary_value))
229 |                 } else if operator == ">" {
230 |                     col(column).gt(lit(binary_value))
231 |                 } else if operator == "<" {
232 |                     col(column).lt(lit(binary_value))
233 |                 } else if operator == ">=" {
234 |                     col(column).gt_eq(lit(binary_value))
235 |                 } else if operator == "<=" {
236 |                     col(column).lt_eq(lit(binary_value))
237 |                 } else {
238 |                     return Err(TblCliError::Error(format!(
239 |                         "Invalid filter operator: {}",
240 |                         operator
241 |                     )));
242 |                 }
243 |             } else {
244 |                 return Err(TblCliError::Error(
245 |                     "Binary value must start with 0x".to_string(),
246 |                 ));
247 |             }
248 |         }
249 |         DataType::String => {
250 |             if operator == "=" {
251 |                 col(column).eq(lit(value))
252 |             } else if operator == "!=" {
253 |                 col(column).neq(lit(value))
254 |             } else if operator == ">" {
255 |                 col(column).gt(lit(value))
256 |             } else if operator == "<" {
257 |                 col(column).lt(lit(value))
258 |             } else if operator == ">=" {
259 |                 col(column).gt_eq(lit(value))
260 |             } else if operator == "<=" {
261 |                 col(column).lt_eq(lit(value))
262 |             } else {
263 |                 return Err(TblCliError::Error(format!(
264 |                     "Invalid filter operator: {}",
265 |                     operator
266 |                 )));
267 |             }
268 |         }
269 |         DataType::UInt64 | DataType::Int64 | DataType::UInt32 | DataType::Int32 => {
270 |             let int_value = if let Some(hex_value) = value.strip_prefix("0x") {
271 |                 i64::from_str_radix(hex_value, 16)
272 |                     .map_err(|e| TblCliError::Error(format!("Invalid hex integer: {}", e)))?
273 |             } else {
274 |                 value
275 |                     .parse::<i64>()
276 |                     .map_err(|e| TblCliError::Error(format!("Invalid integer: {}", e)))?
277 |             };
278 |             if operator == "=" {
279 |                 col(column).eq(lit(int_value))
280 |             } else if operator == "!=" {
281 |                 col(column).neq(lit(int_value))
282 |             } else if operator == ">" {
283 |                 col(column).gt(lit(int_value))
284 |             } else if operator == "<" {
285 |                 col(column).lt(lit(int_value))
286 |             } else if operator == ">=" {
287 |                 col(column).gt_eq(lit(int_value))
288 |             } else if operator == "<=" {
289 |                 col(column).lt_eq(lit(int_value))
290 |             } else {
291 |                 return Err(TblCliError::Error(format!(
292 |                     "Invalid filter operator: {}",
293 |                     operator
294 |                 )));
295 |             }
296 |         }
297 |         _ => {
298 |             return Err(TblCliError::Error(format!(
299 |                 "Unsupported column type for '{}': {:?}",
300 |                 column, column_type
301 |             )))
302 |         }
303 |     };
304 | 
305 |     Ok(lf.filter(filter_expr))
306 | }
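
Hex input is accepted in two places above: Binary columns require a 0x prefix, and integer columns accept one as a convenience. With invented column names:

    // Binary column: "hash=0xdeadbeef" -> col("hash").eq(lit(vec![0xde, 0xad, 0xbe, 0xef]))
    // Int64 column:  "block>=0xff"     -> col("block").gt_eq(lit(255_i64))
    // Binary column: "hash=deadbeef"   -> Err("Binary value must start with 0x")
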
307 | 
308 | fn apply_null_filter(
309 |     lf: LazyFrame,
310 |     filter: &str,
311 |     schema: &Schema,
312 |     is_null: bool,
313 | ) -> Result<LazyFrame, TblCliError> {
314 |     let column = filter.trim_end_matches(if is_null { ".is_null" } else { ".is_not_null" });
315 | 
316 |     if schema.get(column).is_none() {
317 |         return Err(TblCliError::Error(format!("Column '{}' not found", column)));
318 |     }
319 | 
320 |     let filter_expr = if is_null {
321 |         col(column).is_null()
322 |     } else {
323 |         col(column).is_not_null()
324 |     };
325 | 
326 |     Ok(lf.filter(filter_expr))
327 | }
328 | 
329 | pub(crate) fn apply_rename(
330 |     lf: LazyFrame,
331 |     rename: Option<&[String]>,
332 | ) -> Result<LazyFrame, TblCliError> {
333 |     match rename {
334 |         None => Ok(lf),
335 |         Some(rename) => {
336 |             let (existing, new): (Vec<String>, Vec<String>) =
337 |                 rename
338 |                     .iter()
339 |                     .try_fold((Vec::new(), Vec::new()), |(mut old, mut new), r| {
340 |                         let parts: Vec<&str> = r.split('=').collect();
341 |                         if parts.len() != 2 {
342 |                             return Err(TblCliError::Error("Invalid rename format".to_string()));
343 |                         }
344 |                         old.push(parts[0].to_string());
345 |                         new.push(parts[1].to_string());
346 |                         Ok((old, new))
347 |                     })?;
348 | 
349 |             Ok(lf.rename(existing, new))
350 |         }
351 |     }
352 | }
353 | 
354 | pub(crate) fn apply_drop(
355 |     lf: LazyFrame,
356 |     columns: Option<&[String]>,
357 | ) -> Result<LazyFrame, TblCliError> {
358 |     match columns {
359 |         None => Ok(lf),
360 |         Some(columns) => Ok(lf.drop(columns)),
361 |     }
362 | }
363 | 
364 | pub(crate) fn apply_cast(lf: LazyFrame, cast: Option<&[String]>) -> Result<LazyFrame, TblCliError> {
365 |     match cast {
366 |         None => Ok(lf),
367 |         Some(cast) => {
368 |             let mut new_lf = lf;
369 |             for c in cast {
370 |                 let parts: Vec<&str> = c.split('=').collect();
371 |                 if parts.len() != 2 {
372 |                     return Err(TblCliError::Error("Invalid cast format".to_string()));
373 |                 }
374 |                 let (column, dtype_str) = (parts[0], parts[1]);
375 |                 let dtype = parse_dtype(dtype_str)?;
376 |                 new_lf = new_lf.with_column(col(column).cast(dtype));
377 |             }
378 |             Ok(new_lf)
379 |         }
380 |     }
381 | }
382 | 
383 | pub(crate) fn apply_set(lf: LazyFrame, set: Option<&[String]>) -> Result<LazyFrame, TblCliError> {
384 |     match set {
385 |         None => Ok(lf),
386 |         Some(set) => {
387 |             let mut new_lf = lf;
388 |             let schema = new_lf
389 |                 .schema()
390 |                 .map_err(|e| TblCliError::Error(e.to_string()))?;
391 | 
392 |             for s in set {
393 |                 let parts: Vec<&str> = s.split('=').collect();
394 |                 if parts.len() != 2 {
395 |                     return Err(TblCliError::Error("Invalid set format".to_string()));
396 |                 }
397 |                 let (column, value) = (parts[0], parts[1]);
398 | 
399 |                 let column_type = schema
400 |                     .get(column)
401 |                     .ok_or_else(|| TblCliError::Error(format!("Column '{}' not found", column)))?;
402 | 
403 |                 let set_expr = raw_str_to_lit(column, value, column_type)?;
404 |                 new_lf = new_lf.with_column(set_expr.cast(column_type.clone()));
405 |             }
406 |             Ok(new_lf)
407 |         }
408 |     }
409 | }
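
Because apply_set resolves each column's existing dtype from the schema before parsing the right-hand side, the same `COLUMN=VALUE` spelling works for any supported type; a sketch against a hypothetical schema:

    // schema: {status: String, retries: UInt8}
    // "status=done" -> with_column(lit("done").alias("status").cast(DataType::String))
    // "retries=3"   -> with_column(lit(3_u8).alias("retries").cast(DataType::UInt8))
    // "retries=300" -> Err("Invalid u8 value: 300")   // u8::from_str fails
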
410 | 
411 | fn raw_str_to_lit(column: &str, value: &str, dtype: &DataType) -> Result<Expr, TblCliError> {
412 |     let lit_value = match dtype {
413 |         DataType::Int8 => lit(i8::from_str(value)
414 |             .map_err(|_| TblCliError::Error(format!("Invalid i8 value: {}", value)))?),
415 |         DataType::Int16 => lit(i16::from_str(value)
416 |             .map_err(|_| TblCliError::Error(format!("Invalid i16 value: {}", value)))?),
417 |         DataType::Int32 => lit(i32::from_str(value)
418 |             .map_err(|_| TblCliError::Error(format!("Invalid i32 value: {}", value)))?),
419 |         DataType::Int64 => lit(i64::from_str(value)
420 |             .map_err(|_| TblCliError::Error(format!("Invalid i64 value: {}", value)))?),
421 |         DataType::UInt8 => lit(u8::from_str(value)
422 |             .map_err(|_| TblCliError::Error(format!("Invalid u8 value: {}", value)))?),
423 |         DataType::UInt16 => lit(u16::from_str(value)
424 |             .map_err(|_| TblCliError::Error(format!("Invalid u16 value: {}", value)))?),
425 |         DataType::UInt32 => lit(u32::from_str(value)
426 |             .map_err(|_| TblCliError::Error(format!("Invalid u32 value: {}", value)))?),
427 |         DataType::UInt64 => lit(u64::from_str(value)
428 |             .map_err(|_| TblCliError::Error(format!("Invalid u64 value: {}", value)))?),
429 |         DataType::Float32 => lit(f32::from_str(value)
430 |             .map_err(|_| TblCliError::Error(format!("Invalid f32 value: {}", value)))?),
431 |         DataType::Float64 => lit(f64::from_str(value)
432 |             .map_err(|_| TblCliError::Error(format!("Invalid f64 value: {}", value)))?),
433 |         DataType::Boolean => lit(bool::from_str(value)
434 |             .map_err(|_| TblCliError::Error(format!("Invalid boolean value: {}", value)))?),
435 |         DataType::String => lit(value.to_string()),
436 |         DataType::Date => {
437 |             let naive_date =
438 |                 chrono::NaiveDate::parse_from_str(value, "%Y-%m-%d").map_err(|_| {
439 |                     TblCliError::Error("Invalid date format. Use YYYY-MM-DD".to_string())
440 |                 })?;
441 |             lit(naive_date
442 |                 .and_hms_opt(0, 0, 0)
443 |                 .ok_or_else(|| TblCliError::Error("Failed to create NaiveDateTime".to_string()))?
444 |                 .and_utc()
445 |                 .timestamp_millis())
446 |         }
447 |         DataType::Datetime(_, _) => {
448 |             let naive_datetime = chrono::NaiveDateTime::parse_from_str(value, "%Y-%m-%d %H:%M:%S")
449 |                 .map_err(|_| {
450 |                     TblCliError::Error(
451 |                         "Invalid datetime format. Use YYYY-MM-DD HH:MM:SS".to_string(),
452 |                     )
453 |                 })?;
454 |             lit(naive_datetime.and_utc().timestamp_millis())
455 |         }
456 |         DataType::Binary => {
457 |             if let Some(hex_value) = value.strip_prefix("0x") {
458 |                 let binary_value = hex::decode(hex_value)
459 |                     .map_err(|e| TblCliError::Error(format!("Invalid hex value: {}", e)))?;
460 |                 lit(binary_value)
461 |             } else {
462 |                 return Err(TblCliError::Error(
463 |                     "Binary value must start with 0x".to_string(),
464 |                 ));
465 |             }
466 |         }
467 |         _ => {
468 |             return Err(TblCliError::Error(format!(
469 |                 "Unsupported column type for '{}': {:?}",
470 |                 column, dtype
471 |             )))
472 |         }
473 |     };
474 | 
475 |     Ok(lit_value.alias(column))
476 | }
477 | 
478 | pub(crate) fn apply_nullify(
479 |     lf: LazyFrame,
480 |     raw_columns: Option<&[String]>,
481 | ) -> Result<LazyFrame, TblCliError> {
482 |     match raw_columns {
483 |         None => Ok(lf),
484 |         Some(columns) => {
485 |             let mut new_lf = lf;
486 |             let schema = new_lf
487 |                 .schema()
488 |                 .map_err(|e| TblCliError::Error(e.to_string()))?;
489 | 
490 |             for column in columns.iter() {
491 |                 let column_type = schema
492 |                     .get(column)
493 |                     .ok_or_else(|| TblCliError::Error(format!("Column '{}' not found", column)))?;
494 |                 new_lf = new_lf.with_column(
495 |                     lit(LiteralValue::Null)
496 |                         .cast(column_type.clone())
497 |                         .alias(column),
498 |                 );
499 |             }
500 |             Ok(new_lf)
501 |         }
502 |     }
503 | }
504 | 
505 | pub(crate) fn apply_replace(
506 |     lf: LazyFrame,
507 |     raw_values: Option<&[String]>,
508 | ) -> Result<LazyFrame, TblCliError> {
509 |     match raw_values {
510 |         None => Ok(lf),
511 |         Some(values) => {
512 |             let mut new_lf = lf;
513 |             let schema = new_lf
514 |                 .schema()
515 |                 .map_err(|e| TblCliError::Error(e.to_string()))?;
516 | 
517 |             for value in values.iter() {
518 |                 // get column
519 |                 let parts: Vec<&str> = value.split('.').collect();
520 |                 if parts.len() != 2 {
521 |                     return Err(TblCliError::Error("Invalid replace format".to_string()));
522 |                 }
523 |                 let (column, before_after) = (parts[0], parts[1]);
524 | 
525 |                 // get old_value / new_value
526 |                 let parts: Vec<&str> = before_after.split('=').collect();
527 |                 if parts.len() != 2 {
528 |                     return Err(TblCliError::Error("Invalid replace format".to_string()));
529 |                 }
530 |                 let (old_value, new_value) = (parts[0], parts[1]);
531 | 
532 |                 let column_type = schema
533 |                     .get(column)
534 |                     .ok_or_else(|| TblCliError::Error(format!("Column '{}' not found", column)))?;
535 | 
536 |                 let old_expr = raw_str_to_lit(column, old_value, column_type)?;
537 |                 let new_expr = raw_str_to_lit(column, new_value, column_type)?;
538 |                 new_lf = new_lf.with_column(col(column).replace(old_expr, new_expr));
539 |             }
540 |             Ok(new_lf)
541 |         }
542 |     }
543 | }
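
The replace spec is parsed in two stages, `COLUMN.OLD=NEW`: first a split on '.', then a split on '='. Each split must yield exactly two parts, so column names containing a literal '.' (or values containing '=') will not parse; for example (invented spec):

    // "status.old=new" -> col("status").replace(lit("old").alias("status"),
    //                                           lit("new").alias("status"))
    // "a.b.c=x"        -> Err("Invalid replace format")   // split('.') gives 3 parts
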
544 | 
545 | pub(crate) fn apply_sort(
546 |     lf: LazyFrame,
547 |     raw_columns: Option<&[String]>,
548 | ) -> Result<LazyFrame, TblCliError> {
549 |     match raw_columns {
550 |         None => Ok(lf),
551 |         Some(raw_columns) => {
552 |             let mut columns: Vec<String> = Vec::new();
553 |             let mut descending: Vec<bool> = Vec::new();
554 |             for column in raw_columns.iter() {
555 |                 let column = column.to_string();
556 |                 if column.ends_with(":desc") {
557 |                     columns.push(column[..column.len() - 5].to_string());
558 |                     descending.push(true);
559 |                 } else {
560 |                     columns.push(column);
561 |                     descending.push(false);
562 |                 }
563 |             }
564 |             let options = polars::chunked_array::ops::SortMultipleOptions::default()
565 |                 .with_order_descending_multi(descending);
566 |             Ok(lf.sort(columns, options))
567 |         }
568 |     }
569 | }
570 | 
571 | pub(crate) fn apply_select(
572 |     lf: LazyFrame,
573 |     columns: Option<&[String]>,
574 | ) -> Result<LazyFrame, TblCliError> {
575 |     match columns {
576 |         None => Ok(lf),
577 |         Some(columns) => {
578 |             let exprs: Vec<Expr> = columns.iter().map(|c| col(c)).collect();
579 |             Ok(lf.select(&exprs))
580 |         }
581 |     }
582 | }
583 | 
584 | pub(crate) fn apply_head(lf: LazyFrame, n: Option<usize>) -> Result<LazyFrame, TblCliError> {
585 |     match n {
586 |         None => Ok(lf),
587 |         Some(n) => Ok(lf.slice(0, n as u32)),
588 |     }
589 | }
590 | 
591 | pub(crate) fn apply_tail(lf: LazyFrame, n: Option<usize>) -> Result<LazyFrame, TblCliError> {
592 |     match n {
593 |         None => Ok(lf),
594 |         Some(n) => Ok(lf.tail(n as u32)),
595 |     }
596 | }
597 | 
598 | pub(crate) fn apply_offset(lf: LazyFrame, n: Option<usize>) -> Result<LazyFrame, TblCliError> {
599 |     match n {
600 |         None => Ok(lf),
601 |         Some(n) => Ok(lf.slice(n as i64, u32::MAX)),
602 |     }
603 | }
604 | 
605 | pub(crate) fn apply_value_counts(lf: LazyFrame, n: Option<&str>) -> Result<LazyFrame, TblCliError> {
606 |     match n {
607 |         None => Ok(lf),
608 |         Some(column) => {
609 |             // let expr = col(column).value_counts(true, false, "count".to_string(), false);
610 |             // Ok(lf.select([expr]))
611 |             let sort_options = SortMultipleOptions::new().with_order_descending(true);
612 |             let value_counts = lf
613 |                 .group_by(&[col(column)])
614 |                 .agg(&[col(column).count().alias("count")])
615 |                 .sort(["count"], sort_options);
616 |             Ok(value_counts)
617 |         }
618 |     }
619 | }
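
Since apply_transformations (top of this file) runs offset, then head, then tail, the row-window flags compose by slicing in that order; with hypothetical row counts:

    // 100-row table:
    // --offset 10 --head 5 -> slice(10, u32::MAX) then slice(0, 5) -> rows 10..15
    // --head 5 --tail 2    -> slice(0, 5) then tail(2)             -> rows 3..5
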
620 | 
--------------------------------------------------------------------------------
/crates/tbl-core/src/filesystem/outputs.rs:
--------------------------------------------------------------------------------
1 | use crate::TblError;
2 | use std::collections::HashMap;
3 | use std::path::PathBuf;
4 | 
5 | /// output path spec
6 | #[derive(Default, Debug)]
7 | pub struct OutputPathSpec {
8 |     /// inputs
9 |     pub inputs: Option<Vec<PathBuf>>,
10 |     /// output_dir
11 |     pub output_dir: Option<PathBuf>,
12 |     /// tree
13 |     pub tree: bool,
14 |     /// file_prefix
15 |     pub file_prefix: Option<String>,
16 |     /// file_postfix
17 |     pub file_postfix: Option<String>,
18 |     /// sort
19 |     pub sort: bool,
20 | }
21 | 
22 | impl OutputPathSpec {
23 |     /// create new OutputPathSpec
24 |     pub fn new() -> Self {
25 |         OutputPathSpec::default()
26 |     }
27 | 
28 |     /// set inputs
29 |     pub fn inputs<I>(mut self, inputs: I) -> Self
30 |     where
31 |         I: Into<InputPaths>,
32 |     {
33 |         self.inputs = inputs.into().0;
34 |         self
35 |     }
36 | 
37 |     /// set output_dir
38 |     pub fn output_dir<T>(mut self, output_dir: T) -> Self
39 |     where
40 |         T: Into<OutputDirType>,
41 |     {
42 |         self.output_dir = output_dir.into().into();
43 |         self
44 |     }
45 | 
46 |     /// set tree
47 |     pub fn tree(mut self, tree: bool) -> Self {
48 |         self.tree = tree;
49 |         self
50 |     }
51 | 
52 |     /// set file_prefix
53 |     pub fn file_prefix<T>(mut self, file_prefix: T) -> Self
54 |     where
55 |         T: Into<Option<String>>,
56 |     {
57 |         self.file_prefix = file_prefix.into();
58 |         self
59 |     }
60 | 
61 |     /// set file_postfix
62 |     pub fn file_postfix<T>(mut self, file_postfix: T) -> Self
63 |     where
64 |         T: Into<Option<String>>,
65 |     {
66 |         self.file_postfix = file_postfix.into();
67 |         self
68 |     }
69 | 
70 |     /// set sort
71 |     pub fn sort(mut self, sort: bool) -> Self {
72 |         self.sort = sort;
73 |         self
74 |     }
75 | }
76 | 
77 | /// output dir type
78 | pub enum OutputDirType {
79 |     /// &str
80 |     Str(&'static str),
81 |     /// String
82 |     String(String),
83 |     /// PathBuf
84 |     PathBuf(PathBuf),
85 |     /// None
86 |     None,
87 | }
88 | 
89 | impl From<OutputDirType> for Option<PathBuf> {
90 |     fn from(output_dir: OutputDirType) -> Self {
91 |         match output_dir {
92 |             OutputDirType::Str(s) => Some(PathBuf::from(s)),
93 |             OutputDirType::String(s) => Some(PathBuf::from(s)),
94 |             OutputDirType::PathBuf(p) => Some(p),
95 |             OutputDirType::None => None,
96 |         }
97 |     }
98 | }
99 | 
100 | // Implement From for all the required types
101 | impl From<&'static str> for OutputDirType {
102 |     fn from(s: &'static str) -> Self {
103 |         OutputDirType::Str(s)
104 |     }
105 | }
106 | 
107 | impl From<String> for OutputDirType {
108 |     fn from(s: String) -> Self {
109 |         OutputDirType::String(s)
110 |     }
111 | }
112 | 
113 | impl From<PathBuf> for OutputDirType {
114 |     fn from(p: PathBuf) -> Self {
115 |         OutputDirType::PathBuf(p)
116 |     }
117 | }
118 | 
119 | impl<T> From<Option<T>> for OutputDirType
120 | where
121 |     T: Into<OutputDirType>,
122 | {
123 |     fn from(opt: Option<T>) -> Self {
124 |         match opt {
125 |             Some(v) => v.into(),
126 |             None => OutputDirType::None,
127 |         }
128 |     }
129 | }
130 | 
131 | // New wrapper type
132 | /// InputPaths
133 | pub struct InputPaths(Option<Vec<PathBuf>>);
134 | 
135 | impl From<Vec<PathBuf>> for InputPaths {
136 |     fn from(v: Vec<PathBuf>) -> Self {
137 |         InputPaths(Some(v))
138 |     }
139 | }
140 | 
141 | impl From<Option<Vec<PathBuf>>> for InputPaths {
142 |     fn from(v: Option<Vec<PathBuf>>) -> Self {
143 |         InputPaths(v)
144 |     }
145 | }
146 | 
147 | impl From<Vec<String>> for InputPaths {
148 |     fn from(v: Vec<String>) -> Self {
149 |         InputPaths(Some(v.into_iter().map(PathBuf::from).collect()))
150 |     }
151 | }
152 | 
153 | impl From<Option<Vec<String>>> for InputPaths {
154 |     fn from(v: Option<Vec<String>>) -> Self {
155 |         InputPaths(v.map(|strings| strings.into_iter().map(PathBuf::from).collect()))
156 |     }
157 | }
158 | 
159 | impl<'a> From<Vec<&'a str>> for InputPaths {
160 |     fn from(v: Vec<&'a str>) -> Self {
161 |         InputPaths(Some(v.into_iter().map(PathBuf::from).collect()))
162 |     }
163 | }
164 | 
165 | impl<'a> From<Option<Vec<&'a str>>> for InputPaths {
166 |     fn from(v: Option<Vec<&'a str>>) -> Self {
167 |         InputPaths(v.map(|strings| strings.into_iter().map(PathBuf::from).collect()))
168 |     }
169 | }
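
The InputPaths and OutputDirType wrappers exist so the builder accepts whatever the caller already has on hand. All of the following forms should compile against the impls above (paths invented for illustration):

    let spec = OutputPathSpec::new()
        .inputs(vec!["./root/data1", "./root/data2"])   // Vec<&str>
        .output_dir("./other_root")                     // &'static str
        .file_postfix("_v2".to_string());               // String -> Option<String>
    let _spec2 = OutputPathSpec::new()
        .inputs(Some(vec![PathBuf::from("./root")]))    // Option<Vec<PathBuf>>
        .output_dir(None::<String>);                    // Option<T> -> OutputDirType::None
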
170 | 
171 | /** get_output_paths() has many possible combinations of parameters
172 | 
173 | possible dimensions of inputs
174 | - dimension: with or without --tree
175 | - dimension: with or without --output-dir
176 | - dimension: with or without --inputs
177 | - dimension: single or multiple --inputs
178 | - dimension: relative or absolute --inputs
179 | - dimension: file or directory --inputs
180 | 
181 | cases that are easy:
182 | - without --inputs, without --tree, without --output-dir
183 |     - read from CWD, write outputs to CWD
184 | - without --inputs, without --tree, with --output-dir
185 |     - read from CWD, write outputs to --output-dir
186 | - without --inputs, with --tree, without --output-dir
187 |     - read from CWD, write each file in its own original dir
188 | - without --inputs, with --tree, with --output-dir
189 |     - read from CWD, write relative tree paths relative to --output-dir tree
190 | 
191 | cases that are harder:
192 | - with single file --inputs
193 |     - --tree doesn't matter
194 |     - without --output-dir: writes file to that file's dir
195 |     - with --output-dir: writes file to that dir
196 | - with single dir --inputs
197 |     - without --tree, without --output-dir
198 |         - read from the input dir, write to the input dir
199 |     - without --tree, with --output-dir
200 |         - read from the input dir, write to the output dir
201 |     - with --tree, without --output-dir
202 |         - use the input dir as tree root for both reading and writing
203 |     - with --tree, with --output-dir
204 |         - use input tree as reading tree root, output dir as writing tree root
205 | - with multiple --inputs
206 |     - just treat each input path independently
207 | 
208 | if --output-dir is used without --tree, every output goes directly in that directory
209 | if --output-dir is used with --tree, the --output-dir is used as the new tree root
210 | */
211 | pub fn get_output_paths(
212 |     // inputs: Option<Vec<PathBuf>>,
213 |     // output_dir: Option<PathBuf>,
214 |     // tree: bool,
215 |     output_spec: OutputPathSpec,
216 | ) -> Result<(Vec<PathBuf>, Vec<PathBuf>), TblError> {
217 |     // gather inputs
218 |     let output_dir = output_spec.output_dir;
219 |     let inputs = match output_spec.inputs {
220 |         None => vec![std::env::current_dir()?],
221 |         Some(inputs) => inputs,
222 |     };
223 | 
224 |     // process each input separately
225 |     let mut return_inputs: Vec<PathBuf> = Vec::new();
226 |     let mut return_outputs: Vec<PathBuf> = Vec::new();
227 |     for input in inputs {
228 |         let metadata = std::fs::metadata(&input)?;
229 |         if metadata.is_file() {
230 |             // case 1: input is a file
231 |             let output = super::manipulate::convert_file_path(
232 |                 &input,
233 |                 &output_dir,
234 |                 &output_spec.file_prefix,
235 |                 &output_spec.file_postfix,
236 |             )?;
237 |             return_inputs.push(input.clone());
238 |             return_outputs.push(output);
239 |         } else if metadata.is_dir() {
240 |             if !output_spec.tree {
241 |                 // case 2: input is a directory, non-tree mode
242 |                 for sub_input in super::gather::get_directory_tabular_files(&input)?.into_iter() {
243 |                     let output = super::manipulate::convert_file_path(
244 |                         &sub_input,
245 |                         &output_dir,
246 |                         &output_spec.file_prefix,
247 |                         &output_spec.file_postfix,
248 |                     )?;
249 |                     return_inputs.push(sub_input);
250 |                     return_outputs.push(output);
251 |                 }
252 |             } else {
253 |                 // case 3: input is a directory, tree mode
254 |                 for sub_input in super::gather::get_tree_tabular_files(&input)?.into_iter() {
255 |                     // use relative path of tree leaf, change root to output_dir if provided
256 |                     let new_path = if let Some(output_dir) = output_dir.clone() {
257 |                         let relative_path = sub_input.strip_prefix(&input)?.to_path_buf();
258 |                         output_dir.join(relative_path)
259 |                     } else {
260 |                         sub_input.clone()
261 |                     };
262 | 
263 |                     // change file prefix and postfix
264 |                     let output = super::manipulate::convert_file_path(
265 |                         &new_path,
266 |                         &None,
267 |                         &output_spec.file_prefix,
268 |                         &output_spec.file_postfix,
269 |                     )?;
270 | 
271 |                     return_inputs.push(sub_input.clone());
272 |                     return_outputs.push(output);
273 |                 }
274 |             }
275 |         } else {
276 |             return Err(TblError::Error("input is neither a file nor a directory".to_string()));
277 |         };
278 |     }
279 | 
280 |     let (return_inputs, return_outputs) = if output_spec.sort {
281 |         // Create a vector of paired inputs and outputs
282 |         let mut paired = return_inputs
283 |             .into_iter()
284 |             .zip(return_outputs)
285 |             .collect::<Vec<_>>();
286 | 
287 |         // Sort the paired vector based on the output paths
288 |         paired.sort_by(|a, b| a.1.cmp(&b.1));
289 | 
290 |         // Unzip the sorted paired vector back into separate input and output vectors
291 |         paired.into_iter().unzip()
292 |     } else {
293 |         (return_inputs, return_outputs)
294 |     };
295 | 
296 |     // check that all output paths are unique to avoid collisions
297 |     let mut count_per_output: HashMap<PathBuf, usize> = HashMap::new();
298 |     for output in return_outputs.iter() {
299 |         *count_per_output.entry(output.clone()).or_insert(0) += 1;
300 |         if count_per_output[output] > 1 {
301 |             return Err(TblError::Error(format!(
302 |                 "Duplicate output path: {:?}",
303 |                 output
304 |             )));
305 |         }
306 |     }
307 | 
308 |     Ok((return_inputs, return_outputs))
309 | }
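
Tying the cases together, a minimal end-to-end sketch (directory names invented): rewrite every tabular file found directly under ./root into ./staging, with deterministic ordering:

    let spec = OutputPathSpec::new()
        .inputs(vec!["./root"])
        .output_dir("./staging")
        .sort(true);
    let (inputs, outputs) = get_output_paths(spec)?;
    for (src, dst) in inputs.iter().zip(outputs.iter()) {
        println!("{} -> {}", src.display(), dst.display());
    }
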
310 | 
311 | /*
312 | tests
313 | for the tests, generate the following file tree:
314 | root/
315 |     super_data_a.parquet
316 |     super_data_b.parquet
317 |     data1/
318 |         data1_a.parquet
319 |         data1_b.parquet
320 |         sub_data1_1/
321 |             sub_data1_a.parquet
322 |             sub_data1_b.parquet
323 |     data2/
324 |         data2_a.parquet
325 |         data2_b.parquet
326 | test cases:
327 | get_output_paths(OutputPathSpec::new().inputs(vec!["./root"]))
328 | get_output_paths(OutputPathSpec::new().inputs(vec!["./root"]).tree(true))
329 | get_output_paths(OutputPathSpec::new().inputs(vec!["./root"]).output_dir("./root"))
330 | get_output_paths(OutputPathSpec::new().inputs(vec!["./root"]).output_dir("./root").tree(true))
331 | get_output_paths(OutputPathSpec::new().inputs(vec!["./root"]).output_dir("./other_root"))
332 | get_output_paths(OutputPathSpec::new().inputs(vec!["./root"]).output_dir("./other_root").tree(true))
333 | 
334 | get_output_paths(OutputPathSpec::new().inputs(vec!["./root/data1"]))
335 | get_output_paths(OutputPathSpec::new().inputs(vec!["./root/data1"]).tree(true))
336 | get_output_paths(OutputPathSpec::new().inputs(vec!["./root/data1"]).output_dir("./root"))
337 | get_output_paths(OutputPathSpec::new().inputs(vec!["./root/data1"]).output_dir("./root").tree(true))
338 | get_output_paths(OutputPathSpec::new().inputs(vec!["./root/data1"]).output_dir("./other_root"))
339 | get_output_paths(OutputPathSpec::new().inputs(vec!["./root/data1"]).output_dir("./other_root").tree(true))
340 | 
341 | get_output_paths(OutputPathSpec::new().inputs(vec!["./root/data1", "./root/data2"]))
342 | get_output_paths(OutputPathSpec::new().inputs(vec!["./root/data1", "./root/data2"]).tree(true))
343 | get_output_paths(OutputPathSpec::new().inputs(vec!["./root/data1", "./root/data2"]).output_dir("./root"))
344 | get_output_paths(OutputPathSpec::new().inputs(vec!["./root/data1", "./root/data2"]).output_dir("./root").tree(true))
345 | get_output_paths(OutputPathSpec::new().inputs(vec!["./root/data1", "./root/data2"]).output_dir("./other_root"))
346 | get_output_paths(OutputPathSpec::new().inputs(vec!["./root/data1", "./root/data2"]).output_dir("./other_root").tree(true))
347 | 
348 | get_output_paths(OutputPathSpec::new().inputs(vec!["./root/data1/data1_a.parquet", "./root/super_data_a.parquet"]))
349 | get_output_paths(OutputPathSpec::new().inputs(vec!["./root/data1/data1_a.parquet", "./root/super_data_a.parquet"]).tree(true))
350 | get_output_paths(OutputPathSpec::new().inputs(vec!["./root/data1/data1_a.parquet", "./root/super_data_a.parquet"]).output_dir("./root"))
351 | get_output_paths(OutputPathSpec::new().inputs(vec!["./root/data1/data1_a.parquet", "./root/super_data_a.parquet"]).output_dir("./root").tree(true))
352 | get_output_paths(OutputPathSpec::new().inputs(vec!["./root/data1/data1_a.parquet", "./root/super_data_a.parquet"]).output_dir("./other_root"))
353 | get_output_paths(OutputPathSpec::new().inputs(vec!["./root/data1/data1_a.parquet", "./root/super_data_a.parquet"]).output_dir("./other_root").tree(true))
354 | */
355 | #[cfg(test)]
356 | mod tests {
357 |     use super::*;
358 |     use std::fs::{self, File};
359 |     use tempfile::TempDir;
360 | 
361 |     fn create_test_file_tree() -> Result<TempDir, TblError> {
362 |         let temp_dir = TempDir::new()?;
363 |         println!("Created temporary directory: {:?}", temp_dir.path());
364 |         let root = temp_dir.path().join("root");
365 | 
366 |         fs::create_dir(&root)?;
367 |         File::create(root.join("super_data_a.parquet"))?;
368 |         File::create(root.join("super_data_b.parquet"))?;
369 | 
370 |         let data1 = root.join("data1");
371 |         fs::create_dir(&data1)?;
372 |         File::create(data1.join("data1_a.parquet"))?;
373 |         File::create(data1.join("data1_b.parquet"))?;
374 | 
375 |         let sub_data1_1 = data1.join("sub_data1_1");
376 |         fs::create_dir(&sub_data1_1)?;
377 |         File::create(sub_data1_1.join("sub_data1_a.parquet"))?;
378 |         File::create(sub_data1_1.join("sub_data1_b.parquet"))?;
379 | 
380 |         let data2 = root.join("data2");
381 |         fs::create_dir(&data2)?;
382 |         File::create(data2.join("data2_a.parquet"))?;
383 |         File::create(data2.join("data2_b.parquet"))?;
384 | 
385 |         Ok(temp_dir)
386 |     }
387 | 
388 |     struct TestCase {
389 |         name: &'static str,
390 |         spec: OutputPathSpec,
391 |         expected_outputs: Vec<&'static str>,
392 |     }
393 | 
394 |     macro_rules! generate_tests {
395 |         ($($name:ident: $value:expr,)*) => {
396 |             $(
397 |                 #[test]
398 |                 fn $name() -> Result<(), TblError> {
399 |                     let test_case: TestCase = $value;
400 |                     let mut spec = test_case.spec;
401 | 
402 |                     // Create temporary directory and add its path to inputs and output_dir
403 |                     let temp_dir = create_test_file_tree()?;
404 |                     let temp_path = temp_dir.path().to_path_buf();
405 | 
406 |                     // Update inputs with temporary directory path
407 |                     if let Some(inputs) = spec.inputs.as_ref() {
408 |                         spec.inputs = Some(inputs.iter().map(|p| temp_path.join(p)).collect());
409 |                     } else {
410 |                         spec.inputs = Some(vec![temp_path.join("root")]);
411 |                     }
412 | 
413 |                     // Update output_dir with temporary directory path if it exists
414 |                     if let Some(output_dir) = spec.output_dir.as_ref() {
415 |                         spec.output_dir = Some(temp_path.join(output_dir));
416 |                     }
417 | 
418 |                     let (_inputs, outputs) = match get_output_paths(spec) {
419 |                         Ok((inputs, outputs)) => (inputs, outputs),
420 |                         Err(e) => return Err(TblError::Error(format!("{}", e))),
421 |                     };
422 | 
423 |                     let expected_outputs: Vec<PathBuf> = test_case.expected_outputs
424 |                         .into_iter()
425 |                         .map(|p| temp_dir.path().join(p))
426 |                         .collect();
427 | 
428 |                     let mut sorted_outputs = outputs.clone();
429 |                     sorted_outputs.sort();
430 |                     let mut sorted_expected_outputs = expected_outputs.clone();
431 |                     sorted_expected_outputs.sort();
432 |                     assert_eq!(
433 |                         sorted_outputs,
434 |                         sorted_expected_outputs,
435 |                         "Test case '{}' failed.\nExpected (sorted): {:?}\nGot (sorted): {:?}",
436 |                         test_case.name,
437 |                         sorted_expected_outputs,
438 |                         sorted_outputs
439 |                     );
440 | 
441 |                     Ok(())
442 |                 }
443 |             )*
444 |         }
445 |     }
446 | 
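
Each `name: TestCase { .. },` entry handed to generate_tests! below becomes a standalone #[test] fn: it rebuilds the temp tree, rebases the spec's relative inputs and output_dir onto the temp path, runs get_output_paths, and compares sorted outputs. Roughly, one entry expands to (a sketch, not the literal expansion):

    // #[test]
    // fn test_root_input() -> Result<(), TblError> {
    //     let test_case: TestCase = /* the TestCase literal */;
    //     // create temp tree; rebase spec.inputs / spec.output_dir onto it;
    //     // run get_output_paths; sort both sides; assert_eq!
    // }
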
447 |     generate_tests! {
448 |         test_root_input: TestCase {
449 |             name: "Root input",
450 |             spec: OutputPathSpec::new().inputs(vec!["root"]),
451 |             expected_outputs: vec![
452 |                 "root/super_data_a.parquet",
453 |                 "root/super_data_b.parquet",
454 |             ],
455 |         },
456 |         test_root_input_tree: TestCase {
457 |             name: "Root input with tree",
458 |             spec: OutputPathSpec::new().inputs(vec!["root"]).tree(true),
459 |             expected_outputs: vec![
460 |                 "root/super_data_a.parquet",
461 |                 "root/super_data_b.parquet",
462 |                 "root/data1/data1_a.parquet",
463 |                 "root/data1/data1_b.parquet",
464 |                 "root/data1/sub_data1_1/sub_data1_a.parquet",
465 |                 "root/data1/sub_data1_1/sub_data1_b.parquet",
466 |                 "root/data2/data2_a.parquet",
467 |                 "root/data2/data2_b.parquet",
468 |             ],
469 |         },
470 |         test_root_input_self_output_dir: TestCase {
471 |             name: "Root input with self output dir",
472 |             spec: OutputPathSpec::new().inputs(vec!["root"]).output_dir("root"),
473 |             expected_outputs: vec![
474 |                 "root/super_data_a.parquet",
475 |                 "root/super_data_b.parquet",
476 |             ],
477 |         },
478 |         test_root_input_self_output_dir_tree: TestCase {
479 |             name: "Root input with self output dir tree",
480 |             spec: OutputPathSpec::new().inputs(vec!["root"]).output_dir("root").tree(true),
481 |             expected_outputs: vec![
482 |                 "root/super_data_a.parquet",
483 |                 "root/super_data_b.parquet",
484 |                 "root/data1/data1_a.parquet",
485 |                 "root/data1/data1_b.parquet",
486 |                 "root/data1/sub_data1_1/sub_data1_a.parquet",
487 |                 "root/data1/sub_data1_1/sub_data1_b.parquet",
488 |                 "root/data2/data2_a.parquet",
489 |                 "root/data2/data2_b.parquet",
490 |             ],
491 |         },
492 |         test_root_input_output_dir: TestCase {
493 |             name: "Root input with other output dir",
494 |             spec: OutputPathSpec::new().inputs(vec!["root"]).output_dir("other_root"),
495 |             expected_outputs: vec![
496 |                 "other_root/super_data_a.parquet",
497 |                 "other_root/super_data_b.parquet",
498 |             ],
499 |         },
500 |         test_root_input_output_dir_tree: TestCase {
501 |             name: "Root input with other output dir tree",
502 |             spec: OutputPathSpec::new().inputs(vec!["root"]).output_dir("other_root").tree(true),
503 |             expected_outputs: vec![
504 |                 "other_root/super_data_a.parquet",
505 |                 "other_root/super_data_b.parquet",
506 |                 "other_root/data1/data1_a.parquet",
507 |                 "other_root/data1/data1_b.parquet",
508 |                 "other_root/data1/sub_data1_1/sub_data1_a.parquet",
509 |                 "other_root/data1/sub_data1_1/sub_data1_b.parquet",
510 |                 "other_root/data2/data2_a.parquet",
511 |                 "other_root/data2/data2_b.parquet",
512 |             ],
513 |         },
514 | 
515 |         test_data1_input: TestCase {
516 |             name: "Data1 input",
517 |             spec: OutputPathSpec::new().inputs(vec!["root/data1"]),
518 |             expected_outputs: vec![
519 |                 "root/data1/data1_a.parquet",
520 |                 "root/data1/data1_b.parquet",
521 |             ],
522 |         },
523 |         test_data1_input_tree: TestCase {
524 |             name: "Data1 input with tree",
525 |             spec: OutputPathSpec::new().inputs(vec!["root/data1"]).tree(true),
526 |             expected_outputs: vec![
527 |                 "root/data1/data1_a.parquet",
528 |                 "root/data1/data1_b.parquet",
529 |                 "root/data1/sub_data1_1/sub_data1_a.parquet",
530 |                 "root/data1/sub_data1_1/sub_data1_b.parquet",
531 |             ],
532 |         },
533 |         test_data1_input_root_output: TestCase {
534 |             name: "Data1 input with root output",
535 |             spec: OutputPathSpec::new().inputs(vec!["root/data1"]).output_dir("root"),
536 |             expected_outputs: vec![
537 |                 "root/data1_a.parquet",
538 |                 "root/data1_b.parquet",
539 |             ],
540 |         },
541 |         test_data1_input_root_output_tree: TestCase {
542 |             name: "Data1 input with root output and tree",
543 |             spec: OutputPathSpec::new().inputs(vec!["root/data1"]).output_dir("root").tree(true),
544 |             expected_outputs: vec![
545 |                 "root/data1_a.parquet",
546 |                 "root/data1_b.parquet",
547 |                 "root/sub_data1_1/sub_data1_a.parquet",
548 |                 "root/sub_data1_1/sub_data1_b.parquet",
549 |             ],
550 |         },
551 |         test_data1_input_other_output: TestCase {
552 |             name: "Data1 input with other output",
553 |             spec: OutputPathSpec::new().inputs(vec!["root/data1"]).output_dir("other_root"),
554 |             expected_outputs: vec![
555 |                 "other_root/data1_a.parquet",
556 |                 "other_root/data1_b.parquet",
557 |             ],
558 |         },
559 |         test_data1_input_other_output_tree: TestCase {
560 |             name: "Data1 input with other output and tree",
561 |             spec: OutputPathSpec::new().inputs(vec!["root/data1"]).output_dir("other_root").tree(true),
562 |             expected_outputs: vec![
563 |                 "other_root/data1_a.parquet",
564 |                 "other_root/data1_b.parquet",
565 |                 "other_root/sub_data1_1/sub_data1_a.parquet",
566 |                 "other_root/sub_data1_1/sub_data1_b.parquet",
567 |             ],
568 |         },
569 |         test_data1_data2_input: TestCase {
570 |             name: "Data1 and Data2 input",
571 |             spec: OutputPathSpec::new().inputs(vec!["root/data1", "root/data2"]),
572 |             expected_outputs: vec![
573 |                 "root/data1/data1_a.parquet",
574 |                 "root/data1/data1_b.parquet",
575 |                 "root/data2/data2_a.parquet",
576 |                 "root/data2/data2_b.parquet",
577 |             ],
578 |         },
579 |         test_data1_data2_input_tree: TestCase {
580 |             name: "Data1 and Data2 input with tree",
581 |             spec: OutputPathSpec::new().inputs(vec!["root/data1", "root/data2"]).tree(true),
582 |             expected_outputs: vec![
583 |                 "root/data1/data1_a.parquet",
584 |                 "root/data1/data1_b.parquet",
585 |                 "root/data1/sub_data1_1/sub_data1_a.parquet",
586 |                 "root/data1/sub_data1_1/sub_data1_b.parquet",
587 |                 "root/data2/data2_a.parquet",
588 |                 "root/data2/data2_b.parquet",
589 |             ],
590 |         },
591 |         test_data1_data2_input_root_output: TestCase {
592 |             name: "Data1 and Data2 input with root output",
593 |             spec: OutputPathSpec::new().inputs(vec!["root/data1", "root/data2"]).output_dir("root"),
594 |             expected_outputs: vec![
595 |                 "root/data1_a.parquet",
596 |                 "root/data1_b.parquet",
597 |                 "root/data2_a.parquet",
598 |                 "root/data2_b.parquet",
599 |             ],
600 |         },
601 |         test_data1_data2_input_root_output_tree: TestCase {
602 |             name: "Data1 and Data2 input with root output and tree",
603 |             spec: OutputPathSpec::new().inputs(vec!["root/data1", "root/data2"]).output_dir("root").tree(true),
604 |             expected_outputs: vec![
605 |                 "root/data1_a.parquet",
606 |                 "root/data1_b.parquet",
607 |                 "root/sub_data1_1/sub_data1_a.parquet",
608 |                 "root/sub_data1_1/sub_data1_b.parquet",
609 |                 "root/data2_a.parquet",
610 |                 "root/data2_b.parquet",
611 |             ],
612 |         },
613 |         test_data1_data2_input_other_output: TestCase {
614 |             name: "Data1 and Data2 input with other output",
615 |             spec: OutputPathSpec::new().inputs(vec!["root/data1", "root/data2"]).output_dir("other_root"),
616 |             expected_outputs: vec![
617 |                 "other_root/data1_a.parquet",
618 |                 "other_root/data1_b.parquet",
619 |                 "other_root/data2_a.parquet",
620 |                 "other_root/data2_b.parquet",
621 |             ],
622 |         },
623 |         test_data1_data2_input_other_output_tree: TestCase {
624 |             name: "Data1 and Data2 input with other output and tree",
625 |             spec: OutputPathSpec::new().inputs(vec!["root/data1", "root/data2"]).output_dir("other_root").tree(true),
626 |             expected_outputs: vec![
627 |                 "other_root/data1_a.parquet",
628 |                 "other_root/data1_b.parquet",
629 |                 "other_root/sub_data1_1/sub_data1_a.parquet",
630 |                 "other_root/sub_data1_1/sub_data1_b.parquet",
"other_root/data2_a.parquet", 632 | "other_root/data2_b.parquet", 633 | ], 634 | }, 635 | test_specific_files_input: TestCase { 636 | name: "Specific files input", 637 | spec: OutputPathSpec::new().inputs(vec!["root/data1/data1_a.parquet", "root/super_data_a.parquet"]), 638 | expected_outputs: vec![ 639 | "root/data1/data1_a.parquet", 640 | "root/super_data_a.parquet", 641 | ], 642 | }, 643 | test_specific_files_input_tree: TestCase { 644 | name: "Specific files input with tree", 645 | spec: OutputPathSpec::new().inputs(vec!["root/data1/data1_a.parquet", "root/super_data_a.parquet"]).tree(true), 646 | expected_outputs: vec![ 647 | "root/data1/data1_a.parquet", 648 | "root/super_data_a.parquet", 649 | ], 650 | }, 651 | test_specific_files_input_root_output: TestCase { 652 | name: "Specific files input with root output", 653 | spec: OutputPathSpec::new().inputs(vec!["root/data1/data1_a.parquet", "root/super_data_a.parquet"]).output_dir("root"), 654 | expected_outputs: vec![ 655 | "root/data1_a.parquet", 656 | "root/super_data_a.parquet", 657 | ], 658 | }, 659 | test_specific_files_input_root_output_tree: TestCase { 660 | name: "Specific files input with root output and tree", 661 | spec: OutputPathSpec::new().inputs(vec!["root/data1/data1_a.parquet", "root/super_data_a.parquet"]).output_dir("root").tree(true), 662 | expected_outputs: vec![ 663 | "root/data1_a.parquet", 664 | "root/super_data_a.parquet", 665 | ], 666 | }, 667 | test_specific_files_input_other_output: TestCase { 668 | name: "Specific files input with other output", 669 | spec: OutputPathSpec::new().inputs(vec!["root/data1/data1_a.parquet", "root/super_data_a.parquet"]).output_dir("other_root"), 670 | expected_outputs: vec![ 671 | "other_root/data1_a.parquet", 672 | "other_root/super_data_a.parquet", 673 | ], 674 | }, 675 | test_specific_files_input_other_output_tree: TestCase { 676 | name: "Specific files input with other output and tree", 677 | spec: OutputPathSpec::new().inputs(vec!["root/data1/data1_a.parquet", "root/super_data_a.parquet"]).output_dir("other_root").tree(true), 678 | expected_outputs: vec![ 679 | "other_root/data1_a.parquet", 680 | "other_root/super_data_a.parquet", 681 | ], 682 | }, 683 | 684 | } 685 | } 686 | --------------------------------------------------------------------------------