├── .gitignore ├── crates ├── tbl-cli │   ├── src │   │   ├── cli │   │   │   ├── mod.rs │   │   │   ├── subcommands │   │   │   │   ├── mod.rs │   │   │   │   ├── schemas.rs │   │   │   │   ├── ls.rs │   │   │   │   ├── data.rs │   │   │   │   └── schema.rs │   │   │   └── args.rs │   │   ├── main.rs │   │   ├── python.rs │   │   ├── types.rs │   │   ├── styles.rs │   │   ├── summary.rs │   │   ├── output.rs │   │   └── transform.rs │   ├── Cargo.toml │   └── build.rs └── tbl-core │   ├── src │   ├── filesystem │   │   ├── mod.rs │   │   ├── sizes.rs │   │   ├── inputs.rs │   │   ├── manipulate.rs │   │   ├── gather.rs │   │   └── outputs.rs │   ├── parquet │   │   ├── parquet_scan.rs │   │   ├── mod.rs │   │   ├── parquet_merge.rs │   │   ├── parquet_cast.rs │   │   ├── parquet_drop.rs │   │   ├── parquet_summary.rs │   │   └── parquet_insert.rs │   ├── lib.rs │   ├── types.rs │   └── formats.rs │   └── Cargo.toml ├── Cargo.toml ├── LICENSE-MIT ├── LICENSE-APACHE └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | TODO.md 3 | -------------------------------------------------------------------------------- /crates/tbl-cli/src/cli/mod.rs: -------------------------------------------------------------------------------- 1 | mod args; 2 | mod subcommands; 3 | 4 | pub(crate) use args::*; 5 | -------------------------------------------------------------------------------- /crates/tbl-cli/src/cli/subcommands/mod.rs: -------------------------------------------------------------------------------- 1 | mod data; 2 | pub(crate) use data::*; 3 | 4 | mod ls; 5 | pub(crate) use ls::*; 6 | 7 | mod schema; 8 | pub(crate) use schema::*; 9 | 10 | mod schemas; 11 | pub(crate) use schemas::*; 12 | -------------------------------------------------------------------------------- /crates/tbl-cli/src/cli/subcommands/schemas.rs: -------------------------------------------------------------------------------- 1 | use crate::{SchemasArgs, TblCliError}; 2 | 3 | pub(crate) async fn schemas_command(_args: SchemasArgs) -> Result<(), TblCliError> { 4 | println!("[not implemented yet]"); 5 | Ok(()) 6 | } 7 | -------------------------------------------------------------------------------- /crates/tbl-core/src/filesystem/mod.rs: -------------------------------------------------------------------------------- 1 | /// path gathering functions 2 | pub mod gather; 3 | pub use gather::*; 4 | 5 | /// path input functions 6 | pub mod inputs; 7 | pub use inputs::*; 8 | 9 | /// path manipulate functions 10 | pub mod manipulate; 11 | pub use manipulate::*; 12 | 13 | /// path outputs functions 14 | pub mod outputs; 15 | pub use outputs::*; 16 | 17 | /// path size 18 | pub mod sizes; 19 | pub use sizes::*; 20 | -------------------------------------------------------------------------------- /crates/tbl-core/src/parquet/parquet_scan.rs: -------------------------------------------------------------------------------- 1 | use crate::TblError; 2 | use polars::prelude::*; 3 | use std::path::PathBuf; 4 | 5 | /// create lazy frame by scanning input paths 6 | pub fn create_lazyframe(paths: &[PathBuf]) -> Result<LazyFrame, TblError> { 7 | let scan_args = polars::prelude::ScanArgsParquet::default(); 8 | let arc_paths = Arc::from(paths.to_vec().into_boxed_slice()); 9 | Ok(LazyFrame::scan_parquet_files(arc_paths, scan_args)?) 10 | } 11 | -------------------------------------------------------------------------------- /crates/tbl-core/src/lib.rs: -------------------------------------------------------------------------------- 1 | //!
utilities for reading and editing tabular files 2 | 3 | #![allow(dead_code)] 4 | #![warn(missing_docs, unreachable_pub, unused_crate_dependencies)] 5 | #![deny(unused_must_use, rust_2018_idioms)] 6 | #![doc(test( 7 | no_crate_inject, 8 | attr(deny(warnings, rust_2018_idioms), allow(dead_code, unused_variables)) 9 | ))] 10 | 11 | /// filesystem utilities 12 | pub mod filesystem; 13 | 14 | /// parquet utilities 15 | pub mod parquet; 16 | 17 | /// types 18 | pub mod types; 19 | 20 | /// formats 21 | pub mod formats; 22 | 23 | pub use types::*; 24 | -------------------------------------------------------------------------------- /crates/tbl-core/src/parquet/mod.rs: -------------------------------------------------------------------------------- 1 | /// parquet summary functions 2 | pub mod parquet_summary; 3 | pub use parquet_summary::*; 4 | 5 | /// parquet drop functions 6 | pub mod parquet_drop; 7 | pub use parquet_drop::*; 8 | 9 | /// parquet cast functions 10 | pub mod parquet_cast; 11 | pub use parquet_cast::*; 12 | 13 | /// parquet merge functions 14 | pub mod parquet_merge; 15 | pub use parquet_merge::*; 16 | 17 | /// parquet insert functions 18 | pub mod parquet_insert; 19 | pub use parquet_insert::*; 20 | 21 | /// parquet scan functions 22 | pub mod parquet_scan; 23 | pub use parquet_scan::*; 24 | -------------------------------------------------------------------------------- /crates/tbl-core/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "tbl-core" 3 | description = "utilities for reading and modifying tabular files" 4 | version.workspace = true 5 | edition.workspace = true 6 | license.workspace = true 7 | homepage.workspace = true 8 | repository.workspace = true 9 | 10 | [dependencies] 11 | arrow = { workspace = true } 12 | colored = "2.1.0" 13 | futures = "0.3.30" 14 | hex = "0.4.3" 15 | parquet = { version = "52.0.0", features = ["async"] } 16 | polars = { workspace = true } 17 | thiserror = { workspace = true } 18 | tokio = { workspace = true } 19 | 20 | [dev-dependencies] 21 | tempfile = "3.10.1" 22 | -------------------------------------------------------------------------------- /crates/tbl-cli/src/main.rs: -------------------------------------------------------------------------------- 1 | //!
utilities for reading and editing tabular files 2 | 3 | #![allow(dead_code)] 4 | #![warn(missing_docs, unreachable_pub, unused_crate_dependencies)] 5 | #![deny(unused_must_use, rust_2018_idioms)] 6 | #![doc(test( 7 | no_crate_inject, 8 | attr(deny(warnings, rust_2018_idioms), allow(dead_code, unused_variables)) 9 | ))] 10 | 11 | mod cli; 12 | pub(crate) use cli::*; 13 | 14 | pub(crate) mod styles; 15 | 16 | mod types; 17 | use types::*; 18 | 19 | mod python; 20 | 21 | mod summary; 22 | 23 | mod transform; 24 | 25 | mod output; 26 | 27 | #[tokio::main] 28 | async fn main() -> Result<(), TblCliError> { 29 | cli::run_cli().await 30 | } 31 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | 2 | [workspace] 3 | members = ["crates/tbl-core", "crates/tbl-cli"] 4 | resolver = "2" 5 | 6 | [workspace.package] 7 | version = "0.1.1" 8 | edition = "2021" 9 | license = "MIT OR Apache-2.0" 10 | homepage = "https://github.com/paradigmxyz/tbl" 11 | repository = "https://github.com/paradigmxyz/tbl" 12 | exclude = [".github/"] 13 | 14 | [workspace.dependencies] 15 | thiserror = "1.0" 16 | tokio = { version = "1.32.0", features = ["full"] } 17 | arrow = "52.0.0" 18 | polars = { version = "0.41.3", features = ["json", "parquet", "lazy", "csv", "dtype-u8", "dtype-u16", "dtype-decimal", "string_encoding", "binary_encoding", "concat_str", "replace", "strings", "streaming", "timezones"] } 19 | -------------------------------------------------------------------------------- /crates/tbl-core/src/filesystem/sizes.rs: -------------------------------------------------------------------------------- 1 | use futures::stream::{FuturesUnordered, StreamExt}; 2 | use std::path::Path; 3 | use tokio::fs; 4 | 5 | /// get total number of bytes across files 6 | pub async fn get_total_bytes_of_files(file_paths: &[&Path]) -> Result<u64, crate::TblError> { 7 | let futures = file_paths.iter().map(|path| async move { 8 | let metadata = fs::metadata(path).await?; 9 | Ok::<u64, crate::TblError>(if metadata.is_file() { 10 | metadata.len() 11 | } else { 12 | 0 13 | }) 14 | }); 15 | 16 | let mut total: u64 = 0; 17 | let mut futures: FuturesUnordered<_> = futures.collect(); 18 | while let Some(result) = futures.next().await { 19 | total += result?; 20 | } 21 | 22 | Ok(total) 23 | } 24 | -------------------------------------------------------------------------------- /crates/tbl-cli/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "tbl-cli" 3 | description = "tbl is a tool for reading and editing tabular data files" 4 | version.workspace = true 5 | edition.workspace = true 6 | license.workspace = true 7 | homepage.workspace = true 8 | repository.workspace = true 9 | 10 | [[bin]] 11 | name = "tbl" 12 | path = "src/main.rs" 13 | 14 | [dependencies] 15 | clap = { version = "4.4.8", features = ["derive"] } 16 | tokio = { workspace = true } 17 | thiserror = { workspace = true } 18 | tbl-core = { version = "0.1.0", path = "../tbl-core" } 19 | term_size = "0.3.2" 20 | polars = { workspace = true } 21 | toolstr = "0.1.5" 22 | toolstr_colored = "2.1.1" 23 | inquire = "0.7.5" 24 | anstyle = "1.0.7" 25 | color-print = "0.3.6" 26 | chrono = "0.4.38" 27 | hex = "0.4.3" 28 | 29 | [build-dependencies] 30 | built = "0.7" 31 | -------------------------------------------------------------------------------- /LICENSE-MIT: -------------------------------------------------------------------------------- 1 | The
MIT License (MIT) 2 | 3 | Copyright (c) 2024 tbl contributors 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /crates/tbl-core/src/filesystem/inputs.rs: -------------------------------------------------------------------------------- 1 | use crate::TblError; 2 | use std::path::PathBuf; 3 | 4 | /// get file paths 5 | pub fn get_input_paths( 6 | inputs: &Option<Vec<PathBuf>>, 7 | tree: bool, 8 | sort: bool, 9 | ) -> Result<Vec<PathBuf>, TblError> { 10 | // get paths 11 | let raw_paths = match inputs { 12 | Some(raw_paths) => raw_paths.to_vec(), 13 | None => vec![std::env::current_dir()?], 14 | }; 15 | 16 | // expand tree if specified 17 | let mut paths: Vec<PathBuf> = vec![]; 18 | for raw_path in raw_paths.into_iter() { 19 | if raw_path.is_dir() { 20 | let sub_paths = if tree { 21 | super::gather::get_tree_tabular_files(&raw_path)? 22 | } else { 23 | super::gather::get_directory_tabular_files(&raw_path)?
24 | }; 25 | paths.extend(sub_paths); 26 | } else if super::gather::is_tabular_file(&raw_path) { 27 | paths.push(raw_path); 28 | } else { 29 | println!("skipping non-tabular file {:?}", raw_path) 30 | } 31 | } 32 | 33 | // sort 34 | if sort { 35 | paths.sort() 36 | } 37 | 38 | Ok(paths) 39 | } 40 | -------------------------------------------------------------------------------- /crates/tbl-cli/build.rs: -------------------------------------------------------------------------------- 1 | use std::process::Command; 2 | 3 | fn main() { 4 | // Get the most recent tag 5 | let tag_output = Command::new("git") 6 | .args(["describe", "--tags", "--abbrev=0"]) 7 | .output() 8 | .expect("Failed to execute git command for tag"); 9 | 10 | let tag = String::from_utf8(tag_output.stdout) 11 | .expect("Invalid UTF-8 output from git for tag") 12 | .trim() 13 | .to_string(); 14 | 15 | // Get the git description (includes commits since tag, if any) 16 | let desc_output = Command::new("git") 17 | .args(["describe", "--always", "--dirty"]) 18 | .output() 19 | .expect("Failed to execute git command for description"); 20 | 21 | let git_description = String::from_utf8(desc_output.stdout) 22 | .expect("Invalid UTF-8 output from git for description") 23 | .trim() 24 | .to_string(); 25 | 26 | // Combine tag and description 27 | let version_string = if tag == git_description { 28 | // If they're the same, just use one 29 | tag 30 | } else { 31 | format!("{}-{}", tag, git_description) 32 | }; 33 | 34 | println!("cargo:rustc-env=GIT_DESCRIPTION={}", version_string); 35 | 36 | built::write_built_file().expect("Failed to acquire build-time information"); 37 | } 38 | -------------------------------------------------------------------------------- /crates/tbl-core/src/types.rs: -------------------------------------------------------------------------------- 1 | use thiserror::Error; 2 | 3 | /// Tbl Error 4 | #[derive(Error, Debug)] 5 | pub enum TblError { 6 | /// Error wrapper for standard IO errors. 7 | #[error(transparent)] 8 | IOError(#[from] std::io::Error), 9 | 10 | /// Error wrapper for polars errors. 11 | #[error(transparent)] 12 | PolarsError(#[from] polars::prelude::PolarsError), 13 | 14 | /// Error wrapper for parquet errors. 15 | #[error(transparent)] 16 | ParquetError(#[from] parquet::errors::ParquetError), 17 | 18 | /// Error wrapper for tokio errors. 19 | #[error(transparent)] 20 | TokioJoinError(#[from] tokio::task::JoinError), 21 | 22 | /// Error wrapper for path strip-prefix errors. 23 | #[error(transparent)] 24 | StripPrefixError(#[from] std::path::StripPrefixError), 25 | 26 | /// Error wrapper for arrow errors. 27 | #[error(transparent)] 28 | ArrowError(#[from] arrow::error::ArrowError), 29 | 30 | /// Error wrapper for schema errors. 31 | #[error("Schema error: {0}")] 32 | SchemaError(String), 33 | 34 | /// Error wrapper for input errors.
35 | #[error("Input error: {0}")] 36 | InputError(String), 37 | 38 | /// General Error 39 | #[error("Error: {0}")] 40 | Error(String), 41 | 42 | /// Error wrapper for AcquireError 43 | #[error(transparent)] 44 | TokioAcquireError(#[from] tokio::sync::AcquireError), 45 | } 46 | -------------------------------------------------------------------------------- /crates/tbl-cli/src/python.rs: -------------------------------------------------------------------------------- 1 | use crate::TblCliError; 2 | use std::path::PathBuf; 3 | use std::process::Command; 4 | 5 | pub(crate) fn load_df_interactive( 6 | paths: Vec<PathBuf>, 7 | lazy: bool, 8 | executable: Option<String>, 9 | ) -> Result<(), TblCliError> { 10 | let paths: Vec<_> = paths 11 | .iter() 12 | .map(|path| format!("'{}'", path.to_string_lossy())) 13 | .collect(); 14 | let paths_str = paths.join(",\n "); 15 | 16 | let input_word = if paths.len() == 1 { "input" } else { "inputs" }; 17 | 18 | let (pl_function, pl_variable, final_str, final_print) = if lazy { 19 | ("scan", "lf", "\\n# use `df = lf.collect()` to collect", "") 20 | } else { 21 | ("read", "df", "print(df)\\n", "\nprint(df)") 22 | }; 23 | 24 | let python_code = format!( 25 | r#" 26 | import polars as pl 27 | 28 | inputs = [ 29 | {} 30 | ] 31 | 32 | {} = pl.{}_parquet(inputs) 33 | print() 34 | print('import polars as pl') 35 | print() 36 | print('# {}ing ' + str(len(inputs)) + ' {} into {}') 37 | print('inputs = [...]') 38 | print('{} = pl.{}_parquet(inputs)') 39 | print("{}") 40 | {} 41 | "#, 42 | paths_str, 43 | pl_variable, 44 | pl_function, 45 | pl_function, 46 | input_word, 47 | pl_variable, 48 | pl_variable, 49 | pl_function, 50 | final_str, 51 | final_print, 52 | ); 53 | 54 | let executable = if let Some(executable) = executable { 55 | executable 56 | } else { 57 | "ipython".to_string() 58 | }; 59 | 60 | Command::new(executable) 61 | .arg("-i") 62 | .arg("-c") 63 | .arg(python_code) 64 | .spawn()? 65 | .wait()?; 66 | 67 | Ok(()) 68 | } 69 | -------------------------------------------------------------------------------- /crates/tbl-cli/src/types.rs: -------------------------------------------------------------------------------- 1 | use tbl_core::TblError; 2 | use thiserror::Error; 3 | 4 | #[derive(Error, Debug)] 5 | pub(crate) enum TblCliError { 6 | /// Error wrapper for standard IO errors. 7 | #[error(transparent)] 8 | IO(#[from] std::io::Error), 9 | 10 | /// Error wrapper for tbl-core errors. 11 | #[error(transparent)] 12 | Tbl(#[from] TblError), 13 | 14 | /// Error caused by arguments 15 | #[error("Argument error: {0}")] 16 | Arg(String), 17 | 18 | /// Error wrapper for path strip-prefix errors. 19 | #[error(transparent)] 20 | StripPrefix(#[from] std::path::StripPrefixError), 21 | 22 | /// Error wrapper for toolstr errors. 23 | #[error(transparent)] 24 | ToolstrError(#[from] toolstr::FormatError), 25 | 26 | /// Error wrapper for polars errors.
27 | #[error(transparent)] 28 | PolarsError(#[from] polars::prelude::PolarsError), 29 | 30 | /// Error caused by missing schema 31 | #[error("Missing schema error: {0}")] 32 | MissingSchemaError(String), 33 | 34 | /// Error parsing an int 35 | #[error(transparent)] 36 | ParseIntError(#[from] std::num::ParseIntError), 37 | 38 | /// General Error 39 | #[error("Error: {0}")] 40 | Error(String), 41 | } 42 | 43 | pub(crate) enum OutputMode { 44 | PrintToStdout, 45 | SaveToSingleFile, 46 | ModifyInplace, 47 | SaveToDirectory, 48 | Partition, 49 | InteractiveLf, 50 | InteractiveDf, 51 | } 52 | 53 | impl OutputMode { 54 | pub(crate) fn writes_to_disk(&self) -> bool { 55 | matches!( 56 | self, 57 | OutputMode::SaveToSingleFile 58 | | OutputMode::SaveToDirectory 59 | | OutputMode::ModifyInplace 60 | | OutputMode::Partition 61 | ) 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /crates/tbl-core/src/filesystem/manipulate.rs: -------------------------------------------------------------------------------- 1 | use crate::TblError; 2 | use std::path::{Component, Path, PathBuf}; 3 | 4 | /// get common prefix of paths 5 | pub fn get_common_prefix(paths: &[PathBuf]) -> Result<PathBuf, TblError> { 6 | if paths.is_empty() { 7 | return Err(TblError::InputError("no paths given".to_string())); 8 | } 9 | 10 | let mut components_iter = paths.iter().map(|p| p.components()); 11 | let mut common_components: Vec<Component<'_>> = components_iter 12 | .next() 13 | .ok_or(TblError::Error( 14 | "cannot parse common path components".to_string(), 15 | ))? 16 | .collect(); 17 | 18 | for components in components_iter { 19 | common_components = common_components 20 | .iter() 21 | .zip(components) 22 | .take_while(|(a, b)| a == &b) 23 | .map(|(a, _)| *a) 24 | .collect(); 25 | } 26 | 27 | Ok(common_components.iter().collect()) 28 | } 29 | 30 | /// convert file path to new input 31 | pub fn convert_file_path( 32 | input: &Path, 33 | output_dir: &Option<PathBuf>, 34 | file_prefix: &Option<String>, 35 | file_postfix: &Option<String>, 36 | ) -> Result<PathBuf, TblError> { 37 | // change output directory 38 | let output = match output_dir.as_ref() { 39 | Some(output_dir) => { 40 | let file_name = input 41 | .file_name() 42 | .ok_or_else(|| TblError::Error("Invalid input path".to_string()))?; 43 | output_dir.join(file_name) 44 | } 45 | None => input.to_path_buf(), 46 | }; 47 | 48 | if file_prefix.is_some() || file_postfix.is_some() { 49 | let stem = output 50 | .file_stem() 51 | .ok_or_else(|| TblError::Error("Invalid output path".to_string()))?; 52 | let extension = output.extension(); 53 | 54 | let new_filename = format!( 55 | "{}{}{}{}", 56 | file_prefix.as_deref().unwrap_or(""), 57 | stem.to_string_lossy(), 58 | file_postfix.as_deref().unwrap_or(""), 59 | extension.map_or_else(String::new, |ext| format!(".{}", ext.to_string_lossy())) 60 | ); 61 | 62 | Ok(output.with_file_name(new_filename)) 63 | } else { 64 | Ok(output) 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /crates/tbl-core/src/filesystem/gather.rs: -------------------------------------------------------------------------------- 1 | use crate::TblError; 2 | use futures::stream::StreamExt; 3 | use std::path::{Path, PathBuf}; 4 | 5 | /// return tabular file paths within directory 6 | pub fn get_directory_tabular_files(dir_path: &Path) -> Result<Vec<PathBuf>, TblError> { 7 | let mut tabular_files = Vec::new(); 8 | 9 | for entry in std::fs::read_dir(dir_path)?
{ 10 | let entry = entry?; 11 | let path = entry.path(); 12 | 13 | if path.is_file() && is_tabular_file(&path) { 14 | tabular_files.push(path); 15 | } 16 | } 17 | 18 | Ok(tabular_files) 19 | } 20 | 21 | /// get tabular files inside directory tree 22 | pub fn get_tree_tabular_files(dir_path: &std::path::Path) -> Result<Vec<PathBuf>, TblError> { 23 | let mut tabular_files = Vec::new(); 24 | for entry in std::fs::read_dir(dir_path)? { 25 | let entry = entry?; 26 | let path = entry.path(); 27 | if path.is_file() && is_tabular_file(&path) { 28 | tabular_files.push(path); 29 | } else if path.is_dir() { 30 | let sub_dir_files = get_tree_tabular_files(&path)?; 31 | tabular_files.extend(sub_dir_files); 32 | } 33 | } 34 | Ok(tabular_files) 35 | } 36 | 37 | /// return true if file_path has a tabular extension 38 | pub fn is_tabular_file(file_path: &std::path::Path) -> bool { 39 | // let tabular_extensions = ["parquet", "csv"]; 40 | let tabular_extensions = ["parquet"]; 41 | 42 | if let Some(extension) = file_path.extension() { 43 | let extension = extension.to_string_lossy().to_string(); 44 | tabular_extensions.contains(&extension.as_str()) 45 | } else { 46 | false 47 | } 48 | } 49 | 50 | /// count number of existing files 51 | pub async fn count_existing_files(paths: &[PathBuf]) -> usize { 52 | const CONCURRENT_LIMIT: usize = 1000; // Adjust based on your system's capabilities 53 | 54 | futures::stream::iter(paths) 55 | .map(tokio::fs::metadata) 56 | .buffer_unordered(CONCURRENT_LIMIT) 57 | .filter_map(|result| async move { 58 | match result { 59 | Ok(metadata) => Some(metadata.is_file()), 60 | Err(_) => None, 61 | } 62 | }) 63 | .fold(0, |acc, is_file| async move { 64 | if is_file { 65 | acc + 1 66 | } else { 67 | acc 68 | } 69 | }) 70 | .await 71 | } 72 | -------------------------------------------------------------------------------- /crates/tbl-core/src/parquet/parquet_merge.rs: -------------------------------------------------------------------------------- 1 | use crate::TblError; 2 | use futures::StreamExt; 3 | use parquet::arrow::arrow_writer::ArrowWriter; 4 | use parquet::arrow::async_reader::ParquetRecordBatchStreamBuilder; 5 | use parquet::file::properties::WriterProperties; 6 | use std::io::BufWriter as StdBufWriter; 7 | use std::path::PathBuf; 8 | use tokio::fs::File; 9 | use tokio::io::AsyncWriteExt; 10 | 11 | /// merge parquet files into one 12 | pub async fn merge_parquets( 13 | input_paths: &Vec<PathBuf>, 14 | output_path: &PathBuf, 15 | batch_size: usize, 16 | ) -> Result<(), crate::TblError> { 17 | if input_paths.is_empty() { 18 | return Err(crate::TblError::Error( 19 | "No input files provided".to_string(), 20 | )); 21 | } 22 | 23 | let tmp_output_path = super::parquet_drop::create_tmp_target(output_path.as_path()); 24 | let mut output_file = File::create(&tmp_output_path).await?; 25 | let mut buffer = Vec::new(); 26 | 27 | // Read the schema from the first file 28 | let first_file = File::open(&input_paths[0]).await?; 29 | let builder = ParquetRecordBatchStreamBuilder::new(first_file) 30 | .await? 31 | .with_batch_size(batch_size); 32 | let schema = builder.schema().clone(); 33 | 34 | let writer_props = WriterProperties::builder().build(); 35 | let mut arrow_writer = ArrowWriter::try_new( 36 | StdBufWriter::new(&mut buffer), 37 | schema.clone(), 38 | Some(writer_props), 39 | )?; 40 | 41 | for input_path in input_paths { 42 | let input_file = File::open(input_path).await?; 43 | let builder = ParquetRecordBatchStreamBuilder::new(input_file) 44 | .await?
45 | .with_batch_size(batch_size); 46 | let mut reader_stream = builder.build()?; 47 | 48 | // Verify that the schema matches 49 | if reader_stream.schema() != &schema { 50 | println!("SCHEMA OF {}:", input_paths[0].to_string_lossy()); 51 | println!("{:?}", schema); 52 | println!(); 53 | println!("SCHEMA OF {}:", input_path.to_string_lossy()); 54 | println!("{:?}", reader_stream.schema()); 55 | return Err(TblError::SchemaError( 56 | "schemas of files are not equal".to_string(), 57 | )); 58 | } 59 | 60 | while let Some(batch) = reader_stream.next().await { 61 | let batch = batch?; 62 | arrow_writer.write(&batch)?; 63 | } 64 | } 65 | 66 | arrow_writer.close()?; 67 | output_file.write_all(&buffer).await?; 68 | output_file.flush().await?; 69 | tokio::fs::rename(tmp_output_path, output_path).await?; 70 | 71 | Ok(()) 72 | } 73 | -------------------------------------------------------------------------------- /crates/tbl-core/src/parquet/parquet_cast.rs: -------------------------------------------------------------------------------- 1 | use polars::prelude::*; 2 | use std::collections::HashMap; 3 | use std::path::PathBuf; 4 | // use arrow::datatypes::Schema as ArrowSchema; 5 | // use arrow::record_batch::RecordBatch; 6 | // use parquet::arrow::arrow_writer::ArrowWriter; 7 | // use parquet::file::properties::WriterProperties; 8 | // use std::io::BufWriter; 9 | // use std::sync::Arc; 10 | // use tokio::fs::File; 11 | // use tokio::io::AsyncWriteExt; 12 | use crate::types::TblError; 13 | 14 | /// cast columns of parquet file to new type 15 | pub async fn cast_parquet_columns( 16 | _input_path: PathBuf, 17 | _output_path: PathBuf, 18 | _columns_to_cast: HashMap<String, DataType>, 19 | _batch_size: usize, 20 | ) -> Result<(), crate::TblError> { 21 | Err(TblError::Error("not implemented".to_string())) 22 | // // Create a LazyFrame from the input Parquet file 23 | // let lf = LazyFrame::scan_parquet( 24 | // input_path.to_str().ok_or_else(|| crate::TblError::Error("Invalid input path".to_string()))?, 25 | // ScanArgsParquet::default() 26 | // )?; 27 | 28 | // // Apply the casts 29 | // let casted_lf = lf.with_columns( 30 | // columns_to_cast.iter().map(|(col_name, new_type)| { 31 | // col(col_name).cast(new_type.clone()) 32 | // }).collect::<Vec<_>>() 33 | // ); 34 | 35 | // // Collect the schema 36 | // let schema = casted_lf.schema().map_err(|e| crate::TblError::PolarsError(e))?; 37 | 38 | // // Create temporary output path 39 | // let tmp_output_path = super::parquet_drop::create_tmp_target(output_path.as_path()); 40 | 41 | // // Open output file 42 | // let mut output_file = File::create(&tmp_output_path).await?; 43 | 44 | // // Convert Polars schema to Arrow schema 45 | // let arrow_schema: Arc<ArrowSchema> = Arc::new(schema.to_arrow(true)); 46 | 47 | // // Set up Arrow writer 48 | // let writer_props = WriterProperties::builder().build(); 49 | // let mut buffer = Vec::new(); 50 | // let mut arrow_writer = ArrowWriter::try_new( 51 | // BufWriter::new(&mut buffer), 52 | // arrow_schema.clone(), 53 | // Some(writer_props), 54 | // )?; 55 | 56 | // // Process data in batches 57 | // let df = casted_lf.collect()?; 58 | // for batch in df.iter_chunks(false) { 59 | // let arrow_batch = RecordBatch::try_from_iter( 60 | // arrow_schema.fields().iter().zip(batch.iter()).map(|(field, array)| { 61 | // Ok((field.name().to_string(), array.clone() as Arc<dyn Array>)) 62 | // }) 63 | // )?; 64 | // arrow_writer.write(&arrow_batch)?; 65 | // } 66 | 67 | // // Finish writing 68 | // arrow_writer.close()?; 69 | // output_file.write_all(&buffer).await?; 70 | //
output_file.flush().await?; 71 | 72 | // // Rename temporary file to final output file 73 | // std::fs::rename(tmp_output_path, output_path)?; 74 | 75 | // Ok(()) 76 | } 77 | -------------------------------------------------------------------------------- /crates/tbl-cli/src/cli/subcommands/ls.rs: -------------------------------------------------------------------------------- 1 | use crate::{LsArgs, TblCliError}; 2 | use toolstr::Colorize; 3 | 4 | pub(crate) async fn ls_command(ls_args: LsArgs) -> Result<(), TblCliError> { 5 | // get paths 6 | let paths = tbl_core::filesystem::get_input_paths(&ls_args.paths, ls_args.tree, true)?; 7 | 8 | if paths.is_empty() { 9 | println!("[no tabular paths]"); 10 | return Ok(()); 11 | } 12 | 13 | // print file names 14 | print_file_names(&paths, ls_args.n, ls_args.absolute)?; 15 | 16 | // print stats 17 | print_stats(&paths).await?; 18 | 19 | Ok(()) 20 | } 21 | 22 | fn print_file_names( 23 | paths: &[std::path::PathBuf], 24 | n: Option<usize>, 25 | absolute: bool, 26 | ) -> Result<(), TblCliError> { 27 | // clear common prefix 28 | let display_paths = if absolute || (paths.len() == 1) { 29 | paths.to_vec() 30 | } else { 31 | let common_prefix = tbl_core::filesystem::get_common_prefix(paths)?; 32 | let mut new_paths = Vec::new(); 33 | for path in paths.iter() { 34 | new_paths.push(path.strip_prefix(&common_prefix)?.to_owned()) 35 | } 36 | new_paths 37 | }; 38 | 39 | // decide number of files to print 40 | let n_print = match n { 41 | Some(n) => n, 42 | None => { 43 | if let Some((_, height)) = term_size::dimensions() { 44 | if height >= 5 { 45 | height - 4 46 | } else { 47 | 1 48 | } 49 | } else { 50 | 100 51 | } 52 | } 53 | }; 54 | 55 | // print out file names or paths 56 | for path in display_paths.iter().take(n_print) { 57 | println!("{}", path.to_string_lossy().purple()) 58 | } 59 | if n_print < paths.len() { 60 | println!( 61 | "{}", 62 | format!( 63 | "...
{} files not shown", 64 | tbl_core::formats::format_with_commas((paths.len() - n_print) as u64).bold() 65 | ) 66 | .truecolor(150, 150, 150) 67 | ); 68 | } 69 | 70 | Ok(()) 71 | } 72 | 73 | async fn print_stats(paths: &[std::path::PathBuf]) -> Result<(), TblCliError> { 74 | // get total file size 75 | let mut total_size: u64 = 0; 76 | for path in paths.iter() { 77 | let metadata = std::fs::metadata(path)?; 78 | total_size += metadata.len(); 79 | } 80 | 81 | // get row counts 82 | let path_refs: Vec<&std::path::Path> = 83 | paths.iter().map(|path_buf| path_buf.as_path()).collect(); 84 | let row_counts = tbl_core::parquet::get_parquet_row_counts(&path_refs).await?; 85 | 86 | // print total summary 87 | println!( 88 | "{} rows stored in {} across {} tabular files", 89 | tbl_core::formats::format_with_commas(row_counts.iter().sum()) 90 | .green() 91 | .bold(), 92 | tbl_core::formats::format_bytes(total_size).green().bold(), 93 | tbl_core::formats::format_with_commas(paths.len() as u64) 94 | .green() 95 | .bold() 96 | ); 97 | 98 | Ok(()) 99 | } 100 | -------------------------------------------------------------------------------- /crates/tbl-cli/src/styles.rs: -------------------------------------------------------------------------------- 1 | use toolstr::Colorize; 2 | use toolstr_colored::ColoredString; 3 | 4 | pub(crate) fn get_styles() -> clap::builder::Styles { 5 | let white = anstyle::Color::Rgb(anstyle::RgbColor(255, 255, 255)); 6 | let green = anstyle::Color::Rgb(anstyle::RgbColor(0, 225, 0)); 7 | let grey = anstyle::Color::Rgb(anstyle::RgbColor(170, 170, 170)); 8 | let title = anstyle::Style::new().bold().fg_color(Some(green)); 9 | let arg = anstyle::Style::new().bold().fg_color(Some(white)); 10 | let comment = anstyle::Style::new().fg_color(Some(grey)); 11 | clap::builder::Styles::styled() 12 | .header(title) 13 | .error(comment) 14 | .usage(title) 15 | .literal(arg) 16 | .placeholder(comment) 17 | .valid(title) 18 | .invalid(comment) 19 | } 20 | 21 | pub(crate) trait FontStyle { 22 | fn colorize_background(self) -> ColoredString; 23 | fn colorize_title(self) -> ColoredString; 24 | fn colorize_comment(self) -> ColoredString; 25 | fn colorize_string(self) -> ColoredString; 26 | fn colorize_constant(self) -> ColoredString; 27 | fn colorize_function(self) -> ColoredString; 28 | fn colorize_variable(self) -> ColoredString; 29 | } 30 | 31 | impl FontStyle for &str { 32 | fn colorize_background(self) -> ColoredString { 33 | self.truecolor(40, 42, 54) 34 | } 35 | 36 | fn colorize_title(self) -> ColoredString { 37 | self.truecolor(206, 147, 249).bold() 38 | } 39 | 40 | fn colorize_comment(self) -> ColoredString { 41 | self.truecolor(98, 114, 164) 42 | } 43 | 44 | fn colorize_string(self) -> ColoredString { 45 | self.truecolor(241, 250, 140) 46 | } 47 | 48 | fn colorize_constant(self) -> ColoredString { 49 | self.truecolor(185, 242, 159) 50 | } 51 | 52 | fn colorize_function(self) -> ColoredString { 53 | self.truecolor(139, 233, 253) 54 | } 55 | 56 | fn colorize_variable(self) -> ColoredString { 57 | self.truecolor(100, 170, 170) 58 | } 59 | } 60 | 61 | use inquire::ui::{Attributes, Color, IndexPrefix, RenderConfig, StyleSheet, Styled}; 62 | 63 | pub(crate) fn get_render_config() -> RenderConfig<'static> { 64 | let highlight_color = Color::DarkGreen; 65 | 66 | let mut render_config = RenderConfig::default(); 67 | render_config.prompt = StyleSheet::new().with_attr(Attributes::BOLD); 68 | render_config.prompt_prefix = Styled::new("").with_fg(Color::LightRed); 69 | 
render_config.answered_prompt_prefix = Styled::new("").with_fg(Color::LightRed); 70 | render_config.placeholder = StyleSheet::new().with_fg(Color::LightRed); 71 | render_config.selected_option = Some(StyleSheet::new().with_fg(highlight_color)); 72 | render_config.highlighted_option_prefix = Styled::new("→").with_fg(highlight_color); 73 | render_config.selected_checkbox = Styled::new("☑").with_fg(highlight_color); 74 | render_config.scroll_up_prefix = Styled::new("⇞"); 75 | render_config.scroll_down_prefix = Styled::new("⇟"); 76 | render_config.unselected_checkbox = Styled::new("☐"); 77 | render_config.option_index_prefix = IndexPrefix::Simple; 78 | render_config.error_message = render_config 79 | .error_message 80 | .with_prefix(Styled::new("❌").with_fg(Color::LightRed)); 81 | render_config.answer = StyleSheet::new() 82 | .with_attr(Attributes::BOLD) 83 | .with_fg(highlight_color); 84 | let grey = Color::Rgb { 85 | r: 100, 86 | g: 100, 87 | b: 100, 88 | }; 89 | render_config.help_message = StyleSheet::new() 90 | .with_fg(grey) 91 | .with_attr(Attributes::ITALIC); 92 | 93 | render_config 94 | } 95 | -------------------------------------------------------------------------------- /crates/tbl-core/src/formats.rs: -------------------------------------------------------------------------------- 1 | use colored::Colorize; 2 | 3 | /// format bytes 4 | pub fn format_bytes(bytes: u64) -> String { 5 | let units = ["B", "KB", "MB", "GB", "TB", "PB", "EB"]; 6 | let mut size = bytes as f64; 7 | let mut unit = 0; 8 | 9 | while size >= 1024.0 && unit < units.len() - 1 { 10 | size /= 1024.0; 11 | unit += 1; 12 | } 13 | 14 | format!("{:.2} {}", size, units[unit]) 15 | } 16 | 17 | /// format number with commas 18 | pub fn format_with_commas(number: u64) -> String { 19 | let num_str = number.to_string(); 20 | let mut result = String::new(); 21 | let mut count = 0; 22 | 23 | for c in num_str.chars().rev() { 24 | if count == 3 { 25 | result.push(','); 26 | count = 0; 27 | } 28 | result.push(c); 29 | count += 1; 30 | } 31 | 32 | result.chars().rev().collect() 33 | } 34 | 35 | const TITLE_R: u8 = 0; 36 | const TITLE_G: u8 = 225; 37 | const TITLE_B: u8 = 0; 38 | const ERROR_R: u8 = 225; 39 | const ERROR_G: u8 = 0; 40 | const ERROR_B: u8 = 0; 41 | 42 | /// print header 43 | pub fn print_header<A: AsRef<str>>(header: A) { 44 | let header_str = header.as_ref().white().bold(); 45 | let underline = "─" 46 | .repeat(header_str.len()) 47 | .truecolor(TITLE_R, TITLE_G, TITLE_B); 48 | println!("{}", header_str); 49 | println!("{}", underline); 50 | } 51 | 52 | /// print header error 53 | pub fn print_header_error<A: AsRef<str>>(header: A) { 54 | let header_str = header.as_ref().white().bold(); 55 | let underline = "─" 56 | .repeat(header_str.len()) 57 | .truecolor(ERROR_R, ERROR_G, ERROR_B); 58 | println!("{}", header_str); 59 | println!("{}", underline); 60 | } 61 | 62 | /// print bullet as `- key` 63 | pub fn print_bullet_key<A: AsRef<str>>(key: A) { 64 | let bullet_str = "- ".truecolor(TITLE_R, TITLE_G, TITLE_B); 65 | let key_str = key.as_ref().white().bold(); 66 | println!("{}{}", bullet_str, key_str); 67 | } 68 | 69 | /// print bullet as `- key: value` 70 | pub fn print_bullet<A: AsRef<str>, B: AsRef<str>>(key: A, value: B) { 71 | let bullet_str = "- ".truecolor(TITLE_R, TITLE_G, TITLE_B); 72 | let key_str = key.as_ref().white().bold(); 73 | let value_str = value.as_ref().truecolor(170, 170, 170); 74 | let colon_str = ": ".truecolor(TITLE_R, TITLE_G, TITLE_B); 75 | println!("{}{}{}{}", bullet_str, key_str, colon_str, value_str); 76 | } 77 | 78 | /// print bullet as `- key
(value)` 79 | pub fn print_bullet_parenthetical<A: AsRef<str>, B: AsRef<str>>(key: A, value: B) { 80 | let bullet_str = "- ".truecolor(TITLE_R, TITLE_G, TITLE_B); 81 | let key_str = key.as_ref().white().bold(); 82 | let value_str = value.as_ref().truecolor(170, 170, 170); 83 | println!("{}{} ({})", bullet_str, key_str, value_str); 84 | } 85 | 86 | /// print bullet as `  - key: value` 87 | pub fn print_bullet_indent<A: AsRef<str>, B: AsRef<str>>(key: A, value: B, indent: usize) { 88 | let bullet_str = "- ".truecolor(TITLE_R, TITLE_G, TITLE_B); 89 | let key_str = key.as_ref().white().bold(); 90 | let value_str = value.as_ref().truecolor(170, 170, 170); 91 | let colon_str = ": ".truecolor(TITLE_R, TITLE_G, TITLE_B); 92 | println!( 93 | "{}{}{}{}{}", 94 | " ".repeat(indent), 95 | bullet_str, 96 | key_str, 97 | colon_str, 98 | value_str 99 | ); 100 | } 101 | 102 | /// print bullet as `  - key` 103 | pub fn print_bullet_key_indent<A: AsRef<str>>(key: A, indent: usize) { 104 | let bullet_str = "- ".truecolor(TITLE_R, TITLE_G, TITLE_B); 105 | let key_str = key.as_ref().white().bold(); 106 | println!("{}{}{}", " ".repeat(indent), bullet_str, key_str,); 107 | } 108 | -------------------------------------------------------------------------------- /crates/tbl-core/src/parquet/parquet_drop.rs: -------------------------------------------------------------------------------- 1 | use arrow::datatypes::Schema; 2 | use arrow::record_batch::RecordBatch; 3 | use futures::stream::StreamExt; 4 | use parquet::arrow::arrow_writer::ArrowWriter; 5 | use parquet::arrow::async_reader::ParquetRecordBatchStreamBuilder; 6 | use parquet::file::properties::WriterProperties; 7 | use std::io::BufWriter; 8 | use std::path::PathBuf; 9 | use std::sync::Arc; 10 | use tokio::fs::File; 11 | use tokio::io::AsyncWriteExt; 12 | 13 | /// create temporary path target similar to the final target path 14 | pub fn create_tmp_target(path: &std::path::Path) -> PathBuf { 15 | let mut new_path = path.to_path_buf(); 16 | let suffix = "_tmp"; 17 | if let Some(stem) = path.file_stem() { 18 | let mut new_stem = stem.to_string_lossy().into_owned(); 19 | new_stem.push_str(suffix); 20 | if let Some(extension) = path.extension() { 21 | new_stem.push('.'); 22 | new_stem.push_str(&extension.to_string_lossy()); 23 | } 24 | new_path.set_file_name(new_stem); 25 | } 26 | 27 | new_path 28 | } 29 | 30 | /// drop columns from parquet file 31 | pub async fn drop_parquet_columns( 32 | input_path: PathBuf, 33 | output_path: PathBuf, 34 | columns_to_drop: Vec<String>, 35 | batch_size: usize, 36 | ) -> Result<(), crate::TblError> { 37 | let input_file = File::open(input_path).await?; 38 | let tmp_output_path = create_tmp_target(output_path.as_path()); 39 | let mut output_file = File::create(&tmp_output_path).await?; 40 | let builder = ParquetRecordBatchStreamBuilder::new(input_file) 41 | .await?
42 | .with_batch_size(batch_size); 43 | let mut reader_stream = builder.build()?; 44 | let original_schema = reader_stream.schema().clone(); 45 | 46 | // Create new schema without dropped columns 47 | let new_schema = Arc::new(Schema::new( 48 | original_schema 49 | .fields() 50 | .iter() 51 | .filter_map(|field| { 52 | if !columns_to_drop.contains(field.name()) { 53 | Some(field.clone()) 54 | } else { 55 | None 56 | } 57 | }) 58 | .collect::<Vec<_>>(), 59 | )); 60 | 61 | let writer_props = WriterProperties::builder().build(); 62 | let mut buffer = Vec::new(); 63 | let mut arrow_writer = ArrowWriter::try_new( 64 | BufWriter::new(&mut buffer), 65 | new_schema.clone(), 66 | Some(writer_props), 67 | )?; 68 | 69 | while let Some(batch) = reader_stream.next().await { 70 | let batch = batch?; 71 | let new_columns = batch 72 | .columns() 73 | .iter() 74 | .enumerate() 75 | .filter_map(|(i, col)| { 76 | if !columns_to_drop.contains(original_schema.field(i).name()) { 77 | Some(col.clone()) 78 | } else { 79 | None 80 | } 81 | }) 82 | .collect::<Vec<_>>(); 83 | 84 | let new_batch = RecordBatch::try_new(new_schema.clone(), new_columns)?; 85 | arrow_writer.write(&new_batch)?; 86 | } 87 | 88 | arrow_writer.close()?; 89 | output_file.write_all(&buffer).await?; 90 | output_file.flush().await?; 91 | 92 | std::fs::rename(tmp_output_path, output_path)?; 93 | 94 | Ok(()) 95 | } 96 | 97 | /// drop columns from multiple parquet files 98 | pub async fn drop_parquets_columns( 99 | input_output_paths: Vec<(PathBuf, PathBuf)>, 100 | columns_to_drop: Vec<String>, 101 | batch_size: usize, 102 | max_concurrent: usize, 103 | ) -> Result<(), crate::TblError> { 104 | let semaphore = Arc::new(tokio::sync::Semaphore::new(max_concurrent)); 105 | 106 | let results = futures::stream::iter(input_output_paths) 107 | .map(|(input, output)| { 108 | let columns_to_drop = columns_to_drop.clone(); 109 | let sem = Arc::clone(&semaphore); 110 | async move { 111 | let _permit = sem.acquire().await?; 112 | drop_parquet_columns(input, output, columns_to_drop, batch_size).await 113 | } 114 | }) 115 | .buffer_unordered(max_concurrent) 116 | .collect::<Vec<_>>() 117 | .await; 118 | 119 | // Check if any operations failed 120 | for result in results { 121 | result?; 122 | } 123 | 124 | Ok(()) 125 | } 126 | -------------------------------------------------------------------------------- /crates/tbl-cli/src/summary.rs: -------------------------------------------------------------------------------- 1 | use crate::{DataArgs, OutputMode, TblCliError}; 2 | use std::path::{Path, PathBuf}; 3 | use tbl_core::formats::{print_bullet, print_header}; 4 | 5 | pub(crate) async fn print_summary( 6 | inputs_and_outputs: &[(Vec<PathBuf>, Option<PathBuf>)], 7 | output_mode: &OutputMode, 8 | args: &DataArgs, 9 | ) -> Result<(), TblCliError> { 10 | let mut n_input_files = 0; 11 | let mut all_input_files = Vec::new(); 12 | let mut _n_output_files = 0; 13 | for (input_files, output_file) in inputs_and_outputs.iter() { 14 | n_input_files += input_files.len(); 15 | all_input_files.extend(input_files.iter().map(|p| p.as_path())); 16 | if output_file.is_some() { 17 | _n_output_files += 1; 18 | } 19 | } 20 | 21 | // compute total size of input files 22 | let n_input_bytes = tbl_core::filesystem::get_total_bytes_of_files(&all_input_files).await?; 23 | 24 | print_input_summary(n_input_files, &all_input_files, n_input_bytes, args); 25 | println!(); 26 | println!(); 27 | print_transform_summary(args); 28 | println!(); 29 | println!(); 30 | print_output_mode_summary(n_input_files, output_mode, args); 31 | Ok(()) 32 | }
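// [editor's note] A minimal usage sketch, not part of the original source: `print_summary` is the entry point of this module, and cli/subcommands/data.rs invokes it before any processing, roughly as `let io = gather_inputs_and_outputs(&output_mode, &args)?; crate::summary::print_summary(&io, &output_mode, &args).await?;` — see the real call site in data.rs below.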
33 | 34 | fn print_input_summary( 35 | n_input_files: usize, 36 | input_files: &[&Path], 37 | n_input_bytes: u64, 38 | _args: &DataArgs, 39 | ) { 40 | print_header("Inputs"); 41 | print_bullet( 42 | "n_input_bytes", 43 | tbl_core::formats::format_bytes(n_input_bytes), 44 | ); 45 | print_bullet( 46 | "n_input_files", 47 | tbl_core::formats::format_with_commas(n_input_files as u64), 48 | ); 49 | 50 | let n_show_files = 10; 51 | for path in input_files.iter().take(n_show_files) { 52 | let path: String = path.to_string_lossy().to_string(); 53 | tbl_core::formats::print_bullet_key_indent(path, 4); 54 | } 55 | if input_files.len() > n_show_files { 56 | tbl_core::formats::print_bullet_key_indent("...", 4); 57 | } 58 | } 59 | 60 | fn print_transform_summary(args: &DataArgs) { 61 | print_header("Transformations"); 62 | let mut transforming = false; 63 | if let Some(with_columns) = &args.with_columns { 64 | print_bullet("adding columns", format!("{:?}", with_columns)); 65 | transforming = true; 66 | } 67 | if let Some(filter) = &args.filter { 68 | print_bullet("filtering rows", format!("{:?}", filter)); 69 | transforming = true; 70 | } 71 | if let Some(drop) = &args.drop { 72 | print_bullet("dropping columns", format!("{:?}", drop)); 73 | transforming = true; 74 | } 75 | if let Some(cast) = &args.cast { 76 | print_bullet("casting types", format!("{:?}", cast)); 77 | transforming = true; 78 | } 79 | if !transforming { 80 | println!("[no transformations]"); 81 | } 82 | } 83 | 84 | fn print_output_mode_summary(n_input_files: usize, output_mode: &OutputMode, args: &DataArgs) { 85 | print_header("Outputs"); 86 | match output_mode { 87 | OutputMode::PrintToStdout => { 88 | print_bullet("output_mode", "PRINT_TO_STDOUT"); 89 | let summary = format!("loading {} files and printing to stdout", n_input_files); 90 | print_bullet("summary", summary); 91 | } 92 | OutputMode::SaveToSingleFile => { 93 | print_bullet("output_mode", "SAVE_TO_ONE_FILE"); 94 | let summary = format!( 95 | "loading {} files and merging result into 1 output file", 96 | n_input_files 97 | ); 98 | print_bullet("summary", summary); 99 | if let Some(output_file) = &args.output_file { 100 | print_bullet("output_file", output_file.to_string_lossy()); 101 | } 102 | } 103 | OutputMode::SaveToDirectory => { 104 | print_bullet("output_mode", "SAVE_TO_NEW_DIR"); 105 | let summary = format!( 106 | "loading {} files and saving results to new directory", 107 | n_input_files 108 | ); 109 | print_bullet("summary", summary); 110 | if let Some(output_dir) = &args.output_dir { 111 | print_bullet("output_dir", output_dir.to_string_lossy()); 112 | } 113 | } 114 | OutputMode::ModifyInplace => { 115 | print_bullet("output_mode", "MODIFY_INPLACE"); 116 | let summary = format!("modifying {} files in-place", n_input_files); 117 | print_bullet("summary", summary); 118 | } 119 | OutputMode::Partition => { 120 | print_bullet("output_mode", "REPARTITION"); 121 | let summary = format!("repartitioning {} files", n_input_files); 122 | print_bullet("summary", summary); 123 | } 124 | OutputMode::InteractiveLf => { 125 | print_bullet("output_mode", "INTERACTIVE"); 126 | let summary = format!( 127 | "starting interactive session, loading {} files into LazyFrame", 128 | n_input_files 129 | ); 130 | print_bullet("summary", summary); 131 | } 132 | OutputMode::InteractiveDf => { 133 | print_bullet("output_mode", "INTERACTIVE"); 134 | let summary = format!( 135 | "starting interactive session, loading {} files into DataFrame", 136 | n_input_files 137 | ); 138 |
print_bullet("summary", summary); 139 | } 140 | } 141 | } 142 | -------------------------------------------------------------------------------- /crates/tbl-cli/src/cli/subcommands/data.rs: -------------------------------------------------------------------------------- 1 | use crate::{DataArgs, OutputMode, TblCliError}; 2 | use std::path::PathBuf; 3 | use tbl_core::filesystem::{get_input_paths, get_output_paths, OutputPathSpec}; 4 | 5 | pub(crate) async fn data_command(args: DataArgs) -> Result<(), TblCliError> { 6 | inquire::set_global_render_config(crate::styles::get_render_config()); 7 | 8 | // decide output mode 9 | let output_mode = decide_output_mode(&args)?; 10 | 11 | // create input output pairs 12 | let io = gather_inputs_and_outputs(&output_mode, &args)?; 13 | 14 | // print data summary 15 | if !args.no_summary { 16 | crate::summary::print_summary(&io, &output_mode, &args).await?; 17 | } 18 | 19 | // exit early as needed 20 | exit_early_if_needed(args.dry, args.confirm, !args.no_summary, &output_mode, &io); 21 | 22 | // process each input output pair 23 | for (input_paths, output_path) in io.into_iter() { 24 | process_io(input_paths, output_path, &output_mode, &args)? 25 | } 26 | 27 | Ok(()) 28 | } 29 | 30 | fn decide_output_mode(args: &DataArgs) -> Result { 31 | match ( 32 | args.inplace, 33 | &args.output_file, 34 | &args.output_dir, 35 | &args.partition, 36 | args.df, 37 | args.lf, 38 | ) { 39 | (false, None, None, None, false, false) => Ok(OutputMode::PrintToStdout), 40 | (true, None, None, None, false, false) => Ok(OutputMode::ModifyInplace), 41 | (false, Some(_), None, None, false, false) => Ok(OutputMode::SaveToSingleFile), 42 | (false, None, Some(_), None, false, false) => Ok(OutputMode::SaveToDirectory), 43 | (false, None, _, Some(_), false, false) => Ok(OutputMode::Partition), 44 | (false, None, None, None, true, false) => Ok(OutputMode::InteractiveDf), 45 | (false, None, None, None, false, true) => Ok(OutputMode::InteractiveLf), 46 | _ => Err(TblCliError::Error( 47 | "can only specify one output mode".to_string(), 48 | )), 49 | } 50 | } 51 | 52 | #[allow(clippy::type_complexity)] 53 | fn gather_inputs_and_outputs( 54 | output_mode: &OutputMode, 55 | args: &DataArgs, 56 | ) -> Result, Option)>, TblCliError> { 57 | // parse input output pairs 58 | let mut io = Vec::new(); 59 | match output_mode { 60 | OutputMode::PrintToStdout 61 | | OutputMode::Partition 62 | | OutputMode::InteractiveLf 63 | | OutputMode::InteractiveDf => { 64 | let input_paths = get_input_paths(&args.paths, args.tree, true)?; 65 | io.push((input_paths, None)) 66 | } 67 | OutputMode::SaveToSingleFile => { 68 | let input_paths = get_input_paths(&args.paths, args.tree, true)?; 69 | io.push((input_paths, args.output_file.clone())) 70 | } 71 | OutputMode::ModifyInplace => { 72 | let input_paths = get_input_paths(&args.paths, args.tree, true)?; 73 | for input_path in input_paths.into_iter() { 74 | io.push(([input_path.clone()].to_vec(), Some(input_path))) 75 | } 76 | } 77 | OutputMode::SaveToDirectory => { 78 | if let Some(output_dir) = args.output_dir.clone() { 79 | let _ = std::fs::create_dir(output_dir); 80 | }; 81 | let output_spec = OutputPathSpec { 82 | inputs: args.paths.clone(), 83 | output_dir: args.output_dir.clone(), 84 | tree: args.tree, 85 | file_prefix: args.output_prefix.clone(), 86 | file_postfix: args.output_postfix.clone(), 87 | sort: true, 88 | }; 89 | let (input_paths, output_paths) = get_output_paths(output_spec)?; 90 | for (input_path, output_path) in 
input_paths.into_iter().zip(output_paths) { 91 | io.push(([input_path].to_vec(), Some(output_path))) 92 | } 93 | } 94 | }; 95 | 96 | // filter empty io pairs 97 | let io = io 98 | .into_iter() 99 | .filter(|(inputs, _)| !inputs.is_empty()) 100 | .collect(); 101 | 102 | Ok(io) 103 | } 104 | 105 | fn exit_early_if_needed( 106 | dry: bool, 107 | confirm: bool, 108 | summary: bool, 109 | output_mode: &OutputMode, 110 | io: &[(Vec<PathBuf>, Option<PathBuf>)], 111 | ) { 112 | // exit if performing dry run 113 | if dry { 114 | if summary { 115 | println!(); 116 | println!(); 117 | tbl_core::formats::print_header("Data") 118 | } 119 | println!("[dry run, exiting]"); 120 | std::process::exit(0); 121 | } 122 | 123 | // exit if no files selected 124 | if io.is_empty() { 125 | if summary { 126 | println!(); 127 | println!(); 128 | tbl_core::formats::print_header("Data") 129 | } 130 | println!("[no tabular files selected]"); 131 | std::process::exit(0) 132 | }; 133 | 134 | // exit if user does not confirm write operations 135 | if output_mode.writes_to_disk() & !confirm { 136 | if summary { 137 | println!(); 138 | println!(); 139 | } 140 | let prompt = "continue? "; 141 | if let Ok(true) = inquire::Confirm::new(prompt).with_default(false).prompt() { 142 | } else { 143 | println!("[exiting]"); 144 | std::process::exit(0) 145 | } 146 | } 147 | } 148 | 149 | fn process_io( 150 | input_paths: Vec<PathBuf>, 151 | output_path: Option<PathBuf>, 152 | output_mode: &OutputMode, 153 | args: &DataArgs, 154 | ) -> Result<(), TblCliError> { 155 | // create lazy frame 156 | let lf = tbl_core::parquet::create_lazyframe(&input_paths)?; 157 | 158 | // transform into output frames 159 | let lf = crate::transform::apply_transformations(lf, args)?; 160 | 161 | // output data 162 | crate::output::output_lazyframe(lf, input_paths, output_path, output_mode, args) 163 | } 164 | -------------------------------------------------------------------------------- /crates/tbl-cli/src/output.rs: -------------------------------------------------------------------------------- 1 | use crate::styles::FontStyle; 2 | use crate::{DataArgs, OutputMode, TblCliError}; 3 | use color_print::cstr; 4 | use polars::prelude::*; 5 | use std::io::stdout; 6 | use std::path::PathBuf; 7 | use toolstr::Colorize; 8 | 9 | pub(crate) fn output_lazyframe( 10 | lf: LazyFrame, 11 | input_paths: Vec<PathBuf>, 12 | output_path: Option<PathBuf>, 13 | output_mode: &OutputMode, 14 | args: &DataArgs, 15 | ) -> Result<(), TblCliError> { 16 | match output_mode { 17 | OutputMode::PrintToStdout => print_lazyframe(lf, args), 18 | OutputMode::SaveToSingleFile => save_lf_to_disk(lf, output_path, args), 19 | OutputMode::SaveToDirectory => save_lf_to_disk(lf, output_path, args), 20 | OutputMode::ModifyInplace => save_lf_to_disk(lf, output_path, args), 21 | OutputMode::Partition => partition_data(lf, input_paths, args), 22 | OutputMode::InteractiveLf => enter_interactive_session(lf, input_paths, args), 23 | OutputMode::InteractiveDf => enter_interactive_session(lf, input_paths, args), 24 | } 25 | } 26 | 27 | fn print_lazyframe(lf: LazyFrame, args: &DataArgs) -> Result<(), TblCliError> { 28 | let df = lf.collect()?; 29 | 30 | let mut df = match args.hex { 31 | true => binary_to_hex(&mut df.clone())?, 32 | false => df, 33 | }; 34 | 35 | if !args.no_summary { 36 | println!(); 37 | println!(); 38 | tbl_core::formats::print_header("Data"); 39 | }; 40 | 41 | let n_show = match &args.n { 42 | Some(n) if n == "all" => df.height(), 43 | Some(n) => n.parse::<usize>()?, 44 | None => 20, 45 | }; 46 | let n_missing = if df.height() >= n_show {
df.height() - n_show 48 | } else { 49 | 0 50 | }; 51 | 52 | if args.csv { 53 | let df = binary_to_hex(&mut df)?; 54 | print_dataframe_as_csv(&df, n_show)?; 55 | } else if args.json | args.jsonl { 56 | let df = binary_to_hex(&mut df)?; 57 | print_dataframe_as_json(&df, n_show, args.jsonl)?; 58 | } else { 59 | let df = df.head(Some(n_show)); 60 | println!("{}", df); 61 | }; 62 | 63 | if n_missing > 0 { 64 | println!( 65 | "{} rows omitted, use {} to show all rows", 66 | n_missing.to_string().colorize_constant().bold(), 67 | cstr!("-n all") 68 | ); 69 | } 70 | 71 | Ok(()) 72 | } 73 | 74 | fn print_dataframe_as_csv(df: &DataFrame, n: usize) -> Result<(), PolarsError> { 75 | let mut writer = CsvWriter::new(stdout()); 76 | let df: DataFrame = df.head(Some(n)); 77 | writer.finish(&mut df.clone()) 78 | } 79 | 80 | fn print_dataframe_as_json(df: &DataFrame, n: usize, jsonl: bool) -> Result<(), PolarsError> { 81 | let mut writer = JsonWriter::new(stdout()); 82 | 83 | if !jsonl { 84 | writer = writer.with_json_format(polars::prelude::JsonFormat::Json); 85 | }; 86 | 87 | let df: DataFrame = df.head(Some(n)); 88 | let result = writer.finish(&mut df.clone()); 89 | 90 | if !jsonl { 91 | println!() 92 | }; 93 | 94 | result 95 | } 96 | 97 | fn binary_to_hex(df: &mut DataFrame) -> Result<DataFrame, PolarsError> { 98 | let mut df = df.clone(); 99 | 100 | let binary_columns: Vec<String> = df 101 | .get_columns() 102 | .iter() 103 | .filter_map(|s| { 104 | if matches!(s.dtype(), DataType::Binary) { 105 | Some(s.name().to_string()) 106 | } else { 107 | None 108 | } 109 | }) 110 | .collect(); 111 | 112 | for col_name in binary_columns { 113 | let hex_col_with_prefix = df 114 | .clone() 115 | .lazy() 116 | .select(&[ 117 | concat_str([lit("0x"), col(&col_name).binary().hex_encode()], "", true) 118 | .alias(&col_name), 119 | ]) 120 | .collect()? 121 | .column(&col_name)? 122 | .clone(); 123 | 124 | df = df.with_column(hex_col_with_prefix)?.clone(); 125 | } 126 | 127 | Ok(df) 128 | } 129 | 130 | fn save_lf_to_disk( 131 | lf: LazyFrame, 132 | output_path: Option<PathBuf>, 133 | args: &DataArgs, 134 | ) -> Result<(), TblCliError> { 135 | let output_path = match output_path { 136 | Some(output_path) => output_path, 137 | None => return Err(TblCliError::Error("no output path specified".to_string())), 138 | }; 139 | 140 | // Create a temporary path by appending "_tmp" to the original path 141 | let tmp_path = output_path.with_file_name(format!( 142 | "{}_tmp", 143 | output_path 144 | .file_name() 145 | .ok_or_else(|| TblCliError::Error("File name is missing".to_string()))? 146 | .to_str() 147 | .ok_or_else(|| TblCliError::Error("File name is not valid UTF-8".to_string()))?
148 | )); 149 | 150 | // Write to the temporary file 151 | if output_path.extension().map_or(false, |ext| ext == "csv") || args.csv { 152 | let options = CsvWriterOptions::default(); 153 | lf.sink_csv(&tmp_path, options)?; 154 | } else if output_path.extension().map_or(false, |ext| ext == "json") || args.json { 155 | let options = JsonWriterOptions::default(); 156 | lf.sink_json(&tmp_path, options)?; 157 | } else { 158 | let options = ParquetWriteOptions::default(); 159 | let result = lf.clone().sink_parquet(&tmp_path, options); 160 | if result.is_err() { 161 | // sink_parquet() is still missing some options, so if it fails use backup 162 | let file = std::fs::File::create(&tmp_path)?; 163 | let writer = ParquetWriter::new(file) 164 | .with_compression(ParquetCompression::Snappy) 165 | .with_statistics(StatisticsOptions { 166 | min_value: true, 167 | max_value: true, 168 | distinct_count: true, 169 | null_count: true, 170 | }); 171 | writer.finish(&mut lf.clone().collect()?)?; 172 | } 173 | }; 174 | 175 | // Move the temporary file to the final output path 176 | std::fs::rename(&tmp_path, &output_path).map_err(|e| TblCliError::Error(e.to_string()))?; 177 | 178 | Ok(()) 179 | } 180 | 181 | fn partition_data( 182 | _lf: LazyFrame, 183 | _input_paths: Vec<PathBuf>, 184 | _args: &DataArgs, 185 | ) -> Result<(), TblCliError> { 186 | Err(TblCliError::Error( 187 | "partition functionality not implemented".to_string(), 188 | )) 189 | } 190 | 191 | fn enter_interactive_session( 192 | _lf: LazyFrame, 193 | input_paths: Vec<PathBuf>, 194 | args: &DataArgs, 195 | ) -> Result<(), TblCliError> { 196 | crate::python::load_df_interactive(input_paths, args.lf, args.executable.clone()) 197 | } 198 | -------------------------------------------------------------------------------- /crates/tbl-core/src/parquet/parquet_summary.rs: -------------------------------------------------------------------------------- 1 | use crate::TblError; 2 | use futures::stream::{self, StreamExt}; 3 | use parquet::arrow::async_reader::ParquetRecordBatchStreamBuilder; 4 | use polars::prelude::*; 5 | use std::collections::HashMap; 6 | 7 | /// get the number of rows in a parquet file 8 | pub async fn get_parquet_row_count(path: &std::path::Path) -> Result<u64, TblError> { 9 | let file = tokio::fs::File::open(path).await?; 10 | let builder = ParquetRecordBatchStreamBuilder::new(file) 11 | .await? 12 | .with_batch_size(1); 13 | let file_metadata = builder.metadata().file_metadata(); 14 | Ok(file_metadata.num_rows() as u64) 15 | } 16 | 17 | /// get the number of rows in multiple parquet files 18 | pub async fn get_parquet_row_counts(paths: &[&std::path::Path]) -> Result<Vec<u64>, TblError> { 19 | let row_counts = stream::iter(paths) 20 | .map(|path| get_parquet_row_count(path)) 21 | .buffered(10) 22 | .collect::<Vec<Result<u64, TblError>>>() 23 | .await; 24 | 25 | row_counts 26 | .into_iter() 27 | .collect::<Result<Vec<u64>, TblError>>() 28 | } 29 | 30 | /// get parquet schema 31 | pub async fn get_parquet_schema(path: &std::path::Path) -> Result<Arc<Schema>, TblError> { 32 | let path = path.to_path_buf(); 33 | tokio::task::spawn_blocking(move || { 34 | let scan_args = ScanArgsParquet::default(); 35 | let mut lf = LazyFrame::scan_parquet(path, scan_args)?; 36 | let schema = lf.schema()?; 37 | Ok(schema) 38 | }) 39 | .await?
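// [editor's note] `LazyFrame::scan_parquet` and `schema()` are blocking polars calls, so they run on tokio's blocking thread pool via the `spawn_blocking` above; the awaited `JoinError` converts into `TblError::TokioJoinError` through its `#[from]` impl.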
40 | } 41 | 42 | /// get parquet schemas 43 | pub async fn get_parquet_schemas( 44 | paths: &[std::path::PathBuf], 45 | ) -> Result<Vec<Arc<Schema>>, TblError> { 46 | let schemas = stream::iter(paths) 47 | .map(|path| get_parquet_schema(path)) 48 | .buffered(10) 49 | .collect::<Vec<Result<Arc<Schema>, TblError>>>() 50 | .await; 51 | 52 | schemas 53 | .into_iter() 54 | .collect::<Result<Vec<Arc<Schema>>, TblError>>() 55 | } 56 | 57 | /// TabularSummary 58 | #[derive(Clone, Default)] 59 | pub struct TabularSummary { 60 | /// n_files 61 | pub n_files: u64, 62 | /// n_bytes_compressed 63 | pub n_bytes_compressed: u64, 64 | /// n_bytes_uncompressed 65 | pub n_bytes_uncompressed: u64, 66 | /// n_rows 67 | pub n_rows: u64, 68 | /// schema 69 | pub schema: Arc<Schema>, 70 | /// columns 71 | pub columns: Vec<TabularColumnSummary>, 72 | } 73 | 74 | /// TabularColumnSummary 75 | #[derive(Default, Clone, Debug)] 76 | pub struct TabularColumnSummary { 77 | /// n_bytes_compressed 78 | pub n_bytes_compressed: u64, 79 | /// n_bytes_uncompressed 80 | pub n_bytes_uncompressed: u64, 81 | // /// n_null 82 | // pub n_null: u64, 83 | // /// n_unique 84 | // pub n_unique: u64, 85 | // pub min_value 86 | // pub max_value 87 | } 88 | 89 | /// get summary of parquet file 90 | pub async fn get_parquet_summary(path: &std::path::Path) -> Result<TabularSummary, TblError> { 91 | let metadata = std::fs::metadata(path)?; 92 | let n_bytes_compressed = metadata.len(); 93 | let n_rows = get_parquet_row_count(path).await?; 94 | let schema = get_parquet_schema(path).await?; 95 | 96 | let parquet_metadata = get_parquet_metadata(path).await?; 97 | let columns = get_parquet_column_summaries(parquet_metadata.clone()).await?; 98 | let n_bytes_uncompressed = get_parquet_n_bytes_uncompressed(parquet_metadata); 99 | 100 | Ok(TabularSummary { 101 | n_files: 1, 102 | n_bytes_compressed, 103 | n_bytes_uncompressed, 104 | n_rows, 105 | schema, 106 | columns, 107 | }) 108 | } 109 | 110 | /// get parquet file metadata 111 | pub async fn get_parquet_metadata( 112 | path: &std::path::Path, 113 | ) -> Result<Arc<parquet::file::metadata::ParquetMetaData>, TblError> { 114 | let file = tokio::fs::File::open(path).await?; 115 | let builder = ParquetRecordBatchStreamBuilder::new(file) 116 | .await?
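// the batch size is inert in this function: the builder is used only to read the file footer, and no record batches are ever decoded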
117 | .with_batch_size(1); 118 | Ok(builder.metadata().clone()) 119 | } 120 | 121 | /// get parquet uncompressed bytes 122 | pub fn get_parquet_n_bytes_uncompressed( 123 | metadata: Arc<parquet::file::metadata::ParquetMetaData>, 124 | ) -> u64 { 125 | metadata 126 | .row_groups() 127 | .iter() 128 | .map(|rg| rg.total_byte_size() as u64) 129 | .sum::<u64>() 130 | } 131 | 132 | /// get column summaries for parquet file 133 | pub async fn get_parquet_column_summaries( 134 | metadata: Arc<parquet::file::metadata::ParquetMetaData>, 135 | ) -> Result<Vec<TabularColumnSummary>, TblError> { 136 | let n_columns = metadata 137 | .row_groups() 138 | .first() 139 | .map(|rg| rg.columns().len()) 140 | .unwrap_or(0); 141 | let mut columns: Vec<TabularColumnSummary> = vec![TabularColumnSummary::default(); n_columns]; 142 | for rg in metadata.row_groups() { 143 | for (column, column_metadata) in columns.iter_mut().zip(rg.columns()) { 144 | column.n_bytes_compressed += column_metadata.compressed_size() as u64; 145 | column.n_bytes_uncompressed += column_metadata.uncompressed_size() as u64; 146 | } 147 | } 148 | Ok(columns) 149 | } 150 | 151 | /// get parquet summaries 152 | pub async fn get_parquet_summaries( 153 | paths: &[std::path::PathBuf], 154 | ) -> Result<Vec<TabularSummary>, TblError> { 155 | let schemas = stream::iter(paths) 156 | .map(|path| get_parquet_summary(path)) 157 | .buffered(10) 158 | .collect::<Vec<Result<TabularSummary, TblError>>>() 159 | .await; 160 | 161 | schemas 162 | .into_iter() 163 | .collect::<Result<Vec<TabularSummary>, TblError>>() 164 | } 165 | 166 | /// combine tabular summaries 167 | pub fn combine_tabular_summaries( 168 | summaries: &[&TabularSummary], 169 | include_columns: bool, 170 | ) -> Result<TabularSummary, TblError> { 171 | let mut total_summary = TabularSummary::default(); 172 | for (s, summary) in summaries.iter().enumerate() { 173 | if s == 0 { 174 | total_summary.schema = summary.schema.clone(); 175 | } 176 | total_summary.n_files += summary.n_files; 177 | total_summary.n_bytes_compressed += summary.n_bytes_compressed; 178 | total_summary.n_bytes_uncompressed += summary.n_bytes_uncompressed; 179 | total_summary.n_rows += summary.n_rows; 180 | if include_columns { 181 | total_summary.columns = combine_tabular_columns_summaries( 182 | total_summary.columns.as_slice(), 183 | summary.columns.as_slice(), 184 | )?; 185 | } 186 | } 187 | Ok(total_summary) 188 | } 189 | 190 | fn combine_tabular_columns_summaries( 191 | lhs: &[TabularColumnSummary], 192 | rhs: &[TabularColumnSummary], 193 | ) -> Result<Vec<TabularColumnSummary>, TblError> { 194 | if lhs.is_empty() { 195 | Ok(rhs.to_vec()) 196 | } else if rhs.is_empty() { 197 | Ok(lhs.to_vec()) 198 | } else if lhs.len() != rhs.len() { 199 | Err(TblError::SchemaError( 200 | "different number of columns".to_string(), 201 | )) 202 | } else { 203 | Ok(lhs 204 | .iter() 205 | .zip(rhs.iter()) 206 | .map(|(lhs, rhs)| combine_tabular_column_summary(lhs, rhs)) 207 | .collect()) 208 | } 209 | } 210 | 211 | fn combine_tabular_column_summary( 212 | lhs: &TabularColumnSummary, 213 | rhs: &TabularColumnSummary, 214 | ) -> TabularColumnSummary { 215 | TabularColumnSummary { 216 | n_bytes_compressed: lhs.n_bytes_compressed + rhs.n_bytes_compressed, 217 | n_bytes_uncompressed: lhs.n_bytes_uncompressed + rhs.n_bytes_uncompressed, 218 | } 219 | } 220 | 221 | /// summarize by schema 222 | pub fn summarize_by_schema( 223 | summaries: &[&TabularSummary], 224 | ) -> Result<HashMap<Arc<Schema>, TabularSummary>, TblError> { 225 | let mut by_schema: HashMap<Arc<Schema>, Vec<&TabularSummary>> = HashMap::new(); 226 | for summary in summaries.iter() { 227 | by_schema 228 | .entry(summary.schema.clone()) 229 | .or_default() 230 | .push(summary) 231 | } 232 | by_schema 233 | .into_iter() 234 | .map(|(k, v)| combine_tabular_summaries(v.as_slice(),
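// `true` = include_columns: merge the per-column byte statistics as well as the file-level totals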
true).map(|combined| (k, combined))) 235 | .collect() 236 | } 237 | -------------------------------------------------------------------------------- /crates/tbl-core/src/parquet/parquet_insert.rs: -------------------------------------------------------------------------------- 1 | use crate::TblError; 2 | use arrow::array::{ArrayRef, StringArray}; 3 | use arrow::array::{BinaryArray, BooleanArray, UInt32Array, UInt64Array}; 4 | use arrow::datatypes::{DataType, Field, Schema}; 5 | use arrow::record_batch::RecordBatch; 6 | use futures::stream::{self}; 7 | use futures::StreamExt; 8 | use hex; 9 | use parquet::arrow::arrow_writer::ArrowWriter; 10 | use parquet::arrow::async_reader::ParquetRecordBatchStreamBuilder; 11 | use parquet::file::properties::WriterProperties; 12 | use std::io::BufWriter as StdBufWriter; 13 | use std::path::{Path, PathBuf}; 14 | use std::sync::Arc; 15 | use tokio::fs::File; 16 | use tokio::io::AsyncWriteExt; 17 | use tokio::sync::Semaphore; 18 | 19 | /// insert columns into multiple parquet files 20 | #[allow(clippy::too_many_arguments)] 21 | pub async fn insert_parquets_columns( 22 | inputs: &[PathBuf], 23 | outputs: &[PathBuf], 24 | column_names: Vec<String>, 25 | column_dtypes: Vec<DataType>, 26 | default_values: Option<Vec<String>>, 27 | index: Option<Vec<usize>>, 28 | batch_size: usize, 29 | max_concurrent: usize, 30 | ) -> Result<(), TblError> { 31 | if inputs.len() != outputs.len() { 32 | return Err(TblError::Error( 33 | "Number of inputs must match number of outputs".to_string(), 34 | )); 35 | } 36 | 37 | let semaphore = Arc::new(Semaphore::new(max_concurrent)); 38 | 39 | let results = stream::iter(inputs.iter().zip(outputs.iter())) 40 | .map(|(input, output)| { 41 | let sem_clone = semaphore.clone(); 42 | let column_names = column_names.clone(); 43 | let column_dtypes = column_dtypes.clone(); 44 | let default_values = default_values.clone(); 45 | let index = index.clone(); 46 | 47 | async move { 48 | let _permit = sem_clone 49 | .acquire() 50 | .await 51 | .map_err(|e| TblError::Error(e.to_string()))?; 52 | 53 | insert_parquet_columns( 54 | input, 55 | output, 56 | column_names, 57 | column_dtypes, 58 | default_values, 59 | index, 60 | batch_size, 61 | ) 62 | .await 63 | } 64 | }) 65 | .buffer_unordered(max_concurrent) 66 | .collect::<Vec<_>>() 67 | .await; 68 | 69 | // Check if any of the operations resulted in an error 70 | for result in results { 71 | result?; 72 | } 73 | 74 | Ok(()) 75 | } 76 | 77 | /// Insert columns into a parquet file 78 | pub async fn insert_parquet_columns( 79 | input: &Path, 80 | output: &Path, 81 | column_names: Vec<String>, 82 | column_dtypes: Vec<DataType>, 83 | default_values: Option<Vec<String>>, 84 | index: Option<Vec<usize>>, 85 | batch_size: usize, 86 | ) -> Result<(), TblError> { 87 | if column_names.len() != column_dtypes.len() { 88 | return Err(TblError::Error( 89 | "Column names and dtypes must have the same length".to_string(), 90 | )); 91 | } 92 | 93 | if let Some(ref default_values) = default_values { 94 | if default_values.len() != column_names.len() { 95 | return Err(TblError::Error( 96 | "Default values must have the same length as column names and dtypes".to_string(), 97 | )); 98 | } 99 | } 100 | 101 | if let Some(ref index_values) = index { 102 | if index_values.len() != column_names.len() { 103 | return Err(TblError::Error( 104 | "Index values must have the same length as column names and dtypes".to_string(), 105 | )); 106 | } 107 | } 108 | 109 | let input_file = File::open(&input).await?; 110 | let builder = ParquetRecordBatchStreamBuilder::new(input_file) 111 | .await?
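// read the input as a stream of record batches of `batch_size` rows instead of materializing the whole table at once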
112 | .with_batch_size(batch_size); 113 | let mut reader_stream = builder.build()?; 114 | let original_schema = reader_stream.schema(); 115 | 116 | // Create new schema with inserted columns 117 | let mut new_fields = original_schema.fields().to_vec(); 118 | let insert_positions = index.unwrap_or_else(|| { 119 | (0..column_names.len()) 120 | .map(|i| new_fields.len() + i) 121 | .collect() 122 | }); 123 | for (i, (name, dtype)) in column_names.iter().zip(column_dtypes.iter()).enumerate() { 124 | let pos = insert_positions[i]; 125 | new_fields.insert(pos, Arc::new(Field::new(name, dtype.clone(), true))); 126 | } 127 | let new_schema = Arc::new(Schema::new(new_fields)); 128 | 129 | let tmp_output_path = super::parquet_drop::create_tmp_target(output); 130 | let mut output_file = File::create(&tmp_output_path).await?; 131 | let mut buffer = Vec::new(); 132 | 133 | let writer_props = WriterProperties::builder().build(); 134 | let mut arrow_writer = ArrowWriter::try_new( 135 | StdBufWriter::new(&mut buffer), 136 | new_schema.clone(), 137 | Some(writer_props), 138 | )?; 139 | 140 | while let Some(batch) = reader_stream.next().await { 141 | let batch = batch?; 142 | let mut new_columns = batch.columns().to_vec(); 143 | 144 | for (i, dtype) in column_dtypes.iter().enumerate() { 145 | let pos = insert_positions[i]; 146 | let default_value = default_values.as_ref().map(|values| values[i].as_str()); 147 | let new_column = create_new_column(batch.num_rows(), dtype, default_value)?; 148 | new_columns.insert(pos, new_column); 149 | } 150 | 151 | let new_batch = RecordBatch::try_new(new_schema.clone(), new_columns)?; 152 | arrow_writer.write(&new_batch)?; 153 | } 154 | 155 | arrow_writer.close()?; 156 | output_file.write_all(&buffer).await?; 157 | output_file.flush().await?; 158 | tokio::fs::rename(tmp_output_path, output).await?; 159 | 160 | Ok(()) 161 | } 162 | 163 | fn create_new_column( 164 | len: usize, 165 | dtype: &DataType, 166 | default_value: Option<&str>, 167 | ) -> Result<ArrayRef, TblError> { 168 | match dtype { 169 | DataType::Int32 => { 170 | let value = default_value 171 | .map(|v| v.parse::<i32>().map_err(|e| TblError::Error(e.to_string()))) 172 | .transpose()?; 173 | Ok(Arc::new(arrow::array::Int32Array::from(vec![value; len]))) 174 | } 175 | DataType::Int64 => { 176 | let value = default_value 177 | .map(|v| v.parse::<i64>().map_err(|e| TblError::Error(e.to_string()))) 178 | .transpose()?; 179 | Ok(Arc::new(arrow::array::Int64Array::from(vec![value; len]))) 180 | } 181 | DataType::UInt32 => { 182 | let value = default_value 183 | .map(|v| v.parse::<u32>().map_err(|e| TblError::Error(e.to_string()))) 184 | .transpose()?; 185 | Ok(Arc::new(UInt32Array::from(vec![value; len]))) 186 | } 187 | DataType::UInt64 => { 188 | let value = default_value 189 | .map(|v| v.parse::<u64>().map_err(|e| TblError::Error(e.to_string()))) 190 | .transpose()?; 191 | Ok(Arc::new(UInt64Array::from(vec![value; len]))) 192 | } 193 | DataType::Float32 => { 194 | let value = default_value 195 | .map(|v| v.parse::<f32>().map_err(|e| TblError::Error(e.to_string()))) 196 | .transpose()?; 197 | Ok(Arc::new(arrow::array::Float32Array::from(vec![value; len]))) 198 | } 199 | DataType::Float64 => { 200 | let value = default_value 201 | .map(|v| v.parse::<f64>().map_err(|e| TblError::Error(e.to_string()))) 202 | .transpose()?; 203 | Ok(Arc::new(arrow::array::Float64Array::from(vec![value; len]))) 204 | } 205 | DataType::Utf8 => { 206 | let value = default_value.unwrap_or(""); 207 | Ok(Arc::new(StringArray::from(vec![value; len]))) 208 | } 209 | DataType::Binary => { 210 |
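// binary defaults are expected as "0x"-prefixed hex strings; they are decoded to raw bytes below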
let value = default_value 211 | .map(|v| { 212 | if let Some(stripped) = v.strip_prefix("0x") { 213 | hex::decode(stripped).map_err(|e| TblError::Error(e.to_string())) 214 | } else { 215 | Err(TblError::Error( 216 | "Binary default value must start with '0x'".to_string(), 217 | )) 218 | } 219 | }) 220 | .transpose()? 221 | .unwrap_or_else(Vec::new); 222 | Ok(Arc::new(BinaryArray::from(vec![ 223 | Some(value.as_slice()); 224 | len 225 | ]))) 226 | } 227 | DataType::Boolean => { 228 | let value = default_value 229 | .map(|v| { 230 | v.parse::<bool>() 231 | .map_err(|e| TblError::Error(e.to_string())) 232 | }) 233 | .transpose()?; 234 | Ok(Arc::new(BooleanArray::from(vec![value; len]))) 235 | } 236 | // Add more data types as needed 237 | _ => Err(TblError::Error(format!( 238 | "Unsupported data type: {:?}", 239 | dtype 240 | ))), 241 | } 242 | } 243 | -------------------------------------------------------------------------------- /crates/tbl-cli/src/cli/subcommands/schema.rs: -------------------------------------------------------------------------------- 1 | use crate::styles::FontStyle; 2 | use crate::{SchemaArgs, TblCliError}; 3 | use polars::prelude::*; 4 | use std::collections::HashMap; 5 | use std::path::PathBuf; 6 | use std::sync::Arc; 7 | use tbl_core::formats::{format_bytes, format_with_commas}; 8 | use tbl_core::parquet::{combine_tabular_summaries, summarize_by_schema, TabularSummary}; 9 | use toolstr::Colorize; 10 | 11 | pub(crate) async fn schema_command(args: SchemaArgs) -> Result<(), TblCliError> { 12 | // get schemas 13 | let paths = tbl_core::filesystem::get_input_paths(&args.paths, args.tree, true)?; 14 | let summaries = tbl_core::parquet::get_parquet_summaries(&paths).await?; 15 | let ref_summaries: Vec<&tbl_core::parquet::TabularSummary> = summaries.iter().collect(); 16 | let by_schema = summarize_by_schema(ref_summaries.as_slice())?; 17 | 18 | // summarize entire set 19 | let total_summary = combine_tabular_summaries(&ref_summaries, false)?; 20 | 21 | // clear common prefix 22 | let paths = if args.absolute { 23 | paths 24 | } else { 25 | let common_prefix = tbl_core::filesystem::get_common_prefix(&paths)?; 26 | let mut new_paths = Vec::new(); 27 | for path in paths { 28 | new_paths.push(path.strip_prefix(&common_prefix)?.to_owned()) 29 | } 30 | new_paths 31 | }; 32 | 33 | // collect example paths for each schema 34 | let n_example_paths = 3; 35 | let example_paths = if args.examples { 36 | let mut example_paths = HashMap::<Arc<Schema>, Vec<PathBuf>>::new(); 37 | for (path, summary) in paths.iter().zip(summaries.iter()) { 38 | example_paths 39 | .entry(Arc::clone(&summary.schema)) 40 | .or_default() 41 | .push(path.clone()); 42 | } 43 | Some(example_paths) 44 | } else { 45 | None 46 | }; 47 | 48 | // decide how many schemas to show 49 | let n_to_show = std::cmp::min(args.n.unwrap_or(3), by_schema.len()); 50 | 51 | // decide what to sort by 52 | let sort_by = match args.sort.as_str() { 53 | "rows" => SortSchemasBy::Rows, 54 | "bytes" => SortSchemasBy::Bytes, 55 | "files" => SortSchemasBy::Files, 56 | _ => { 57 | return Err(TblCliError::Arg( 58 | "must sort by rows, bytes, or files".to_string(), 59 | )) 60 | } 61 | }; 62 | 63 | // print output 64 | print_schemas( 65 | by_schema, 66 | total_summary, 67 | n_to_show, 68 | sort_by, 69 | n_example_paths, 70 | example_paths, 71 | )?; 72 | 73 | Ok(()) 74 | } 75 | 76 | fn count_unique_schemas(schemas: &Vec<&Arc<Schema>>) -> HashMap<Arc<Schema>, usize> { 77 | let mut schema_counts = HashMap::new(); 78 | 79 | for schema in schemas { 80 | let counter =
schema_counts.entry(Arc::clone(schema)).or_insert(0); 81 | *counter += 1; 82 | } 83 | 84 | schema_counts 85 | } 86 | 87 | pub(crate) enum SortSchemasBy { 88 | Files, 89 | Bytes, 90 | Rows, 91 | } 92 | 93 | impl std::fmt::Display for SortSchemasBy { 94 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 95 | let s = match self { 96 | SortSchemasBy::Files => "files", 97 | SortSchemasBy::Bytes => "bytes", 98 | SortSchemasBy::Rows => "rows", 99 | }; 100 | write!(f, "{}", s) 101 | } 102 | } 103 | 104 | fn top_n_schemas( 105 | schema_summaries: HashMap<Arc<Schema>, TabularSummary>, 106 | n_to_show: usize, 107 | sort_by: SortSchemasBy, 108 | ) -> Vec<TabularSummary> { 109 | let mut summaries: Vec<_> = schema_summaries.values().cloned().collect(); 110 | match sort_by { 111 | SortSchemasBy::Rows => summaries.sort_by(|a, b| b.n_rows.cmp(&a.n_rows)), 112 | SortSchemasBy::Files => summaries.sort_by(|a, b| b.n_files.cmp(&a.n_files)), 113 | SortSchemasBy::Bytes => { 114 | summaries.sort_by(|a, b| b.n_bytes_compressed.cmp(&a.n_bytes_compressed)) 115 | } 116 | } 117 | summaries.into_iter().take(n_to_show).collect() 118 | } 119 | 120 | fn print_schemas( 121 | schema_summaries: HashMap<Arc<Schema>, TabularSummary>, 122 | total_summary: TabularSummary, 123 | n_to_show: usize, 124 | sort_by: SortSchemasBy, 125 | n_example_paths: usize, 126 | example_paths: Option<HashMap<Arc<Schema>, Vec<PathBuf>>>, 127 | ) -> Result<(), TblCliError> { 128 | let n_schemas = schema_summaries.len(); 129 | 130 | // print summary 131 | let schema_word = if n_schemas == 1 { "schema" } else { "schemas" }; 132 | println!( 133 | "{} unique {}, {} rows, {} files, {}", 134 | format_with_commas(n_schemas as u64).green().bold(), 135 | schema_word, 136 | format_with_commas(total_summary.n_rows).green().bold(), 137 | format_with_commas(total_summary.n_files).green().bold(), 138 | format_bytes(total_summary.n_bytes_compressed) 139 | .green() 140 | .bold(), 141 | ); 142 | println!(); 143 | if n_schemas > 1 { 144 | println!( 145 | "showing top {} schemas by number of {}:", 146 | format!("{}", n_to_show).green().bold(), 147 | sort_by, 148 | ); 149 | println!(); 150 | if example_paths.is_some() { 151 | println!(); 152 | }; 153 | } 154 | 155 | // print top schemas 156 | let format = toolstr::NumberFormat::new().percentage().precision(2); 157 | let top_n = top_n_schemas(schema_summaries, n_to_show, sort_by); 158 | for (i, summary) in top_n.into_iter().enumerate() { 159 | let file_percent = (summary.n_files as f64) / (total_summary.n_files as f64); 160 | let file_percent = format.format(file_percent)?; 161 | 162 | let row_percent = if total_summary.n_rows == 0 { 163 | 0.0 164 | } else { 165 | (summary.n_rows as f64) / (total_summary.n_rows as f64) 166 | }; 167 | let row_percent = format.format(row_percent)?; 168 | 169 | let byte_percent = if total_summary.n_bytes_compressed == 0 { 170 | 0.0 171 | } else { 172 | (summary.n_bytes_compressed as f64) / (total_summary.n_bytes_compressed as f64) 173 | }; 174 | let byte_percent = format.format(byte_percent)?; 175 | 176 | if n_schemas > 1 { 177 | println!( 178 | "{} {}{} {} rows ({}), {} files ({}), {} ({})", 179 | "Schema".colorize_title(), 180 | format!("{}", i + 1).green().bold(), 181 | ":".colorize_title(), 182 | format_with_commas(summary.n_rows).green().bold(), 183 | row_percent.green().bold(), 184 | format_with_commas(summary.n_files).green().bold(), 185 | file_percent.green().bold(), 186 | format_bytes(summary.n_bytes_compressed).green().bold(), 187 | byte_percent.green().bold(), 188 | ); 189 | println!(); 190 | } 191 |
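// render the column table (name, dtype, sizes) for this schema group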
print_schema(summary.schema.clone(), &summary)?; 192 | 193 | if let Some(example_paths) = example_paths.as_ref() { 194 | if let Some(paths_vec) = example_paths.get(&summary.schema) { 195 | println!(); 196 | if n_example_paths == 1 { 197 | println!("{}", "Example path".colorize_title()); 198 | } else { 199 | println!("{}", "Example paths".colorize_title()); 200 | }; 201 | for (i, path) in paths_vec.iter().take(n_example_paths).enumerate() { 202 | println!( 203 | "{} {}", 204 | format!("{}.", i + 1).colorize_variable(), 205 | path.to_string_lossy().colorize_comment() 206 | ); 207 | } 208 | } 209 | } 210 | 211 | if i < n_to_show - 1 { 212 | println!(); 213 | println!(); 214 | } 215 | } 216 | if n_to_show < n_schemas { 217 | println!(); 218 | println!( 219 | "{} more schemas not shown", 220 | format!("{}", n_schemas - n_to_show).bold().green() 221 | ) 222 | } 223 | 224 | Ok(()) 225 | } 226 | 227 | fn print_schema(schema: Arc<Schema>, summary: &TabularSummary) -> Result<(), TblCliError> { 228 | // gather data 229 | let names: Vec<String> = schema.iter_names().map(|x| x.to_string()).collect(); 230 | let dtypes: Vec<String> = schema.iter_dtypes().map(|x| x.to_string()).collect(); 231 | let uncompressed: Vec<_> = summary 232 | .columns 233 | .iter() 234 | .map(|x| format_bytes(x.n_bytes_uncompressed)) 235 | .collect(); 236 | let compressed: Vec<_> = summary 237 | .columns 238 | .iter() 239 | .map(|x| format_bytes(x.n_bytes_compressed)) 240 | .collect(); 241 | 242 | let total_disk_bytes: u64 = summary.columns.iter().map(|x| x.n_bytes_compressed).sum(); 243 | let percent_disk: Vec<_> = summary 244 | .columns 245 | .iter() 246 | .map(|x| { 247 | format!( 248 | "{:.2}%", 249 | 100.0 * (x.n_bytes_compressed as f64) / (total_disk_bytes as f64) 250 | ) 251 | }) 252 | .collect(); 253 | 254 | // build table 255 | let mut table = toolstr::Table::new(); 256 | table.add_column("column name", names)?; 257 | table.add_column("dtype", dtypes)?; 258 | table.add_column("full size", uncompressed)?; 259 | table.add_column("disk size", compressed)?; 260 | table.add_column("disk %", percent_disk)?; 261 | 262 | // create format 263 | let mut name_column = toolstr::ColumnFormatShorthand::default().name("column name"); 264 | let mut dtype_column = toolstr::ColumnFormatShorthand::default().name("dtype"); 265 | let mut uncompressed_column = toolstr::ColumnFormatShorthand::default().name("full size"); 266 | let mut compressed_column = toolstr::ColumnFormatShorthand::default().name("disk size"); 267 | let mut disk_percent_column = toolstr::ColumnFormatShorthand::default().name("disk %"); 268 | name_column.font_style = Some("".colorize_function().into()); 269 | dtype_column.font_style = Some("".colorize_variable().into()); 270 | uncompressed_column.font_style = Some("".colorize_constant().into()); 271 | compressed_column.font_style = Some("".colorize_constant().into()); 272 | disk_percent_column.font_style = Some("".colorize_constant().into()); 273 | 274 | let mut format = toolstr::TableFormat { 275 | // indent: 4, 276 | label_font_style: Some("".colorize_title().into()), 277 | border_font_style: Some("".colorize_comment().into()), 278 | ..Default::default() 279 | }; 280 | format.add_column(name_column); 281 | format.add_column(dtype_column); 282 | format.add_column(compressed_column); 283 | format.add_column(uncompressed_column); 284 | format.add_column(disk_percent_column); 285 | 286 | // print table 287 | format.print(table)?; 288 | 289 | Ok(()) 290 | } 291 | --------------------------------------------------------------------------------
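A quick way to exercise the `schema` subcommand implemented above (an illustrative invocation only; the glob is hypothetical, and the flags are those documented in the help text further below):

```bash
# group files by schema, sort the groups by row count,
# and print up to 5 groups with example paths for each
tbl schema data/*.parquet --sort rows --n 5 --examples
```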
/LICENSE-APACHE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. 
Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 
179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # tbl ┳━┳ 3 | 4 | `tbl` is a cli tool for reading and editing parquet files 5 | 6 | #### Goals of `tbl`: 7 | - be a swiss army knife for reading/editing parquet (kind of like [`jq`](https://github.com/jqlang/jq) is for JSON) 8 | - make it effortless to manage multi-file multi-schema parquet datasets 9 | - use a cli-native version of [polars](https://github.com/pola-rs/polars) syntax, so if you know python polars you already mostly know `tbl` 10 | 11 | #### Example use cases: 12 | - quickly look up schemas, row counts, and per-column storage usage 13 | - migrate from one schema to another, like add/remove/rename a column 14 | - perform these operations on multiple files in parallel 15 | 16 | 17 | To discuss `tbl`, check out the [Paradigm Data Tools](https://t.me/paradigm_data) telegram group. 18 | 19 | 20 | ## Contents 21 | 1. [Installation](#installation) 22 | 2. [Example Usage](#example-usage) 23 | 1. [Listing files](#listing-files) 24 | 2. [Looking up schemas](#looking-up-schemas) 25 | 3. [Selecting input files](#selecting-input-files) 26 | 4. [Performing edits](#performing-edits) 27 | 5. [Selecting output mode](#selecting-output-mode) 28 | 4. [API Reference](#api-reference) 29 | 1. [`tbl`](#tbl) 30 | 2. [`tbl ls`](#tbl-ls) 31 | 3. [`tbl schema`](#tbl-schema) 32 | 6. [FAQ](#faq) 33 | 1. [What is parquet?](#what-is-parquet) 34 | 2. [What other parquet cli tools exist?](#what-other-parquet-cli-tools-exist) 35 | 3. [Why use `tbl` when `duckdb` has a cli?](#why-use-tbl-when-duckdb-has-a-cli) 36 | 4. [What is the plan for `tbl`?](#what-is-the-plan-for-tbl) 37 | 38 | ## Installation 39 | 40 | ##### Install from crates.io 41 | ```bash 42 | cargo install tbl-cli 43 | ``` 44 | 45 | ##### Install from source 46 | ```bash 47 | git clone https://github.com/paradigmxyz/tbl 48 | cd tbl 49 | cargo install --path crates/tbl-cli 50 | ``` 51 | 52 | ## Example Usage 53 | 54 | ### Listing files 55 | 56 | `tbl` can list files and display their statistics, similar to the `ls` cli command. 
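For a recursive listing with the summary sorted by row count, the flags documented in the reference below can be combined (an illustrative invocation):

```bash
tbl ls --tree --sort rows
```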
57 | 58 | The command `tbl ls` produces output: 59 | 60 | ``` 61 | blocks__00000000_to_00000999.parquet 62 | blocks__00001000_to_00001999.parquet 63 | blocks__00002000_to_00002999.parquet 64 | blocks__00003000_to_00003999.parquet 65 | blocks__00004000_to_00004999.parquet 66 | blocks__00005000_to_00005999.parquet 67 | blocks__00006000_to_00006999.parquet 68 | blocks__00007000_to_00007999.parquet 69 | blocks__00008000_to_00008999.parquet 70 | blocks__00009000_to_00009999.parquet 71 | ... 19,660 files not shown 72 | 19,041,325 rows stored in 1.05 GB across 19,708 tabular files 73 | ``` 74 | 75 | See full list of `tbl ls` options [below](#tbl-ls). 76 | 77 | ### Looking up schemas 78 | 79 | `tbl` can display the schemas of parquet files. 80 | 81 | The command `tbl schema` produces output: 82 | 83 | ``` 84 | 1 unique schema, 19,041,325 rows, 19,708 files, 1.05 GB 85 | 86 | column name │ dtype │ disk size │ full size │ disk % 87 | ──────────────────┼──────────┼─────────────┼─────────────┼──────── 88 | block_hash │ binary │ 649.97 MB │ 657.93 MB │ 63.78% 89 | author │ binary │ 40.52 MB │ 40.59 MB │ 3.98% 90 | block_number │ u32 │ 76.06 MB │ 75.75 MB │ 7.46% 91 | gas_used │ u64 │ 84.23 MB │ 133.29 MB │ 8.26% 92 | extra_data │ binary │ 46.66 MB │ 76.91 MB │ 4.58% 93 | timestamp │ u32 │ 76.06 MB │ 75.75 MB │ 7.46% 94 | base_fee_per_gas │ u64 │ 41.85 MB │ 49.58 MB │ 4.11% 95 | chain_id │ u64 │ 3.74 MB │ 3.70 MB │ 0.37% 96 | ``` 97 | 98 | See full list of `tbl schema` options [below](#tbl-schema). 99 | 100 | ### Selecting input files 101 | 102 | `tbl` can operate on one file, or many files across multiple directories. 103 | 104 | These input selection options can be used with each `tbl` subcommand: 105 | 106 | | input selection | command | 107 | | --- | --- | 108 | | Select all tabular files in current directory | `tbl` (default behavior) | 109 | | Select a single file | `tbl /path/to/file.parquet` | 110 | | Select files using a glob | `tbl *.parquet` | 111 | | Select files from multiple directories | `tbl /path/to/dir1 /path/to/dir2` | 112 | | Select files recursively | `tbl /path/to/dir --tree` | 113 | 114 | ### Performing edits 115 | 116 | `tbl` can perform many different operations on the selected files: 117 | 118 | | operation | command | 119 | | --- | --- | 120 | | Rename a column | `tbl --rename old_name=new_name` | 121 | | Cast to a new type | `tbl --cast col1=u64 col2=String` | 122 | | Add new columns | `tbl --with-columns name:String date:Date=2024-01-01` | 123 | | Drop columns | `tbl --drop col1 col2 col3` | 124 | | Filter rows | `tbl --filter col1=val1`
`tbl --filter col1!=val1`
`tbl --filter "col1>val1"`
`tbl --filter "col1 `tbl --filter "col1>=val1"`
`tbl --filter "col1<=val1"` | 125 | | Sort rows | `tbl --sort col1 col2:desc` | 126 | | Select columns | `tbl --select col1 col2 col3` | 127 | 128 | See full list of transformation operations [below](#tbl). 129 | 130 | ### Selecting output mode 131 | 132 | `tbl` can output its results in many different modes: 133 | 134 | | output mode | description | command | 135 | | --- | --- | --- | 136 | | Single File | output all results to single file | `tbl --output-file /path/to/file.parquet` | 137 | | Inplace | modify each file inplace | `tbl --inplace` | 138 | | New Directory | create equivalent files in a new directory | `tbl --output-dir /path/to/dir` | 139 | | Interactive | load dataframe in interactive python session | `tbl --df` | 140 | | Stdout | output data to stdout | `tbl` (default behavior) | 141 | 142 | See full list of output options [below](#tbl). 143 | 144 | ## API Reference 145 | 146 | #### `tbl` 147 | ##### Output of `tbl -h`: 148 | 149 | ```markdown 150 | tbl is a tool for reading and editing tabular data files 151 | 152 | Usage: tbl has two modes 153 | 1. Summary mode: tbl [ls | schema] [SUMMARY_OPTIONS] 154 | 2. Data mode: tbl [DATA_OPTIONS] 155 | 156 | Get help with SUMMARY_OPTIONS using tbl [ls | schema] -h 157 | 158 | Data mode is the default mode. DATA_OPTIONS are documented below 159 | 160 | Optional Subcommands: 161 | ls Display list of tabular files, similar to the cli `ls` command 162 | schema Display table representation of each schema in the selected files 163 | 164 | General Options: 165 | -h, --help display help message 166 | -V, --version display version 167 | 168 | Input Options: 169 | [PATHS]... input path(s) to use 170 | -t, --tree recursively use all files in tree as inputs 171 | 172 | Transform Options: 173 | -c, --columns ... select only these columns [alias --select] 174 | --drop ... drop column(s) 175 | --with-columns ... insert columns, syntax NAME:TYPE [alias --with] 176 | --rename ... rename column(s), syntax OLD_NAME=NEW_NAME 177 | --cast ... change column type(s), syntax COLUMN=TYPE 178 | --set ... set column values, syntax COLUMN=VALUE 179 | --nullify ... set column values to null 180 | --filter ... filter rows by values, syntax COLUMN=VALUE 181 | --sort ... sort rows, syntax COLUMN[:desc] 182 | --head keep only the first n rows [alias --limit] 183 | --tail keep only the last n rows 184 | --offset skip the first n rows of table 185 | --value-counts compute value counts of column(s) 186 | 187 | Output Options: 188 | --no-summary skip printing a summary 189 | -n, --n number of rows to print in stdout, all for all 190 | --csv output data as csv 191 | --json output data as json 192 | --jsonl output data as json lines 193 | --hex encode binary columns as hex for output 194 | --inplace modify files in place 195 | --output-file write all data to a single new file 196 | --output-dir rewrite all files into this output directory 197 | --output-prefix prefix to add to output filenames 198 | --output-postfix postfix to add to output filenames 199 | --df load as DataFrame in interactive python session 200 | --lf load as LazyFrame in interactive python session 201 | --executable python executable to use with --df or --lf 202 | --confirm confirm that files should be edited 203 | --dry dry run without editing files 204 | 205 | Output Modes: 206 | 1. output results in single file --output-file /path/to/file.parquet 207 | 2. modify each file inplace --inplace 208 | 3. copy files into a new dir --output-dir /path/to/dir 209 | 4. 
load as interactive python --df | --lf 210 | 5. output data to stdout (default behavior) 211 | ``` 212 | 213 | #### `tbl ls` 214 | ##### Output of `tbl ls -h`: 215 | 216 | ```markdown 217 | Display list of tabular files, similar to the cli `ls` command 218 | 219 | Usage: tbl ls [OPTIONS] [PATHS]... 220 | 221 | Arguments: 222 | [PATHS]... input path(s) to use 223 | 224 | Options: 225 | -t, --tree recursively list all files in tree 226 | --absolute show absolute paths instead of relative 227 | --n number of file names to print 228 | --sort sort by number of rows, files, or bytes [default: bytes] 229 | 230 | General Options: 231 | -h, --help display help message 232 | ``` 233 | 234 | #### `tbl schema` 235 | ##### Output of `tbl schema -h`: 236 | 237 | ```markdown 238 | Display table representation of each schema in the selected files 239 | 240 | Usage: tbl schema [OPTIONS] [PATHS]... 241 | 242 | Arguments: 243 | [PATHS]... input path(s) to use 244 | 245 | Options: 246 | -t, --tree recursively list all files in tree 247 | --columns columns to print 248 | --n number of schemas to print 249 | --examples show examples 250 | --absolute show absolute paths in examples 251 | --sort sort by number of rows, files, or bytes [default: bytes] 252 | 253 | General Options: 254 | -h, --help display help message 255 | ``` 256 | 257 | ## FAQ 258 | 259 | ### What is parquet? 260 | 261 | [Parquet](https://en.wikipedia.org/wiki/Apache_Parquet) is a file format for storing tabular datasets. In many cases parquet is a simpler and faster alternative to using an actual database. Parquet has become an industry standard and its ecosystem of tools is growing rapidly. 262 | 263 | ### What other parquet cli tools exist? 264 | 265 | The most common tools are [`duckdb`](https://duckdb.org/docs/api/cli/overview), [`pqrs`](https://github.com/manojkarthick/pqrs), and [`parquet-cli`](https://github.com/apache/parquet-java/blob/master/parquet-cli/README.md). 266 | 267 | ### Why use `tbl` when `duckdb` has a cli? 268 | 269 | `duckdb` is an incredible tool. We recommend checking it out, especially when you're running complex workloads. However, there are 3 reasons you might prefer `tbl` as a cli tool: 270 | 1. **CLI-Native:** Compared to `duckdb`'s SQL, `tbl` has a cli-native syntax. This makes `tbl` simpler to use with fewer keystrokes: 271 | 1. `duckdb "DESCRIBE read_parquet('test.parquet')"` vs `tbl schema test.parquet` 272 | 2. `duckdb "SELECT * FROM read_parquet('test.parquet')"` vs `tbl test.parquet` 273 | 3. `duckdb "SELECT * FROM read_parquet('test.parquet') ORDER BY col1"` vs `tbl test.parquet --sort col1` 274 | 2. **High Level vs Low Level:** Sometimes SQL can be a very low-level language. `tbl` and `polars` let you operate on a higher level of abstraction, which reduces cognitive load: 275 | 1. `duckdb`: `duckdb "SELECT col1, COUNT(col1) FROM read_parquet('test.parquet') GROUP BY col1"` 276 | 2. `tbl`: `tbl test.parquet --value-counts col1` 277 | 3. **Operational QoL:** `tbl` is built specifically for making it easy to manage large parquet archives. Features like `--tree`, `--inplace`, and multi-schema commands make life easier for archive management. 278 | 279 | ### What is the plan for `tbl`? 280 | 281 | There are a few features that we are currently exploring: 282 | 1. **S3 and cloud buckets**: ability to read and write cloud bucket parquet files using the same operations that can be performed on local files 283 | 2. 
**Re-partitioning**: ability to change how a set of parquet files are partitioned, such as changing the partition key or partition size 284 | 3. **Direct python syntax**: ability to directly use python polars syntax to perform complex operations like `group_by()`, `join()`, and more 285 | 4. **Idempotent Workflows**: ability to interrupt and re-run commands arbitrarily, making migrations more robust 286 | -------------------------------------------------------------------------------- /crates/tbl-cli/src/cli/args.rs: -------------------------------------------------------------------------------- 1 | use super::subcommands::*; 2 | use crate::TblCliError; 3 | use clap::{Parser, Subcommand}; 4 | use color_print::cstr; 5 | use std::path::PathBuf; 6 | 7 | pub(crate) async fn run_cli() -> Result<(), TblCliError> { 8 | let args = Cli::parse(); 9 | 10 | if args.version { 11 | let version = env!("GIT_DESCRIPTION"); 12 | if version.is_empty() { 13 | println!(env!("CARGO_PKG_VERSION")); 14 | } else { 15 | println!("{}", version); 16 | } 17 | std::process::exit(0); 18 | } 19 | 20 | match args.command { 21 | Some(Subcommands::Ls(args)) => ls_command(args).await, 22 | Some(Subcommands::Schema(args)) => schema_command(args).await, 23 | Some(Subcommands::Schemas(args)) => schemas_command(args).await, 24 | _ => data_command(args.data_args).await, 25 | } 26 | } 27 | 28 | /// command line arguments for the tbl cli 29 | #[derive(Clone, Parser)] 30 | #[clap( 31 | author, 32 | about = cstr!("tbl is a tool for reading and editing tabular data files"), 33 | override_usage = cstr!("tbl has two modes 34 | 1. Summary mode: tbl [ls | schema] [SUMMARY_OPTIONS] 35 | 2. Data mode: tbl [DATA_OPTIONS] 36 | 37 | Get help with SUMMARY_OPTIONS using tbl [ls | schema] -h 38 | 39 | Data mode is the default mode. DATA_OPTIONS are documented below 40 | "), 41 | after_help = cstr!("Output Modes: 42 | 1. output results in single file --output-file /path/to/file.parquet 43 | 2. modify each file inplace --inplace 44 | 3. copy files into a new dir --output-dir /path/to/dir 45 | 4. load as interactive python --df | --lf 46 | 5. 
output data to stdout (default behavior)"), 47 | long_about = None, 48 | disable_help_subcommand = true, 49 | disable_help_flag = true, 50 | disable_version_flag = true, 51 | args_conflicts_with_subcommands = true, 52 | subcommand_help_heading = "Optional Subcommands", 53 | styles=crate::styles::get_styles() 54 | )] 55 | pub(crate) struct Cli { 56 | #[clap(subcommand)] 57 | pub(crate) command: Option<Subcommands>, 58 | 59 | /// display help message 60 | #[clap(short, long, verbatim_doc_comment, action = clap::ArgAction::HelpLong, help_heading = "General Options")] 61 | help: Option<bool>, 62 | 63 | /// display version 64 | #[clap( 65 | short = 'V', 66 | long, 67 | verbatim_doc_comment, 68 | help_heading = "General Options" 69 | )] 70 | version: bool, 71 | 72 | #[clap(flatten)] 73 | data_args: DataArgs, 74 | } 75 | 76 | /// subcommands of the tbl cli 77 | #[derive(Clone, Subcommand)] 78 | #[command()] 79 | pub(crate) enum Subcommands { 80 | /// Display list of tabular files, similar to the cli `ls` command 81 | Ls(LsArgs), 82 | 83 | /// Display table representation of each schema in the selected files 84 | Schema(SchemaArgs), 85 | 86 | /// Display single summary of all schemas 87 | #[command(hide = true)] 88 | Schemas(SchemasArgs), 89 | 90 | /// Load, transform, and output file data [default subcommand] 91 | #[command(hide = true)] 92 | Data, 93 | } 94 | 95 | /// Arguments for the `ls` subcommand 96 | #[derive(Clone, Parser)] 97 | pub(crate) struct LsArgs { 98 | /// display help message 99 | #[clap(short, long, action = clap::ArgAction::HelpLong, help_heading = "General Options")] 100 | help: Option<bool>, 101 | 102 | /// input path(s) to use 103 | #[clap()] 104 | pub(crate) paths: Option<Vec<PathBuf>>, 105 | 106 | /// recursively list all files in tree 107 | #[clap(short, long)] 108 | pub(crate) tree: bool, 109 | 110 | /// show absolute paths instead of relative 111 | #[clap(long)] 112 | pub(crate) absolute: bool, 113 | 114 | /// display bytes stats 115 | #[clap(long, hide = true)] 116 | pub(crate) bytes: bool, 117 | 118 | /// display stats of each schema group 119 | #[clap(long, hide = true)] 120 | pub(crate) stats: bool, 121 | 122 | /// number of file names to print 123 | #[clap(long)] 124 | pub(crate) n: Option<usize>, 125 | 126 | /// sort by number of rows, files, or bytes 127 | #[clap(long, default_value = "bytes")] 128 | pub(crate) sort: String, 129 | } 130 | 131 | /// Arguments for the `schema` subcommand 132 | #[derive(Clone, Parser)] 133 | pub(crate) struct SchemaArgs { 134 | /// display help message 135 | #[clap(short, long, action = clap::ArgAction::HelpLong, help_heading = "General Options")] 136 | help: Option<bool>, 137 | 138 | /// input path(s) to use 139 | #[clap()] 140 | pub(crate) paths: Option<Vec<PathBuf>>, 141 | 142 | /// recursively list all files in tree 143 | #[clap(short, long)] 144 | pub(crate) tree: bool, 145 | 146 | /// display bytes stats 147 | #[clap(long, hide = true)] 148 | pub(crate) bytes: bool, 149 | 150 | /// display stats of each schema group 151 | #[clap(long, hide = true)] 152 | pub(crate) stats: bool, 153 | 154 | /// columns to print 155 | #[clap(long)] 156 | pub(crate) columns: Option<Vec<String>>, 157 | 158 | /// number of schemas to print 159 | #[clap(long)] 160 | pub(crate) n: Option<usize>, 161 | 162 | /// show examples 163 | #[clap(long)] 164 | pub(crate) examples: bool, 165 | 166 | /// show absolute paths in examples 167 | #[clap(long)] 168 | pub(crate) absolute: bool, 169 | 170 | /// sort by number of rows, files, or bytes 171 | #[clap(long, default_value = "bytes")] 172 | pub(crate) sort: String, 173 | } 174 | 
175 | /// Arguments for the `schemas` subcommand 176 | #[derive(Clone, Parser)] 177 | pub(crate) struct SchemasArgs { 178 | /// display help message 179 | #[clap(short, long, action = clap::ArgAction::HelpLong, help_heading = "General Options")] 180 | help: Option<bool>, 181 | 182 | /// input path(s) to use 183 | #[clap()] 184 | pub(crate) paths: Option<Vec<PathBuf>>, 185 | 186 | /// recursively list all files in tree 187 | #[clap(short, long)] 188 | pub(crate) tree: bool, 189 | 190 | /// sort by number of rows, files, or bytes 191 | #[clap(long, default_value = "bytes")] 192 | pub(crate) sort: String, 193 | } 194 | 195 | /// Arguments for the `data` subcommand 196 | #[derive(Clone, Parser)] 197 | pub(crate) struct DataArgs { 198 | // 199 | // // input options 200 | // 201 | /// input path(s) to use 202 | #[clap( 203 | verbatim_doc_comment, 204 | help_heading = "Input Options", 205 | display_order = 1 206 | )] 207 | pub(crate) paths: Option<Vec<PathBuf>>, 208 | 209 | /// recursively use all files in tree as inputs 210 | #[clap(short, long, verbatim_doc_comment, help_heading = "Input Options")] 211 | pub(crate) tree: bool, 212 | 213 | // 214 | // // transform options 215 | // 216 | /// select only these columns [alias --select] 217 | #[clap( 218 | short, 219 | long, 220 | help = cstr!("select only these columns [alias --select]"), 221 | help_heading = "Transform Options", 222 | aliases = ["select"], 223 | value_name="COLUMN", 224 | num_args(1..) 225 | )] 226 | pub(crate) columns: Option<Vec<String>>, 227 | 228 | /// drop column(s) 229 | #[clap(short, long, help_heading = "Transform Options", num_args(1..))] 230 | pub(crate) drop: Option<Vec<String>>, 231 | 232 | /// add new columns, syntax NAME:TYPE [alias --with] 233 | #[clap( 234 | long, 235 | help = cstr!("insert columns, syntax NAME:TYPE [alias --with]"), 236 | help_heading = "Transform Options", 237 | value_name="NEW_COL", 238 | num_args(1..), 239 | aliases = ["with"] 240 | )] 241 | pub(crate) with_columns: Option<Vec<String>>, 242 | 243 | /// rename column(s), syntax OLD_NAME=NEW_NAME 244 | #[clap( 245 | short, 246 | long, 247 | help = cstr!("rename column(s), syntax OLD_NAME=NEW_NAME"), 248 | help_heading = "Transform Options", 249 | num_args(1..) 250 | )] 251 | pub(crate) rename: Option<Vec<String>>, 252 | 253 | /// change column type(s), syntax COLUMN=TYPE 254 | #[clap( 255 | long, 256 | help = cstr!("change column type(s), syntax COLUMN=TYPE"), 257 | help_heading = "Transform Options", 258 | num_args(1..) 259 | )] 260 | pub(crate) cast: Option<Vec<String>>, 261 | 262 | /// set column values 263 | #[clap( 264 | long, 265 | help = cstr!("set column value, syntax COLUMN=VALUE"), 266 | help_heading = "Transform Options", 267 | value_name="COLUMN", 268 | num_args(1..) 269 | )] 270 | pub(crate) set: Option<Vec<String>>, 271 | 272 | /// set column values to null 273 | #[clap( 274 | long, 275 | help_heading = "Transform Options", 276 | value_name="COLUMN", 277 | num_args(1..) 278 | )] 279 | pub(crate) nullify: Option<Vec<String>>, 280 | 281 | /// replace values of a column 282 | #[clap( 283 | long, 284 | help = cstr!("replace values, syntax COLUMN.OLD_VALUE=NEW_VALUE"), 285 | help_heading = "Transform Options", 286 | value_name="VALUE", 287 | num_args(1..) 288 | )] 289 | pub(crate) replace: Option<Vec<String>>, 290 | 291 | /// filter rows by values, syntax COLUMN=VALUE 292 | #[clap( 293 | short, 294 | long, 295 | help = cstr!("filter rows by values, syntax COLUMN=VALUE 296 | or COLUMN.is_null or COLUMN.is_not_null"), 297 | help_heading = "Transform Options", 298 | num_args(1..) 
299 | )] 300 | pub(crate) filter: Option<Vec<String>>, 301 | 302 | /// sort rows, syntax COLUMN[:desc] 303 | #[clap( 304 | short, 305 | long, 306 | help = cstr!("sort rows, syntax COLUMN[:desc]"), 307 | help_heading = "Transform Options", 308 | num_args(1..) 309 | )] 310 | pub(crate) sort: Option<Vec<String>>, 311 | 312 | /// keep only the first n rows [alias --limit] 313 | #[clap( 314 | long, 315 | help = cstr!("keep only the first n rows [alias --limit]"), 316 | help_heading = "Transform Options", 317 | aliases = ["limit"] 318 | )] 319 | pub(crate) head: Option<usize>, 320 | 321 | /// keep only the last n rows 322 | #[clap(long, help_heading = "Transform Options")] 323 | pub(crate) tail: Option<usize>, 324 | 325 | /// skip the first n rows of table 326 | #[clap(long, help_heading = "Transform Options")] 327 | pub(crate) offset: Option<usize>, 328 | 329 | /// compute value counts of column(s) 330 | #[clap(long, help_heading = "Transform Options", value_name = "COLUMN")] 331 | pub(crate) value_counts: Option<String>, 332 | 333 | // 334 | // // output options 335 | // 336 | /// skip printing a summary 337 | #[clap(long, help_heading = "Output Options")] 338 | pub(crate) no_summary: bool, 339 | 340 | /// number of rows to print in stdout, all for all 341 | #[clap( 342 | short, 343 | long, 344 | help = cstr!("number of rows to print in stdout, all for all"), 345 | help_heading = "Output Options" 346 | )] 347 | pub(crate) n: Option<String>, 348 | 349 | /// output data as csv 350 | #[clap(long, help_heading = "Output Options")] 351 | pub(crate) csv: bool, 352 | 353 | /// output data as json 354 | #[clap(long, help_heading = "Output Options")] 355 | pub(crate) json: bool, 356 | 357 | /// output data as json lines 358 | #[clap(long, help_heading = "Output Options")] 359 | pub(crate) jsonl: bool, 360 | 361 | /// encode binary columns as hex for output 362 | #[clap(long, help_heading = "Output Options")] 363 | pub(crate) hex: bool, 364 | 365 | /// modify files in place 366 | #[clap(long, help_heading = "Output Options")] 367 | pub(crate) inplace: bool, 368 | 369 | /// write all data to a single new file 370 | #[clap(long, help_heading = "Output Options", value_name = "FILE_PATH")] 371 | pub(crate) output_file: Option<PathBuf>, 372 | 373 | /// rewrite all files into this output directory 374 | #[clap(long, help_heading = "Output Options", value_name = "DIR_PATH")] 375 | pub(crate) output_dir: Option<PathBuf>, 376 | 377 | /// prefix to add to output filenames 378 | #[clap(long, help_heading = "Output Options", value_name = "PRE-FIX")] 379 | pub(crate) output_prefix: Option<String>, 380 | 381 | /// postfix to add to output filenames 382 | #[clap(long, help_heading = "Output Options", value_name = "POST-FIX")] 383 | pub(crate) output_postfix: Option<String>, 384 | 385 | /// partition output over this column 386 | #[clap( 387 | long, 388 | help_heading = "Output Options", 389 | value_name = "COLUMN", 390 | hide = true 391 | )] 392 | pub(crate) partition: Option<String>, 393 | 394 | /// partition mode, by range of values per partition 395 | #[clap( 396 | long, 397 | help_heading = "Output Options", 398 | value_name = "SIZE", 399 | hide = true 400 | )] 401 | pub(crate) partition_by_value: Option<String>, 402 | 403 | /// partition mode, by max bytes per partition 404 | #[clap( 405 | long, 406 | help_heading = "Output Options", 407 | value_name = "BYTES", 408 | hide = true 409 | )] 410 | pub(crate) partition_by_bytes: Option<u64>, 411 | 412 | /// partition mode, by max rows per partition 413 | #[clap( 414 | long, 415 | help_heading = "Output Options", 416 | value_name = "ROWS", 417 | hide = true 418 | )] 419 | pub(crate) 
420 | 
421 |     /// load as DataFrame in interactive python session
422 |     #[clap(long, help_heading = "Output Options")]
423 |     pub(crate) df: bool,
424 | 
425 |     /// load as LazyFrame in interactive python session
426 |     #[clap(long, help_heading = "Output Options")]
427 |     pub(crate) lf: bool,
428 | 
429 |     /// python executable to use with --df or --lf
430 |     #[clap(
431 |         long,
432 |         help = cstr!("python executable to use with --df or --lf"),
433 |         help_heading = "Output Options"
434 |     )]
435 |     pub(crate) executable: Option<String>,
436 | 
437 |     /// confirm that files should be edited
438 |     #[clap(long, help_heading = "Output Options")]
439 |     pub(crate) confirm: bool,
440 | 
441 |     /// dry run without editing files
442 |     #[clap(long, help_heading = "Output Options")]
443 |     pub(crate) dry: bool,
444 | }
445 | 
--------------------------------------------------------------------------------
/crates/tbl-cli/src/transform.rs:
--------------------------------------------------------------------------------
1 | use crate::{DataArgs, TblCliError};
2 | use polars::prelude::*;
3 | use std::str::FromStr;
4 | 
5 | pub(crate) fn apply_transformations(
6 |     lf: LazyFrame,
7 |     args: &DataArgs,
8 | ) -> Result<LazyFrame, TblCliError> {
9 |     let lf = apply_with_columns(lf, args.with_columns.as_deref())?;
10 |     let lf = apply_filter(lf, args.filter.as_deref())?;
11 |     let lf = apply_drop(lf, args.drop.as_deref())?;
12 |     let lf = apply_cast(lf, args.cast.as_deref())?;
13 |     let lf = apply_set(lf, args.set.as_deref())?;
14 |     let lf = apply_nullify(lf, args.nullify.as_deref())?;
15 |     let lf = apply_replace(lf, args.replace.as_deref())?;
16 |     let lf = apply_select(lf, args.columns.as_deref())?;
17 |     let lf = apply_offset(lf, args.offset)?;
18 |     let lf = apply_head(lf, args.head)?;
19 |     let lf = apply_tail(lf, args.tail)?;
20 |     let lf = apply_value_counts(lf, args.value_counts.as_deref())?;
21 |     let lf = apply_sort(lf, args.sort.as_deref())?;
22 |     let lf = apply_rename(lf, args.rename.as_deref())?;
23 |     Ok(lf)
24 | }
25 | 
26 | pub(crate) fn apply_with_columns(
27 |     lf: LazyFrame,
28 |     columns: Option<&[String]>,
29 | ) -> Result<LazyFrame, TblCliError> {
30 |     match columns {
31 |         None => Ok(lf),
32 |         Some(columns) => {
33 |             let mut new_lf = lf;
34 |             for col_spec in columns {
35 |                 new_lf = new_lf.with_column(parse_new_column_expr(col_spec)?);
36 |             }
37 |             Ok(new_lf)
38 |         }
39 |     }
40 | }
41 | 
42 | fn parse_new_column_expr(col_spec: &str) -> Result<Expr, TblCliError> {
43 |     let parts: Vec<&str> = col_spec.splitn(3, ':').collect();
44 |     if parts.len() < 2 || parts.len() > 3 {
45 |         return Err(TblCliError::Error(
46 |             "invalid format for with_column".to_string(),
47 |         ));
48 |     }
49 |     let (name, type_str) = (parts[0], parts[1]);
50 |     let value_str = parts.get(2).and_then(|s| s.split('=').nth(1));
51 |     let dtype = parse_dtype(type_str)?;
52 |     let expr = if let Some(value) = value_str {
53 |         create_value_expr(value, &dtype)?
54 |     } else {
55 |         lit(NULL).cast(dtype)
56 |     };
57 |     let expr = expr.alias(name);
58 |     Ok(expr)
59 | }
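
For concreteness, a sketch of what the `NAME:TYPE[:...=VALUE]` spec above produces; the specs here are invented for illustration, not taken from the repo:

    // "age:i64"            -> lit(NULL).cast(DataType::Int64).alias("age")   (all-null column)
    // "score:f64:init=1.5" -> lit(1.5_f64).alias("score")                    (constant column)
    // the optional third part is split on '=' and the text after the first
    // '=' is handed to create_value_expr below
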
60 | 
61 | fn parse_dtype(type_str: &str) -> Result<DataType, TblCliError> {
62 |     match type_str.to_lowercase().as_str() {
63 |         "i8" => Ok(DataType::Int8),
64 |         "i16" => Ok(DataType::Int16),
65 |         "i32" => Ok(DataType::Int32),
66 |         "i64" => Ok(DataType::Int64),
67 |         "u8" => Ok(DataType::UInt8),
68 |         "u16" => Ok(DataType::UInt16),
69 |         "u32" => Ok(DataType::UInt32),
70 |         "u64" => Ok(DataType::UInt64),
71 |         "f32" => Ok(DataType::Float32),
72 |         "f64" => Ok(DataType::Float64),
73 |         "bool" => Ok(DataType::Boolean),
74 |         "str" => Ok(DataType::String),
75 |         "date" => Ok(DataType::Date),
76 |         "datetime" => Ok(DataType::Datetime(TimeUnit::Microseconds, None)),
77 |         _ => Err(TblCliError::Error("invalid data type".to_string())),
78 |     }
79 | }
80 | 
81 | fn create_value_expr(value: &str, dtype: &DataType) -> Result<Expr, TblCliError> {
82 |     match dtype {
83 |         DataType::Int8 => Ok(lit(
84 |             i8::from_str(value).map_err(|_| TblCliError::Error(value.to_string()))?,
85 |         )),
86 |         DataType::Int16 => Ok(lit(
87 |             i16::from_str(value).map_err(|_| TblCliError::Error(value.to_string()))?,
88 |         )),
89 |         DataType::Int32 => Ok(lit(
90 |             i32::from_str(value).map_err(|_| TblCliError::Error(value.to_string()))?,
91 |         )),
92 |         DataType::Int64 => Ok(lit(
93 |             i64::from_str(value).map_err(|_| TblCliError::Error(value.to_string()))?,
94 |         )),
95 |         DataType::UInt8 => Ok(lit(
96 |             u8::from_str(value).map_err(|_| TblCliError::Error(value.to_string()))?,
97 |         )),
98 |         DataType::UInt16 => Ok(lit(
99 |             u16::from_str(value).map_err(|_| TblCliError::Error(value.to_string()))?,
100 |         )),
101 |         DataType::UInt32 => Ok(lit(
102 |             u32::from_str(value).map_err(|_| TblCliError::Error(value.to_string()))?,
103 |         )),
104 |         DataType::UInt64 => Ok(lit(
105 |             u64::from_str(value).map_err(|_| TblCliError::Error(value.to_string()))?,
106 |         )),
107 |         DataType::Float32 => Ok(lit(
108 |             f32::from_str(value).map_err(|_| TblCliError::Error(value.to_string()))?,
109 |         )),
110 |         DataType::Float64 => Ok(lit(
111 |             f64::from_str(value).map_err(|_| TblCliError::Error(value.to_string()))?,
112 |         )),
113 |         DataType::Boolean => Ok(lit(
114 |             bool::from_str(value).map_err(|_| TblCliError::Error(value.to_string()))?,
115 |         )),
116 |         DataType::String => Ok(lit(value.to_string())),
117 |         DataType::Date => {
118 |             let naive_date =
119 |                 chrono::NaiveDate::parse_from_str(value, "%Y-%m-%d").map_err(|_| {
120 |                     TblCliError::Error("Invalid date format. Use YYYY-MM-DD".to_string())
121 |                 })?;
122 |             Ok(lit(naive_date
123 |                 .and_hms_opt(0, 0, 0)
124 |                 .ok_or_else(|| TblCliError::Error("Failed to create NaiveDateTime".to_string()))?
125 |                 .and_utc()
126 |                 .timestamp_millis()))
127 |         }
128 |         DataType::Datetime(_, _) => {
129 |             let naive_datetime = chrono::NaiveDateTime::parse_from_str(value, "%Y-%m-%d %H:%M:%S")
130 |                 .map_err(|_| {
131 |                     TblCliError::Error("Invalid datetime format. Use YYYY-MM-DD HH:MM:SS".to_string())
132 |                 })?;
133 |             Ok(lit(naive_datetime.and_utc().timestamp_millis()))
134 |         }
135 |         _ => Err(TblCliError::Error("Unsupported dtype".to_string())),
136 |     }
137 | }
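
Worth noting: the Date and Datetime arms do not construct date literals directly; they parse with chrono and emit an i64 millisecond-epoch literal, which callers then cast to the target dtype. A quick check of the arithmetic, with an invented date:

    // create_value_expr("2024-01-02", &DataType::Date)
    //   -> lit(1_704_153_600_000_i64)   // ms for 2024-01-02 00:00:00 UTC
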
138 | 
139 | pub(crate) fn apply_filter(
140 |     lf: LazyFrame,
141 |     filters: Option<&[String]>,
142 | ) -> Result<LazyFrame, TblCliError> {
143 |     let schema = lf
144 |         .clone()
145 |         .schema()
146 |         .map_err(|e| TblCliError::Error(e.to_string()))?;
147 | 
148 |     match filters {
149 |         None => Ok(lf),
150 |         Some(filters) => {
151 |             let mut new_lf = lf;
152 |             for filter in filters {
153 |                 new_lf = apply_single_filter(new_lf, filter, &schema)?;
154 |             }
155 |             Ok(new_lf)
156 |         }
157 |     }
158 | }
159 | 
160 | fn apply_single_filter(
161 |     lf: LazyFrame,
162 |     filter: &str,
163 |     schema: &Schema,
164 | ) -> Result<LazyFrame, TblCliError> {
165 |     if filter.contains("!=") {
166 |         apply_comparison_filter(lf, filter, schema, "!=")
167 |     } else if filter.contains(">=") {
168 |         apply_comparison_filter(lf, filter, schema, ">=")
169 |     } else if filter.contains("<=") {
170 |         apply_comparison_filter(lf, filter, schema, "<=")
171 |     } else if filter.contains('=') {
172 |         apply_comparison_filter(lf, filter, schema, "=")
173 |     } else if filter.contains('>') {
174 |         apply_comparison_filter(lf, filter, schema, ">")
175 |     } else if filter.contains('<') {
176 |         apply_comparison_filter(lf, filter, schema, "<")
177 |     } else if filter.ends_with(".is_null") {
178 |         apply_null_filter(lf, filter, schema, true)
179 |     } else if filter.ends_with(".is_not_null") {
180 |         apply_null_filter(lf, filter, schema, false)
181 |     } else {
182 |         Err(TblCliError::Error("Invalid filter format".to_string()))
183 |     }
184 | }
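
The probe order above is load-bearing: `!=`, `>=`, and `<=` are tested before the bare `=`, `>`, and `<` they contain, so a filter such as `n>=5` cannot be misread as a plain `>` comparison. A few hypothetical filter strings and how they dispatch:

    // "price>=100"   -> apply_comparison_filter(.., ">=") -> col("price").gt_eq(lit(100))
    // "name=alice"   -> apply_comparison_filter(.., "=")  -> col("name").eq(lit("alice"))
    // "note.is_null" -> apply_null_filter(.., true)       -> col("note").is_null()
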
185 | 
186 | fn apply_comparison_filter(
187 |     lf: LazyFrame,
188 |     filter: &str,
189 |     schema: &Schema,
190 |     operator: &str,
191 | ) -> Result<LazyFrame, TblCliError> {
192 |     let parts: Vec<&str> = if operator == "=" {
193 |         filter.split('=').collect()
194 |     } else if operator == "!=" {
195 |         filter.split("!=").collect()
196 |     } else if operator == ">" {
197 |         filter.split('>').collect()
198 |     } else if operator == "<" {
199 |         filter.split('<').collect()
200 |     } else if operator == ">=" {
201 |         filter.split(">=").collect()
202 |     } else if operator == "<=" {
203 |         filter.split("<=").collect()
204 |     } else {
205 |         return Err(TblCliError::Error(format!(
206 |             "Invalid filter operator: {}",
207 |             operator
208 |         )));
209 |     };
210 | 
211 |     if parts.len() != 2 {
212 |         return Err(TblCliError::Error("Invalid filter format".to_string()));
213 |     }
214 | 
215 |     let (column, value) = (parts[0], parts[1]);
216 |     let column_type = schema
217 |         .get(column)
218 |         .ok_or_else(|| TblCliError::Error(format!("Column '{}' not found", column)))?;
219 | 
220 |     let filter_expr = match column_type {
221 |         DataType::Binary => {
222 |             if let Some(hex_value) = value.strip_prefix("0x") {
223 |                 let binary_value = hex::decode(hex_value)
224 |                     .map_err(|e| TblCliError::Error(format!("Invalid hex value: {}", e)))?;
225 |                 if operator == "=" {
226 |                     col(column).eq(lit(binary_value))
227 |                 } else if operator == "!=" {
228 |                     col(column).neq(lit(binary_value))
229 |                 } else if operator == ">" {
230 |                     col(column).gt(lit(binary_value))
231 |                 } else if operator == "<" {
232 |                     col(column).lt(lit(binary_value))
233 |                 } else if operator == ">=" {
234 |                     col(column).gt_eq(lit(binary_value))
235 |                 } else if operator == "<=" {
236 |                     col(column).lt_eq(lit(binary_value))
237 |                 } else {
238 |                     return Err(TblCliError::Error(format!(
239 |                         "Invalid filter operator: {}",
240 |                         operator
241 |                     )));
242 |                 }
243 |             } else {
244 |                 return Err(TblCliError::Error(
245 |                     "Binary value must start with 0x".to_string(),
246 |                 ));
247 |             }
248 |         }
249 |         DataType::String => {
250 |             if operator == "=" {
251 |                 col(column).eq(lit(value))
252 |             } else if operator == "!=" {
253 |                 col(column).neq(lit(value))
254 |             } else if operator == ">" {
255 |                 col(column).gt(lit(value))
256 |             } else if operator == "<" {
257 |                 col(column).lt(lit(value))
258 |             } else if operator == ">=" {
259 |                 col(column).gt_eq(lit(value))
260 |             } else if operator == "<=" {
261 |                 col(column).lt_eq(lit(value))
262 |             } else {
263 |                 return Err(TblCliError::Error(format!(
264 |                     "Invalid filter operator: {}",
265 |                     operator
266 |                 )));
267 |             }
268 |         }
269 |         DataType::UInt64 | DataType::Int64 | DataType::UInt32 | DataType::Int32 => {
270 |             let int_value = if let Some(hex_value) = value.strip_prefix("0x") {
271 |                 i64::from_str_radix(hex_value, 16)
272 |                     .map_err(|e| TblCliError::Error(format!("Invalid hex integer: {}", e)))?
273 |             } else {
274 |                 value
275 |                     .parse::<i64>()
276 |                     .map_err(|e| TblCliError::Error(format!("Invalid integer: {}", e)))?
277 |             };
278 |             if operator == "=" {
279 |                 col(column).eq(lit(int_value))
280 |             } else if operator == "!=" {
281 |                 col(column).neq(lit(int_value))
282 |             } else if operator == ">" {
283 |                 col(column).gt(lit(int_value))
284 |             } else if operator == "<" {
285 |                 col(column).lt(lit(int_value))
286 |             } else if operator == ">=" {
287 |                 col(column).gt_eq(lit(int_value))
288 |             } else if operator == "<=" {
289 |                 col(column).lt_eq(lit(int_value))
290 |             } else {
291 |                 return Err(TblCliError::Error(format!(
292 |                     "Invalid filter operator: {}",
293 |                     operator
294 |                 )));
295 |             }
296 |         }
297 |         _ => {
298 |             return Err(TblCliError::Error(format!(
299 |                 "Unsupported column type for '{}': {:?}",
300 |                 column, column_type
301 |             )))
302 |         }
303 |     };
304 | 
305 |     Ok(lf.filter(filter_expr))
306 | }
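
Hex input is accepted in two places above: Binary columns require a 0x prefix, and integer columns accept one as a convenience. With invented column names:

    // Binary column: "hash=0xdeadbeef" -> col("hash").eq(lit(vec![0xde, 0xad, 0xbe, 0xef]))
    // Int64 column:  "block>=0xff"     -> col("block").gt_eq(lit(255_i64))
    // Binary column: "hash=deadbeef"   -> Err("Binary value must start with 0x")
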
307 | 
308 | fn apply_null_filter(
309 |     lf: LazyFrame,
310 |     filter: &str,
311 |     schema: &Schema,
312 |     is_null: bool,
313 | ) -> Result<LazyFrame, TblCliError> {
314 |     let column = filter.trim_end_matches(if is_null { ".is_null" } else { ".is_not_null" });
315 | 
316 |     if schema.get(column).is_none() {
317 |         return Err(TblCliError::Error(format!("Column '{}' not found", column)));
318 |     }
319 | 
320 |     let filter_expr = if is_null {
321 |         col(column).is_null()
322 |     } else {
323 |         col(column).is_not_null()
324 |     };
325 | 
326 |     Ok(lf.filter(filter_expr))
327 | }
328 | 
329 | pub(crate) fn apply_rename(
330 |     lf: LazyFrame,
331 |     rename: Option<&[String]>,
332 | ) -> Result<LazyFrame, TblCliError> {
333 |     match rename {
334 |         None => Ok(lf),
335 |         Some(rename) => {
336 |             let (existing, new): (Vec<String>, Vec<String>) =
337 |                 rename
338 |                     .iter()
339 |                     .try_fold((Vec::new(), Vec::new()), |(mut old, mut new), r| {
340 |                         let parts: Vec<&str> = r.split('=').collect();
341 |                         if parts.len() != 2 {
342 |                             return Err(TblCliError::Error("Invalid rename format".to_string()));
343 |                         }
344 |                         old.push(parts[0].to_string());
345 |                         new.push(parts[1].to_string());
346 |                         Ok((old, new))
347 |                     })?;
348 | 
349 |             Ok(lf.rename(existing, new))
350 |         }
351 |     }
352 | }
353 | 
354 | pub(crate) fn apply_drop(
355 |     lf: LazyFrame,
356 |     columns: Option<&[String]>,
357 | ) -> Result<LazyFrame, TblCliError> {
358 |     match columns {
359 |         None => Ok(lf),
360 |         Some(columns) => Ok(lf.drop(columns)),
361 |     }
362 | }
363 | 
364 | pub(crate) fn apply_cast(lf: LazyFrame, cast: Option<&[String]>) -> Result<LazyFrame, TblCliError> {
365 |     match cast {
366 |         None => Ok(lf),
367 |         Some(cast) => {
368 |             let mut new_lf = lf;
369 |             for c in cast {
370 |                 let parts: Vec<&str> = c.split('=').collect();
371 |                 if parts.len() != 2 {
372 |                     return Err(TblCliError::Error("Invalid cast format".to_string()));
373 |                 }
374 |                 let (column, dtype_str) = (parts[0], parts[1]);
375 |                 let dtype = parse_dtype(dtype_str)?;
376 |                 new_lf = new_lf.with_column(col(column).cast(dtype));
377 |             }
378 |             Ok(new_lf)
379 |         }
380 |     }
381 | }
382 | 
383 | pub(crate) fn apply_set(lf: LazyFrame, set: Option<&[String]>) -> Result<LazyFrame, TblCliError> {
384 |     match set {
385 |         None => Ok(lf),
386 |         Some(set) => {
387 |             let mut new_lf = lf;
388 |             let schema = new_lf
389 |                 .schema()
390 |                 .map_err(|e| TblCliError::Error(e.to_string()))?;
391 | 
392 |             for s in set {
393 |                 let parts: Vec<&str> = s.split('=').collect();
394 |                 if parts.len() != 2 {
395 |                     return Err(TblCliError::Error("Invalid set format".to_string()));
396 |                 }
397 |                 let (column, value) = (parts[0], parts[1]);
398 | 
399 |                 let column_type = schema
400 |                     .get(column)
401 |                     .ok_or_else(|| TblCliError::Error(format!("Column '{}' not found", column)))?;
402 | 
403 |                 let set_expr = raw_str_to_lit(column, value, column_type)?;
404 |                 new_lf = new_lf.with_column(set_expr.cast(column_type.clone()));
405 |             }
406 |             Ok(new_lf)
407 |         }
408 |     }
409 | }
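
Because apply_set resolves each column's existing dtype from the schema before parsing the right-hand side, the same `COLUMN=VALUE` spelling works for any supported type; a sketch against a hypothetical schema:

    // schema: {status: String, retries: UInt8}
    // "status=done" -> with_column(lit("done").alias("status").cast(DataType::String))
    // "retries=3"   -> with_column(lit(3_u8).alias("retries").cast(DataType::UInt8))
    // "retries=300" -> Err("Invalid u8 value: 300")   // u8::from_str fails
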
410 | 
411 | fn raw_str_to_lit(column: &str, value: &str, dtype: &DataType) -> Result<Expr, TblCliError> {
412 |     let lit_value = match dtype {
413 |         DataType::Int8 => lit(i8::from_str(value)
414 |             .map_err(|_| TblCliError::Error(format!("Invalid i8 value: {}", value)))?),
415 |         DataType::Int16 => lit(i16::from_str(value)
416 |             .map_err(|_| TblCliError::Error(format!("Invalid i16 value: {}", value)))?),
417 |         DataType::Int32 => lit(i32::from_str(value)
418 |             .map_err(|_| TblCliError::Error(format!("Invalid i32 value: {}", value)))?),
419 |         DataType::Int64 => lit(i64::from_str(value)
420 |             .map_err(|_| TblCliError::Error(format!("Invalid i64 value: {}", value)))?),
421 |         DataType::UInt8 => lit(u8::from_str(value)
422 |             .map_err(|_| TblCliError::Error(format!("Invalid u8 value: {}", value)))?),
423 |         DataType::UInt16 => lit(u16::from_str(value)
424 |             .map_err(|_| TblCliError::Error(format!("Invalid u16 value: {}", value)))?),
425 |         DataType::UInt32 => lit(u32::from_str(value)
426 |             .map_err(|_| TblCliError::Error(format!("Invalid u32 value: {}", value)))?),
427 |         DataType::UInt64 => lit(u64::from_str(value)
428 |             .map_err(|_| TblCliError::Error(format!("Invalid u64 value: {}", value)))?),
429 |         DataType::Float32 => lit(f32::from_str(value)
430 |             .map_err(|_| TblCliError::Error(format!("Invalid f32 value: {}", value)))?),
431 |         DataType::Float64 => lit(f64::from_str(value)
432 |             .map_err(|_| TblCliError::Error(format!("Invalid f64 value: {}", value)))?),
433 |         DataType::Boolean => lit(bool::from_str(value)
434 |             .map_err(|_| TblCliError::Error(format!("Invalid boolean value: {}", value)))?),
435 |         DataType::String => lit(value.to_string()),
436 |         DataType::Date => {
437 |             let naive_date =
438 |                 chrono::NaiveDate::parse_from_str(value, "%Y-%m-%d").map_err(|_| {
439 |                     TblCliError::Error("Invalid date format. Use YYYY-MM-DD".to_string())
440 |                 })?;
441 |             lit(naive_date
442 |                 .and_hms_opt(0, 0, 0)
443 |                 .ok_or_else(|| TblCliError::Error("Failed to create NaiveDateTime".to_string()))?
444 |                 .and_utc()
445 |                 .timestamp_millis())
446 |         }
447 |         DataType::Datetime(_, _) => {
448 |             let naive_datetime = chrono::NaiveDateTime::parse_from_str(value, "%Y-%m-%d %H:%M:%S")
449 |                 .map_err(|_| {
450 |                     TblCliError::Error(
451 |                         "Invalid datetime format. Use YYYY-MM-DD HH:MM:SS".to_string(),
452 |                     )
453 |                 })?;
454 |             lit(naive_datetime.and_utc().timestamp_millis())
455 |         }
456 |         DataType::Binary => {
457 |             if let Some(hex_value) = value.strip_prefix("0x") {
458 |                 let binary_value = hex::decode(hex_value)
459 |                     .map_err(|e| TblCliError::Error(format!("Invalid hex value: {}", e)))?;
460 |                 lit(binary_value)
461 |             } else {
462 |                 return Err(TblCliError::Error(
463 |                     "Binary value must start with 0x".to_string(),
464 |                 ));
465 |             }
466 |         }
467 |         _ => {
468 |             return Err(TblCliError::Error(format!(
469 |                 "Unsupported column type for '{}': {:?}",
470 |                 column, dtype
471 |             )))
472 |         }
473 |     };
474 | 
475 |     Ok(lit_value.alias(column))
476 | }
477 | 
478 | pub(crate) fn apply_nullify(
479 |     lf: LazyFrame,
480 |     raw_columns: Option<&[String]>,
481 | ) -> Result<LazyFrame, TblCliError> {
482 |     match raw_columns {
483 |         None => Ok(lf),
484 |         Some(columns) => {
485 |             let mut new_lf = lf;
486 |             let schema = new_lf
487 |                 .schema()
488 |                 .map_err(|e| TblCliError::Error(e.to_string()))?;
489 | 
490 |             for column in columns.iter() {
491 |                 let column_type = schema
492 |                     .get(column)
493 |                     .ok_or_else(|| TblCliError::Error(format!("Column '{}' not found", column)))?;
494 |                 new_lf = new_lf.with_column(
495 |                     lit(LiteralValue::Null)
496 |                         .cast(column_type.clone())
497 |                         .alias(column),
498 |                 );
499 |             }
500 |             Ok(new_lf)
501 |         }
502 |     }
503 | }
504 | 
505 | pub(crate) fn apply_replace(
506 |     lf: LazyFrame,
507 |     raw_values: Option<&[String]>,
508 | ) -> Result<LazyFrame, TblCliError> {
509 |     match raw_values {
510 |         None => Ok(lf),
511 |         Some(values) => {
512 |             let mut new_lf = lf;
513 |             let schema = new_lf
514 |                 .schema()
515 |                 .map_err(|e| TblCliError::Error(e.to_string()))?;
516 | 
517 |             for value in values.iter() {
518 |                 // get column
519 |                 let parts: Vec<&str> = value.split('.').collect();
520 |                 if parts.len() != 2 {
521 |                     return Err(TblCliError::Error("Invalid replace format".to_string()));
522 |                 }
523 |                 let (column, before_after) = (parts[0], parts[1]);
524 | 
525 |                 // get old_value / new_value
526 |                 let parts: Vec<&str> = before_after.split('=').collect();
527 |                 if parts.len() != 2 {
528 |                     return Err(TblCliError::Error("Invalid replace format".to_string()));
529 |                 }
530 |                 let (old_value, new_value) = (parts[0], parts[1]);
531 | 
532 |                 let column_type = schema
533 |                     .get(column)
534 |                     .ok_or_else(|| TblCliError::Error(format!("Column '{}' not found", column)))?;
535 | 
536 |                 let old_expr = raw_str_to_lit(column, old_value, column_type)?;
537 |                 let new_expr = raw_str_to_lit(column, new_value, column_type)?;
538 |                 new_lf = new_lf.with_column(col(column).replace(old_expr, new_expr));
539 |             }
540 |             Ok(new_lf)
541 |         }
542 |     }
543 | }
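
The replace spec is parsed in two stages, `COLUMN.OLD=NEW`: first a split on '.', then a split on '='. Each split must yield exactly two parts, so column names containing a literal '.' (or values containing '=') will not parse; for example (invented spec):

    // "status.old=new" -> col("status").replace(lit("old").alias("status"),
    //                                           lit("new").alias("status"))
    // "a.b.c=x"        -> Err("Invalid replace format")   // split('.') gives 3 parts
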
544 | 
545 | pub(crate) fn apply_sort(
546 |     lf: LazyFrame,
547 |     raw_columns: Option<&[String]>,
548 | ) -> Result<LazyFrame, TblCliError> {
549 |     match raw_columns {
550 |         None => Ok(lf),
551 |         Some(raw_columns) => {
552 |             let mut columns: Vec<String> = Vec::new();
553 |             let mut descending: Vec<bool> = Vec::new();
554 |             for column in raw_columns.iter() {
555 |                 let column = column.to_string();
556 |                 if column.ends_with(":desc") {
557 |                     columns.push(column[..column.len() - 5].to_string());
558 |                     descending.push(true);
559 |                 } else {
560 |                     columns.push(column);
561 |                     descending.push(false);
562 |                 }
563 |             }
564 |             let options = polars::chunked_array::ops::SortMultipleOptions::default()
565 |                 .with_order_descending_multi(descending);
566 |             Ok(lf.sort(columns, options))
567 |         }
568 |     }
569 | }
570 | 
571 | pub(crate) fn apply_select(
572 |     lf: LazyFrame,
573 |     columns: Option<&[String]>,
574 | ) -> Result<LazyFrame, TblCliError> {
575 |     match columns {
576 |         None => Ok(lf),
577 |         Some(columns) => {
578 |             let exprs: Vec<Expr> = columns.iter().map(|c| col(c)).collect();
579 |             Ok(lf.select(&exprs))
580 |         }
581 |     }
582 | }
583 | 
584 | pub(crate) fn apply_head(lf: LazyFrame, n: Option<usize>) -> Result<LazyFrame, TblCliError> {
585 |     match n {
586 |         None => Ok(lf),
587 |         Some(n) => Ok(lf.slice(0, n as u32)),
588 |     }
589 | }
590 | 
591 | pub(crate) fn apply_tail(lf: LazyFrame, n: Option<usize>) -> Result<LazyFrame, TblCliError> {
592 |     match n {
593 |         None => Ok(lf),
594 |         Some(n) => Ok(lf.tail(n as u32)),
595 |     }
596 | }
597 | 
598 | pub(crate) fn apply_offset(lf: LazyFrame, n: Option<usize>) -> Result<LazyFrame, TblCliError> {
599 |     match n {
600 |         None => Ok(lf),
601 |         Some(n) => Ok(lf.slice(n as i64, u32::MAX)),
602 |     }
603 | }
604 | 
605 | pub(crate) fn apply_value_counts(lf: LazyFrame, n: Option<&str>) -> Result<LazyFrame, TblCliError> {
606 |     match n {
607 |         None => Ok(lf),
608 |         Some(column) => {
609 |             // let expr = col(column).value_counts(true, false, "count".to_string(), false);
610 |             // Ok(lf.select([expr]))
611 |             let sort_options = SortMultipleOptions::new().with_order_descending(true);
612 |             let value_counts = lf
613 |                 .group_by(&[col(column)])
614 |                 .agg(&[col(column).count().alias("count")])
615 |                 .sort(["count"], sort_options);
616 |             Ok(value_counts)
617 |         }
618 |     }
619 | }
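
Since apply_transformations (top of this file) runs offset, then head, then tail, the row-window flags compose by slicing in that order; with hypothetical row counts:

    // 100-row table:
    // --offset 10 --head 5 -> slice(10, u32::MAX) then slice(0, 5) -> rows 10..15
    // --head 5 --tail 2    -> slice(0, 5) then tail(2)             -> rows 3..5
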
620 | 
--------------------------------------------------------------------------------
/crates/tbl-core/src/filesystem/outputs.rs:
--------------------------------------------------------------------------------
1 | use crate::TblError;
2 | use std::collections::HashMap;
3 | use std::path::PathBuf;
4 | 
5 | /// output path spec
6 | #[derive(Default, Debug)]
7 | pub struct OutputPathSpec {
8 |     /// inputs
9 |     pub inputs: Option<Vec<PathBuf>>,
10 |     /// output_dir
11 |     pub output_dir: Option<PathBuf>,
12 |     /// tree
13 |     pub tree: bool,
14 |     /// file_prefix
15 |     pub file_prefix: Option<String>,
16 |     /// file_postfix
17 |     pub file_postfix: Option<String>,
18 |     /// sort
19 |     pub sort: bool,
20 | }
21 | 
22 | impl OutputPathSpec {
23 |     /// create new OutputPathSpec
24 |     pub fn new() -> Self {
25 |         OutputPathSpec::default()
26 |     }
27 | 
28 |     /// set inputs
29 |     pub fn inputs<I>(mut self, inputs: I) -> Self
30 |     where
31 |         I: Into<InputPaths>,
32 |     {
33 |         self.inputs = inputs.into().0;
34 |         self
35 |     }
36 | 
37 |     /// set output_dir
38 |     pub fn output_dir<T>(mut self, output_dir: T) -> Self
39 |     where
40 |         T: Into<OutputDirType>,
41 |     {
42 |         self.output_dir = output_dir.into().into();
43 |         self
44 |     }
45 | 
46 |     /// set tree
47 |     pub fn tree(mut self, tree: bool) -> Self {
48 |         self.tree = tree;
49 |         self
50 |     }
51 | 
52 |     /// set file_prefix
53 |     pub fn file_prefix<T>(mut self, file_prefix: T) -> Self
54 |     where
55 |         T: Into<Option<String>>,
56 |     {
57 |         self.file_prefix = file_prefix.into();
58 |         self
59 |     }
60 | 
61 |     /// set file_postfix
62 |     pub fn file_postfix<T>(mut self, file_postfix: T) -> Self
63 |     where
64 |         T: Into<Option<String>>,
65 |     {
66 |         self.file_postfix = file_postfix.into();
67 |         self
68 |     }
69 | 
70 |     /// set sort
71 |     pub fn sort(mut self, sort: bool) -> Self {
72 |         self.sort = sort;
73 |         self
74 |     }
75 | }
76 | 
77 | /// output dir type
78 | pub enum OutputDirType {
79 |     /// &str
80 |     Str(&'static str),
81 |     /// String
82 |     String(String),
83 |     /// PathBuf
84 |     PathBuf(PathBuf),
85 |     /// None
86 |     None,
87 | }
88 | 
89 | impl From<OutputDirType> for Option<PathBuf> {
90 |     fn from(output_dir: OutputDirType) -> Self {
91 |         match output_dir {
92 |             OutputDirType::Str(s) => Some(PathBuf::from(s)),
93 |             OutputDirType::String(s) => Some(PathBuf::from(s)),
94 |             OutputDirType::PathBuf(p) => Some(p),
95 |             OutputDirType::None => None,
96 |         }
97 |     }
98 | }
99 | 
100 | // Implement From for all the required types
101 | impl From<&'static str> for OutputDirType {
102 |     fn from(s: &'static str) -> Self {
103 |         OutputDirType::Str(s)
104 |     }
105 | }
106 | 
107 | impl From<String> for OutputDirType {
108 |     fn from(s: String) -> Self {
109 |         OutputDirType::String(s)
110 |     }
111 | }
112 | 
113 | impl From<PathBuf> for OutputDirType {
114 |     fn from(p: PathBuf) -> Self {
115 |         OutputDirType::PathBuf(p)
116 |     }
117 | }
118 | 
119 | impl<T> From<Option<T>> for OutputDirType
120 | where
121 |     T: Into<OutputDirType>,
122 | {
123 |     fn from(opt: Option<T>) -> Self {
124 |         match opt {
125 |             Some(v) => v.into(),
126 |             None => OutputDirType::None,
127 |         }
128 |     }
129 | }
130 | 
131 | // New wrapper type
132 | /// InputPaths
133 | pub struct InputPaths(Option<Vec<PathBuf>>);
134 | 
135 | impl From<Vec<PathBuf>> for InputPaths {
136 |     fn from(v: Vec<PathBuf>) -> Self {
137 |         InputPaths(Some(v))
138 |     }
139 | }
140 | 
141 | impl From<Option<Vec<PathBuf>>> for InputPaths {
142 |     fn from(v: Option<Vec<PathBuf>>) -> Self {
143 |         InputPaths(v)
144 |     }
145 | }
146 | 
147 | impl From<Vec<String>> for InputPaths {
148 |     fn from(v: Vec<String>) -> Self {
149 |         InputPaths(Some(v.into_iter().map(PathBuf::from).collect()))
150 |     }
151 | }
152 | 
153 | impl From<Option<Vec<String>>> for InputPaths {
154 |     fn from(v: Option<Vec<String>>) -> Self {
155 |         InputPaths(v.map(|strings| strings.into_iter().map(PathBuf::from).collect()))
156 |     }
157 | }
158 | 
159 | impl<'a> From<Vec<&'a str>> for InputPaths {
160 |     fn from(v: Vec<&'a str>) -> Self {
161 |         InputPaths(Some(v.into_iter().map(PathBuf::from).collect()))
162 |     }
163 | }
164 | 
165 | impl<'a> From<Option<Vec<&'a str>>> for InputPaths {
166 |     fn from(v: Option<Vec<&'a str>>) -> Self {
167 |         InputPaths(v.map(|strings| strings.into_iter().map(PathBuf::from).collect()))
168 |     }
169 | }
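
The InputPaths and OutputDirType wrappers exist so the builder accepts whatever the caller already has on hand. All of the following forms should compile against the impls above (paths invented for illustration):

    let spec = OutputPathSpec::new()
        .inputs(vec!["./root/data1", "./root/data2"])   // Vec<&str>
        .output_dir("./other_root")                     // &'static str
        .file_postfix("_v2".to_string());               // String -> Option<String>
    let _spec2 = OutputPathSpec::new()
        .inputs(Some(vec![PathBuf::from("./root")]))    // Option<Vec<PathBuf>>
        .output_dir(None::<String>);                    // Option<T> -> OutputDirType::None
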
170 | 
171 | /** get_output_paths() has many possible combinations of parameters
172 | 
173 | possible dimensions of inputs
174 | - dimension: with or without --tree
175 | - dimension: with or without --output-dir
176 | - dimension: with or without --inputs
177 | - dimension: single or multiple --inputs
178 | - dimension: relative or absolute --inputs
179 | - dimension: file or directory --inputs
180 | 
181 | cases that are easy:
182 | - without --inputs, without --tree, without --output-dir
183 |     - read from CWD, write outputs to CWD
184 | - without --inputs, without --tree, with --output-dir
185 |     - read from CWD, write outputs to --output-dir
186 | - without --inputs, with --tree, without --output-dir
187 |     - read from CWD, write each file in its own original dir
188 | - without --inputs, with --tree, with --output-dir
189 |     - read from CWD, write relative tree paths relative to --output-dir tree
190 | 
191 | cases that are harder:
192 | - with single file --inputs
193 |     - --tree doesn't matter
194 |     - without --output-dir: writes file to that file's dir
195 |     - with --output-dir: writes file to that dir
196 | - with single dir --inputs
197 |     - without --tree, without --output-dir
198 |         - read from the input dir, write to the input dir
199 |     - without --tree, with --output-dir
200 |         - read from the input dir, write to the output dir
201 |     - with --tree, without --output-dir
202 |         - use the input dir as tree root for both reading and writing
203 |     - with --tree, with --output-dir
204 |         - use input tree as reading tree root, output dir as writing tree root
205 | - with multiple --inputs
206 |     - just treat each input path independently
207 | 
208 | if --output-dir is used without --tree, every output goes directly in that directory
209 | if --output-dir is used with --tree, the --output-dir is used as the new tree root
210 | */
211 | pub fn get_output_paths(
212 |     // inputs: Option<Vec<PathBuf>>,
213 |     // output_dir: Option<PathBuf>,
214 |     // tree: bool,
215 |     output_spec: OutputPathSpec,
216 | ) -> Result<(Vec<PathBuf>, Vec<PathBuf>), TblError> {
217 |     // gather inputs
218 |     let output_dir = output_spec.output_dir;
219 |     let inputs = match output_spec.inputs {
220 |         None => vec![std::env::current_dir()?],
221 |         Some(inputs) => inputs,
222 |     };
223 | 
224 |     // process each input separately
225 |     let mut return_inputs: Vec<PathBuf> = Vec::new();
226 |     let mut return_outputs: Vec<PathBuf> = Vec::new();
227 |     for input in inputs {
228 |         let metadata = std::fs::metadata(&input)?;
229 |         if metadata.is_file() {
230 |             // case 1: input is a file
231 |             let output = super::manipulate::convert_file_path(
232 |                 &input,
233 |                 &output_dir,
234 |                 &output_spec.file_prefix,
235 |                 &output_spec.file_postfix,
236 |             )?;
237 |             return_inputs.push(input.clone());
238 |             return_outputs.push(output);
239 |         } else if metadata.is_dir() {
240 |             if !output_spec.tree {
241 |                 // case 2: input is a directory, non-tree mode
242 |                 for sub_input in super::gather::get_directory_tabular_files(&input)?.into_iter() {
243 |                     let output = super::manipulate::convert_file_path(
244 |                         &sub_input,
245 |                         &output_dir,
246 |                         &output_spec.file_prefix,
247 |                         &output_spec.file_postfix,
248 |                     )?;
249 |                     return_inputs.push(sub_input);
250 |                     return_outputs.push(output);
251 |                 }
252 |             } else {
253 |                 // case 3: input is a directory, tree mode
254 |                 for sub_input in super::gather::get_tree_tabular_files(&input)?.into_iter() {
255 |                     // use relative path of tree leaf, change root to output_dir if provided
256 |                     let new_path = if let Some(output_dir) = output_dir.clone() {
257 |                         let relative_path = sub_input.strip_prefix(&input)?.to_path_buf();
258 |                         output_dir.join(relative_path)
259 |                     } else {
260 |                         sub_input.clone()
261 |                     };
262 | 
263 |                     // change file prefix and postfix
264 |                     let output = super::manipulate::convert_file_path(
265 |                         &new_path,
266 |                         &None,
267 |                         &output_spec.file_prefix,
268 |                         &output_spec.file_postfix,
269 |                     )?;
270 | 
271 |                     return_inputs.push(sub_input.clone());
272 |                     return_outputs.push(output);
273 |                 }
274 |             }
275 |         } else {
276 |             return Err(TblError::Error("input is neither a file nor a directory".to_string()));
277 |         };
278 |     }
279 | 
280 |     let (return_inputs, return_outputs) = if output_spec.sort {
281 |         // Create a vector of paired inputs and outputs
282 |         let mut paired = return_inputs
283 |             .into_iter()
284 |             .zip(return_outputs)
285 |             .collect::<Vec<_>>();
286 | 
287 |         // Sort the paired vector based on the output paths
288 |         paired.sort_by(|a, b| a.1.cmp(&b.1));
289 | 
290 |         // Unzip the sorted paired vector back into separate input and output vectors
291 |         paired.into_iter().unzip()
292 |     } else {
293 |         (return_inputs, return_outputs)
294 |     };
295 | 
296 |     // check that all output paths are unique to avoid collisions
297 |     let mut count_per_output: HashMap<PathBuf, usize> = HashMap::new();
298 |     for output in return_outputs.iter() {
299 |         *count_per_output.entry(output.clone()).or_insert(0) += 1;
300 |         if count_per_output[output] > 1 {
301 |             return Err(TblError::Error(format!(
302 |                 "Duplicate output path: {:?}",
303 |                 output
304 |             )));
305 |         }
306 |     }
307 | 
308 |     Ok((return_inputs, return_outputs))
309 | }
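
Tying the cases together, a minimal end-to-end sketch (directory names invented): rewrite every tabular file found directly under ./root into ./staging, with deterministic ordering:

    let spec = OutputPathSpec::new()
        .inputs(vec!["./root"])
        .output_dir("./staging")
        .sort(true);
    let (inputs, outputs) = get_output_paths(spec)?;
    for (src, dst) in inputs.iter().zip(outputs.iter()) {
        println!("{} -> {}", src.display(), dst.display());
    }
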
310 | 
311 | /*
312 | tests
313 | for the tests, generate the following file tree:
314 | root/
315 |     super_data_a.parquet
316 |     super_data_b.parquet
317 |     data1/
318 |         data1_a.parquet
319 |         data1_b.parquet
320 |         sub_data1_1/
321 |             sub_data1_a.parquet
322 |             sub_data1_b.parquet
323 |     data2/
324 |         data2_a.parquet
325 |         data2_b.parquet
326 | test cases:
327 | get_output_paths(OutputPathSpec::new().inputs(vec!["./root"]))
328 | get_output_paths(OutputPathSpec::new().inputs(vec!["./root"]).tree(true))
329 | get_output_paths(OutputPathSpec::new().inputs(vec!["./root"]).output_dir("./root"))
330 | get_output_paths(OutputPathSpec::new().inputs(vec!["./root"]).output_dir("./root").tree(true))
331 | get_output_paths(OutputPathSpec::new().inputs(vec!["./root"]).output_dir("./other_root"))
332 | get_output_paths(OutputPathSpec::new().inputs(vec!["./root"]).output_dir("./other_root").tree(true))
333 | 
334 | get_output_paths(OutputPathSpec::new().inputs(vec!["./root/data1"]))
335 | get_output_paths(OutputPathSpec::new().inputs(vec!["./root/data1"]).tree(true))
336 | get_output_paths(OutputPathSpec::new().inputs(vec!["./root/data1"]).output_dir("./root"))
337 | get_output_paths(OutputPathSpec::new().inputs(vec!["./root/data1"]).output_dir("./root").tree(true))
338 | get_output_paths(OutputPathSpec::new().inputs(vec!["./root/data1"]).output_dir("./other_root"))
339 | get_output_paths(OutputPathSpec::new().inputs(vec!["./root/data1"]).output_dir("./other_root").tree(true))
340 | 
341 | get_output_paths(OutputPathSpec::new().inputs(vec!["./root/data1", "./root/data2"]))
342 | get_output_paths(OutputPathSpec::new().inputs(vec!["./root/data1", "./root/data2"]).tree(true))
343 | get_output_paths(OutputPathSpec::new().inputs(vec!["./root/data1", "./root/data2"]).output_dir("./root"))
344 | get_output_paths(OutputPathSpec::new().inputs(vec!["./root/data1", "./root/data2"]).output_dir("./root").tree(true))
345 | get_output_paths(OutputPathSpec::new().inputs(vec!["./root/data1", "./root/data2"]).output_dir("./other_root"))
346 | get_output_paths(OutputPathSpec::new().inputs(vec!["./root/data1", "./root/data2"]).output_dir("./other_root").tree(true))
347 | 
348 | get_output_paths(OutputPathSpec::new().inputs(vec!["./root/data1/data1_a.parquet", "./root/super_data_a.parquet"]))
349 | get_output_paths(OutputPathSpec::new().inputs(vec!["./root/data1/data1_a.parquet", "./root/super_data_a.parquet"]).tree(true))
350 | get_output_paths(OutputPathSpec::new().inputs(vec!["./root/data1/data1_a.parquet", "./root/super_data_a.parquet"]).output_dir("./root"))
351 | get_output_paths(OutputPathSpec::new().inputs(vec!["./root/data1/data1_a.parquet", "./root/super_data_a.parquet"]).output_dir("./root").tree(true))
352 | get_output_paths(OutputPathSpec::new().inputs(vec!["./root/data1/data1_a.parquet", "./root/super_data_a.parquet"]).output_dir("./other_root"))
353 | get_output_paths(OutputPathSpec::new().inputs(vec!["./root/data1/data1_a.parquet", "./root/super_data_a.parquet"]).output_dir("./other_root").tree(true))
354 | */
355 | #[cfg(test)]
356 | mod tests {
357 |     use super::*;
358 |     use std::fs::{self, File};
359 |     use tempfile::TempDir;
360 | 
361 |     fn create_test_file_tree() -> Result<TempDir, TblError> {
362 |         let temp_dir = TempDir::new()?;
363 |         println!("Created temporary directory: {:?}", temp_dir.path());
364 |         let root = temp_dir.path().join("root");
365 | 
366 |         fs::create_dir(&root)?;
367 |         File::create(root.join("super_data_a.parquet"))?;
368 |         File::create(root.join("super_data_b.parquet"))?;
369 | 
370 |         let data1 = root.join("data1");
371 |         fs::create_dir(&data1)?;
372 |         File::create(data1.join("data1_a.parquet"))?;
373 |         File::create(data1.join("data1_b.parquet"))?;
374 | 
375 |         let sub_data1_1 = data1.join("sub_data1_1");
376 |         fs::create_dir(&sub_data1_1)?;
377 |         File::create(sub_data1_1.join("sub_data1_a.parquet"))?;
378 |         File::create(sub_data1_1.join("sub_data1_b.parquet"))?;
379 | 
380 |         let data2 = root.join("data2");
381 |         fs::create_dir(&data2)?;
382 |         File::create(data2.join("data2_a.parquet"))?;
383 |         File::create(data2.join("data2_b.parquet"))?;
384 | 
385 |         Ok(temp_dir)
386 |     }
387 | 
388 |     struct TestCase {
389 |         name: &'static str,
390 |         spec: OutputPathSpec,
391 |         expected_outputs: Vec<&'static str>,
392 |     }
393 | 
394 |     macro_rules! generate_tests {
395 |         ($($name:ident: $value:expr,)*) => {
396 |             $(
397 |                 #[test]
398 |                 fn $name() -> Result<(), TblError> {
399 |                     let test_case: TestCase = $value;
400 |                     let mut spec = test_case.spec;
401 | 
402 |                     // Create temporary directory and add its path to inputs and output_dir
403 |                     let temp_dir = create_test_file_tree()?;
404 |                     let temp_path = temp_dir.path().to_path_buf();
405 | 
406 |                     // Update inputs with temporary directory path
407 |                     if let Some(inputs) = spec.inputs.as_ref() {
408 |                         spec.inputs = Some(inputs.iter().map(|p| temp_path.join(p)).collect());
409 |                     } else {
410 |                         spec.inputs = Some(vec![temp_path.join("root")]);
411 |                     }
412 | 
413 |                     // Update output_dir with temporary directory path if it exists
414 |                     if let Some(output_dir) = spec.output_dir.as_ref() {
415 |                         spec.output_dir = Some(temp_path.join(output_dir));
416 |                     }
417 | 
418 |                     let (_inputs, outputs) = match get_output_paths(spec) {
419 |                         Ok((inputs, outputs)) => (inputs, outputs),
420 |                         Err(e) => return Err(TblError::Error(format!("{}", e))),
421 |                     };
422 | 
423 |                     let expected_outputs: Vec<PathBuf> = test_case.expected_outputs
424 |                         .into_iter()
425 |                         .map(|p| temp_dir.path().join(p))
426 |                         .collect();
427 | 
428 |                     let mut sorted_outputs = outputs.clone();
429 |                     sorted_outputs.sort();
430 |                     let mut sorted_expected_outputs = expected_outputs.clone();
431 |                     sorted_expected_outputs.sort();
432 |                     assert_eq!(
433 |                         sorted_outputs,
434 |                         sorted_expected_outputs,
435 |                         "Test case '{}' failed.\nExpected (sorted): {:?}\nGot (sorted): {:?}",
436 |                         test_case.name,
437 |                         sorted_expected_outputs,
438 |                         sorted_outputs
439 |                     );
440 | 
441 |                     Ok(())
442 |                 }
443 |             )*
444 |         }
445 |     }
446 | 
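
Each `name: TestCase { .. },` entry handed to generate_tests! below becomes a standalone #[test] fn: it rebuilds the temp tree, rebases the spec's relative inputs and output_dir onto the temp path, runs get_output_paths, and compares sorted outputs. Roughly, one entry expands to (a sketch, not the literal expansion):

    // #[test]
    // fn test_root_input() -> Result<(), TblError> {
    //     let test_case: TestCase = /* the TestCase literal */;
    //     // create temp tree; rebase spec.inputs / spec.output_dir onto it;
    //     // run get_output_paths; sort both sides; assert_eq!
    // }
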
447 |     generate_tests! {
448 |         test_root_input: TestCase {
449 |             name: "Root input",
450 |             spec: OutputPathSpec::new().inputs(vec!["root"]),
451 |             expected_outputs: vec![
452 |                 "root/super_data_a.parquet",
453 |                 "root/super_data_b.parquet",
454 |             ],
455 |         },
456 |         test_root_input_tree: TestCase {
457 |             name: "Root input with tree",
458 |             spec: OutputPathSpec::new().inputs(vec!["root"]).tree(true),
459 |             expected_outputs: vec![
460 |                 "root/super_data_a.parquet",
461 |                 "root/super_data_b.parquet",
462 |                 "root/data1/data1_a.parquet",
463 |                 "root/data1/data1_b.parquet",
464 |                 "root/data1/sub_data1_1/sub_data1_a.parquet",
465 |                 "root/data1/sub_data1_1/sub_data1_b.parquet",
466 |                 "root/data2/data2_a.parquet",
467 |                 "root/data2/data2_b.parquet",
468 |             ],
469 |         },
470 |         test_root_input_self_output_dir: TestCase {
471 |             name: "Root input with self output dir",
472 |             spec: OutputPathSpec::new().inputs(vec!["root"]).output_dir("root"),
473 |             expected_outputs: vec![
474 |                 "root/super_data_a.parquet",
475 |                 "root/super_data_b.parquet",
476 |             ],
477 |         },
478 |         test_root_input_self_output_dir_tree: TestCase {
479 |             name: "Root input with self output dir tree",
480 |             spec: OutputPathSpec::new().inputs(vec!["root"]).output_dir("root").tree(true),
481 |             expected_outputs: vec![
482 |                 "root/super_data_a.parquet",
483 |                 "root/super_data_b.parquet",
484 |                 "root/data1/data1_a.parquet",
485 |                 "root/data1/data1_b.parquet",
486 |                 "root/data1/sub_data1_1/sub_data1_a.parquet",
487 |                 "root/data1/sub_data1_1/sub_data1_b.parquet",
488 |                 "root/data2/data2_a.parquet",
489 |                 "root/data2/data2_b.parquet",
490 |             ],
491 |         },
492 |         test_root_input_output_dir: TestCase {
493 |             name: "Root input with other output dir",
494 |             spec: OutputPathSpec::new().inputs(vec!["root"]).output_dir("other_root"),
495 |             expected_outputs: vec![
496 |                 "other_root/super_data_a.parquet",
497 |                 "other_root/super_data_b.parquet",
498 |             ],
499 |         },
500 |         test_root_input_output_dir_tree: TestCase {
501 |             name: "Root input with other output dir tree",
502 |             spec: OutputPathSpec::new().inputs(vec!["root"]).output_dir("other_root").tree(true),
503 |             expected_outputs: vec![
504 |                 "other_root/super_data_a.parquet",
505 |                 "other_root/super_data_b.parquet",
506 |                 "other_root/data1/data1_a.parquet",
507 |                 "other_root/data1/data1_b.parquet",
508 |                 "other_root/data1/sub_data1_1/sub_data1_a.parquet",
509 |                 "other_root/data1/sub_data1_1/sub_data1_b.parquet",
510 |                 "other_root/data2/data2_a.parquet",
511 |                 "other_root/data2/data2_b.parquet",
512 |             ],
513 |         },
514 | 
515 |         test_data1_input: TestCase {
516 |             name: "Data1 input",
517 |             spec: OutputPathSpec::new().inputs(vec!["root/data1"]),
518 |             expected_outputs: vec![
519 |                 "root/data1/data1_a.parquet",
520 |                 "root/data1/data1_b.parquet",
521 |             ],
522 |         },
523 |         test_data1_input_tree: TestCase {
524 |             name: "Data1 input with tree",
525 |             spec: OutputPathSpec::new().inputs(vec!["root/data1"]).tree(true),
526 |             expected_outputs: vec![
527 |                 "root/data1/data1_a.parquet",
528 |                 "root/data1/data1_b.parquet",
529 |                 "root/data1/sub_data1_1/sub_data1_a.parquet",
530 |                 "root/data1/sub_data1_1/sub_data1_b.parquet",
531 |             ],
532 |         },
533 |         test_data1_input_root_output: TestCase {
534 |             name: "Data1 input with root output",
535 |             spec: OutputPathSpec::new().inputs(vec!["root/data1"]).output_dir("root"),
536 |             expected_outputs: vec![
537 |                 "root/data1_a.parquet",
538 |                 "root/data1_b.parquet",
539 |             ],
540 |         },
541 |         test_data1_input_root_output_tree: TestCase {
542 |             name: "Data1 input with root output and tree",
543 |             spec: OutputPathSpec::new().inputs(vec!["root/data1"]).output_dir("root").tree(true),
544 |             expected_outputs: vec![
545 |                 "root/data1_a.parquet",
546 |                 "root/data1_b.parquet",
547 |                 "root/sub_data1_1/sub_data1_a.parquet",
548 |                 "root/sub_data1_1/sub_data1_b.parquet",
549 |             ],
550 |         },
551 |         test_data1_input_other_output: TestCase {
552 |             name: "Data1 input with other output",
553 |             spec: OutputPathSpec::new().inputs(vec!["root/data1"]).output_dir("other_root"),
554 |             expected_outputs: vec![
555 |                 "other_root/data1_a.parquet",
556 |                 "other_root/data1_b.parquet",
557 |             ],
558 |         },
559 |         test_data1_input_other_output_tree: TestCase {
560 |             name: "Data1 input with other output and tree",
561 |             spec: OutputPathSpec::new().inputs(vec!["root/data1"]).output_dir("other_root").tree(true),
562 |             expected_outputs: vec![
563 |                 "other_root/data1_a.parquet",
564 |                 "other_root/data1_b.parquet",
565 |                 "other_root/sub_data1_1/sub_data1_a.parquet",
566 |                 "other_root/sub_data1_1/sub_data1_b.parquet",
567 |             ],
568 |         },
569 |         test_data1_data2_input: TestCase {
570 |             name: "Data1 and Data2 input",
571 |             spec: OutputPathSpec::new().inputs(vec!["root/data1", "root/data2"]),
572 |             expected_outputs: vec![
573 |                 "root/data1/data1_a.parquet",
574 |                 "root/data1/data1_b.parquet",
575 |                 "root/data2/data2_a.parquet",
576 |                 "root/data2/data2_b.parquet",
577 |             ],
578 |         },
579 |         test_data1_data2_input_tree: TestCase {
580 |             name: "Data1 and Data2 input with tree",
581 |             spec: OutputPathSpec::new().inputs(vec!["root/data1", "root/data2"]).tree(true),
582 |             expected_outputs: vec![
583 |                 "root/data1/data1_a.parquet",
584 |                 "root/data1/data1_b.parquet",
585 |                 "root/data1/sub_data1_1/sub_data1_a.parquet",
586 |                 "root/data1/sub_data1_1/sub_data1_b.parquet",
587 |                 "root/data2/data2_a.parquet",
588 |                 "root/data2/data2_b.parquet",
589 |             ],
590 |         },
591 |         test_data1_data2_input_root_output: TestCase {
592 |             name: "Data1 and Data2 input with root output",
593 |             spec: OutputPathSpec::new().inputs(vec!["root/data1", "root/data2"]).output_dir("root"),
594 |             expected_outputs: vec![
595 |                 "root/data1_a.parquet",
596 |                 "root/data1_b.parquet",
597 |                 "root/data2_a.parquet",
598 |                 "root/data2_b.parquet",
599 |             ],
600 |         },
601 |         test_data1_data2_input_root_output_tree: TestCase {
602 |             name: "Data1 and Data2 input with root output and tree",
603 |             spec: OutputPathSpec::new().inputs(vec!["root/data1", "root/data2"]).output_dir("root").tree(true),
604 |             expected_outputs: vec![
605 |                 "root/data1_a.parquet",
606 |                 "root/data1_b.parquet",
607 |                 "root/sub_data1_1/sub_data1_a.parquet",
608 |                 "root/sub_data1_1/sub_data1_b.parquet",
609 |                 "root/data2_a.parquet",
610 |                 "root/data2_b.parquet",
611 |             ],
612 |         },
613 |         test_data1_data2_input_other_output: TestCase {
614 |             name: "Data1 and Data2 input with other output",
615 |             spec: OutputPathSpec::new().inputs(vec!["root/data1", "root/data2"]).output_dir("other_root"),
616 |             expected_outputs: vec![
617 |                 "other_root/data1_a.parquet",
618 |                 "other_root/data1_b.parquet",
619 |                 "other_root/data2_a.parquet",
620 |                 "other_root/data2_b.parquet",
621 |             ],
622 |         },
623 |         test_data1_data2_input_other_output_tree: TestCase {
624 |             name: "Data1 and Data2 input with other output and tree",
625 |             spec: OutputPathSpec::new().inputs(vec!["root/data1", "root/data2"]).output_dir("other_root").tree(true),
626 |             expected_outputs: vec![
627 |                 "other_root/data1_a.parquet",
628 |                 "other_root/data1_b.parquet",
629 |                 "other_root/sub_data1_1/sub_data1_a.parquet",
630 |                 "other_root/sub_data1_1/sub_data1_b.parquet",
"other_root/data2_a.parquet", 632 | "other_root/data2_b.parquet", 633 | ], 634 | }, 635 | test_specific_files_input: TestCase { 636 | name: "Specific files input", 637 | spec: OutputPathSpec::new().inputs(vec!["root/data1/data1_a.parquet", "root/super_data_a.parquet"]), 638 | expected_outputs: vec![ 639 | "root/data1/data1_a.parquet", 640 | "root/super_data_a.parquet", 641 | ], 642 | }, 643 | test_specific_files_input_tree: TestCase { 644 | name: "Specific files input with tree", 645 | spec: OutputPathSpec::new().inputs(vec!["root/data1/data1_a.parquet", "root/super_data_a.parquet"]).tree(true), 646 | expected_outputs: vec![ 647 | "root/data1/data1_a.parquet", 648 | "root/super_data_a.parquet", 649 | ], 650 | }, 651 | test_specific_files_input_root_output: TestCase { 652 | name: "Specific files input with root output", 653 | spec: OutputPathSpec::new().inputs(vec!["root/data1/data1_a.parquet", "root/super_data_a.parquet"]).output_dir("root"), 654 | expected_outputs: vec![ 655 | "root/data1_a.parquet", 656 | "root/super_data_a.parquet", 657 | ], 658 | }, 659 | test_specific_files_input_root_output_tree: TestCase { 660 | name: "Specific files input with root output and tree", 661 | spec: OutputPathSpec::new().inputs(vec!["root/data1/data1_a.parquet", "root/super_data_a.parquet"]).output_dir("root").tree(true), 662 | expected_outputs: vec![ 663 | "root/data1_a.parquet", 664 | "root/super_data_a.parquet", 665 | ], 666 | }, 667 | test_specific_files_input_other_output: TestCase { 668 | name: "Specific files input with other output", 669 | spec: OutputPathSpec::new().inputs(vec!["root/data1/data1_a.parquet", "root/super_data_a.parquet"]).output_dir("other_root"), 670 | expected_outputs: vec![ 671 | "other_root/data1_a.parquet", 672 | "other_root/super_data_a.parquet", 673 | ], 674 | }, 675 | test_specific_files_input_other_output_tree: TestCase { 676 | name: "Specific files input with other output and tree", 677 | spec: OutputPathSpec::new().inputs(vec!["root/data1/data1_a.parquet", "root/super_data_a.parquet"]).output_dir("other_root").tree(true), 678 | expected_outputs: vec![ 679 | "other_root/data1_a.parquet", 680 | "other_root/super_data_a.parquet", 681 | ], 682 | }, 683 | 684 | } 685 | } 686 | --------------------------------------------------------------------------------