├── rust-toolchain.toml ├── rustfmt.toml ├── demo.gif ├── ferris.png ├── .github ├── dependabot.yml └── workflows │ └── release.yml ├── ci └── cargo-out-dir ├── .gitignore ├── dist-workspace.toml ├── src ├── progress.rs ├── interrupt.rs ├── calibrate.rs ├── args.rs ├── main.rs └── walk.rs ├── LICENSE ├── Cargo.toml └── README.md
/rust-toolchain.toml: -------------------------------------------------------------------------------- 1 | [toolchain] 2 | channel = "1.90" 3 | --------------------------------------------------------------------------------
/rustfmt.toml: -------------------------------------------------------------------------------- 1 | max_width = 79 2 | use_small_heuristics = "max" 3 | --------------------------------------------------------------------------------
/demo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dkorunic/findlargedir/HEAD/demo.gif --------------------------------------------------------------------------------
/ferris.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dkorunic/findlargedir/HEAD/ferris.png --------------------------------------------------------------------------------
/.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: "github-actions" 4 | directory: "/" 5 | schedule: 6 | interval: "weekly" 7 | day: "sunday" 8 | time: "22:00" 9 | open-pull-requests-limit: 10 10 | --------------------------------------------------------------------------------
/ci/cargo-out-dir: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Finds Cargo's `OUT_DIR` directory from the most recent build. 4 | # 5 | # This requires one parameter corresponding to the target directory 6 | # to search for the build output. 7 | 8 | if [ $# != 1 ]; then 9 | echo "Usage: $(basename "$0") <target-dir>" >&2 10 | exit 2 11 | fi 12 | 13 | # This works by finding the most recently modified findlargedir binary 14 | # under the given target directory and printing its parent directory.
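# Example (hypothetical invocation and output, for illustration only):
#   $ ./ci/cargo-out-dir ./target
#   ./target/release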
15 | target_dir="$1" 16 | find "$target_dir" -type f -name findlargedir -print0 \ 17 | | xargs -0 ls -t \ 18 | | head -n1 \ 19 | | xargs dirname 20 | --------------------------------------------------------------------------------
/.gitignore: -------------------------------------------------------------------------------- 1 | ### Rust ### 2 | # Generated by Cargo 3 | # will have compiled files and executables 4 | debug/ 5 | target/ 6 | 7 | # Remove Cargo.lock from gitignore if creating an executable, leave it for libraries 8 | # More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html 9 | Cargo.lock 10 | 11 | # These are backup files generated by rustfmt 12 | **/*.rs.bk 13 | 14 | # MSVC Windows builds of rustc generate these, which store debugging information 15 | *.pdb 16 | 17 | # IntelliJ tools 18 | .idea/ 19 | 20 | # End of https://www.toptal.com/developers/gitignore/api/rust 21 | --------------------------------------------------------------------------------
/dist-workspace.toml: -------------------------------------------------------------------------------- 1 | [workspace] 2 | members = ["cargo:."] 3 | 4 | # Config for 'dist' 5 | [dist] 6 | # The preferred dist version to use in CI (Cargo.toml SemVer syntax) 7 | cargo-dist-version = "0.30.2" 8 | # CI backends to support 9 | ci = "github" 10 | # The installers to generate for each app 11 | installers = ["shell"] 12 | # Target platforms to build apps for (Rust target-triple syntax) 13 | targets = ["aarch64-apple-darwin", "aarch64-unknown-linux-gnu", "x86_64-apple-darwin", "x86_64-unknown-linux-gnu", "x86_64-unknown-linux-musl"] 14 | # Path that installers should place binaries in 15 | install-path = "CARGO_HOME" 16 | # Whether to install an updater program 17 | install-updater = false 18 | --------------------------------------------------------------------------------
/src/progress.rs: -------------------------------------------------------------------------------- 1 | use std::borrow::Cow; 2 | use std::time::Duration; 3 | 4 | use indicatif::{ProgressBar, ProgressStyle}; 5 | 6 | /// Default tick chars 7 | const PROGRESS_CHARS: &str = "⠋⠙⠹⠸⠼⠴⠦⠧⠇⠏"; 8 | 9 | /// Default tick in milliseconds 10 | const PROGRESS_TICK: u64 = 80; 11 | 12 | /// Initializes a new `ProgressBar` with a spinner style. 13 | /// 14 | /// # Arguments 15 | /// * `msg` - A message of generic type `S` that implements `Into<Cow<'static, str>>`, which will be displayed on the spinner. 16 | /// 17 | /// # Returns 18 | /// Returns a `ProgressBar` object configured with a steady tick and custom spinner style.
19 | /// 20 | /// # Examples 21 | /// ``` 22 | /// let spinner = new_spinner("Loading..."); 23 | /// ``` 24 | pub fn new_spinner<S>(msg: S) -> ProgressBar 25 | where 26 | S: Into<Cow<'static, str>>, 27 | { 28 | let pb = ProgressBar::new_spinner(); 29 | pb.enable_steady_tick(Duration::from_millis(PROGRESS_TICK)); 30 | pb.set_style(ProgressStyle::default_spinner().tick_chars(PROGRESS_CHARS)); 31 | pb.set_message(msg.into()); 32 | 33 | pb 34 | } 35 | --------------------------------------------------------------------------------
/LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2022 Dinko Korunic 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | --------------------------------------------------------------------------------
/src/interrupt.rs: -------------------------------------------------------------------------------- 1 | use std::sync::Arc; 2 | use std::sync::atomic::AtomicBool; 3 | 4 | use anyhow::{Context, Error}; 5 | use signal_hook::consts::TERM_SIGNALS; 6 | use signal_hook::flag::register; 7 | 8 | /// Sets up a handler for process interruption signals (each signal in `TERM_SIGNALS`). 9 | /// This function configures a handler that will set a shared atomic boolean to `true` 10 | /// whenever an interruption signal is received, indicating that the process should shut down. 11 | /// 12 | /// # Arguments 13 | /// * `shutdown` - An `&Arc<AtomicBool>` shared among threads, used to signal shutdown when set to `true`. 14 | /// 15 | /// # Returns 16 | /// Returns `Ok(())` if the handler is successfully set, or an `Error` if any issues occur during setup. 17 | /// 18 | /// # Errors 19 | /// Returns an error if the signal handler cannot be set, encapsulated in an `anyhow::Error`.
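///
/// # Examples
/// An illustrative usage sketch (added for clarity; it mirrors how `main.rs` invokes this function):
/// ```
/// let shutdown = Arc::new(AtomicBool::new(false));
/// setup_interrupt_handler(&shutdown)?;
/// ```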
20 | pub fn setup_interrupt_handler( 21 | shutdown: &Arc<AtomicBool>, 22 | ) -> Result<(), Error> { 23 | for sig in TERM_SIGNALS { 24 | let name = 25 | signal_hook::low_level::signal_name(*sig).unwrap_or_default(); 26 | register(*sig, shutdown.clone()).with_context(|| { 27 | format!("Unable to register signal handler for {name}/{sig}") 28 | })?; 29 | } 30 | 31 | Ok(()) 32 | } 33 | --------------------------------------------------------------------------------
/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "findlargedir" 3 | version = "0.10.3" 4 | authors = ["Dinko Korunic <dinko.korunic@gmail.com>"] 5 | categories = ["command-line-utilities"] 6 | description = "find all blackhole directories with a huge amount of filesystem entries in a flat structure" 7 | repository = "https://github.com/dkorunic/findlargedir" 8 | homepage = "https://github.com/dkorunic/findlargedir" 9 | readme = "README.md" 10 | license = "MIT" 11 | exclude = [".gitignore"] 12 | edition = "2024" 13 | rust-version = "1.88.0" 14 | 15 | [dependencies] 16 | mimalloc = "0.1.48" 17 | rayon = "1.11.0" 18 | tempfile = "3.23.0" 19 | anyhow = "1.0.100" 20 | human_format = "1.1.0" 21 | clap = { version = "4.5.51", features = ["derive", "unicode", "wrap_help"] } 22 | rm_rf = "0.6.2" 23 | ansi_term = "0.12.1" 24 | fs-err = "3.1.3" 25 | indicatif = { version = "0.18.2", features = ["rayon"] } 26 | fdlimit = "0.3.0" 27 | ahash = "0.8.12" 28 | anstyle = "1.0.13" 29 | signal-hook = "0.3.18" 30 | ignore = "0.4.25" 31 | normpath = "1.5.0" 32 | 33 | [profile.release] 34 | opt-level = 3 35 | debug = "none" 36 | strip = "symbols" 37 | debug-assertions = false 38 | overflow-checks = true 39 | lto = "fat" 40 | panic = "abort" 41 | codegen-units = 1 42 | 43 | # The profile that 'dist' will build with 44 | [profile.dist] 45 | inherits = "release" 46 | lto = "fat" 47 | --------------------------------------------------------------------------------
/src/calibrate.rs: -------------------------------------------------------------------------------- 1 | use std::fs::File; 2 | use std::os::unix::fs::MetadataExt; 3 | use std::path::Path; 4 | use std::process; 5 | use std::sync::Arc; 6 | use std::sync::atomic::{AtomicBool, Ordering}; 7 | 8 | use anyhow::{Context, Error}; 9 | use fs_err as fs; 10 | use rayon::prelude::*; 11 | use rm_rf::ensure_removed; 12 | 13 | use crate::{args, progress}; 14 | 15 | /// Default number of files to create in the calibration directory 16 | pub const DEFAULT_TEST_COUNT: u64 = 100_000; 17 | 18 | /// Default exit error code in case of premature termination 19 | const ERROR_EXIT: i32 = 1; 20 | 21 | /// Calculates the size-to-inode ratio for a given directory. 22 | /// 23 | /// This function initiates a calibration process by creating a specified number of files 24 | /// within the `test_path` directory to determine the average file size to inode ratio. 25 | /// It uses a multi-threaded approach to create files and monitors for a shutdown signal 26 | /// to safely terminate and clean up if necessary. 27 | /// 28 | /// # Arguments 29 | /// * `test_path` - A reference to the path where test files will be created. 30 | /// * `shutdown` - A shared atomic boolean to signal shutdown and cleanup. 31 | /// * `args` - A shared structure containing runtime arguments such as the number of threads 32 | /// and the number of files to create for calibration.
33 | /// 34 | /// # Returns 35 | /// Returns a `Result<u64, Error>` which is the calculated size-to-inode ratio if successful, 36 | /// or an error if the operation fails at any step. 37 | /// 38 | /// # Errors 39 | /// This function can return an error if it fails to create the thread pool, create files, 40 | /// delete the directory, or retrieve metadata from the test directory. 41 | /// 42 | /// # Examples 43 | /// ``` 44 | /// let test_path = Path::new("/tmp/test_dir"); 45 | /// let shutdown = Arc::new(AtomicBool::new(false)); 46 | /// let args = Arc::new(args::Args { 47 | /// threads: 4, 48 | /// calibration_count: 1000, 49 | /// }); 50 | /// let ratio = get_inode_ratio(&test_path, &shutdown, &args); 51 | /// match ratio { 52 | /// Ok(ratio) => println!("Size-to-inode ratio: {}", ratio), 53 | /// Err(e) => println!("Failed to calculate size-to-inode ratio: {}", e), 54 | /// } 55 | /// ``` 56 | pub fn get_inode_ratio( 57 | test_path: &Path, 58 | shutdown: &Arc<AtomicBool>, 59 | args: &Arc<args::Args>, 60 | ) -> Result<u64, Error> { 61 | println!("Starting test directory calibration in {}", test_path.display(),); 62 | 63 | // Thread pool for mass file creation 64 | let pool = rayon::ThreadPoolBuilder::new() 65 | .num_threads(args.threads) 66 | .build() 67 | .context("Unable to spawn calibration thread pool")?; 68 | 69 | let pb = progress::new_spinner("Creating test files in progress..."); 70 | 71 | // Mass create files; filenames are short to get minimal size to inode ratio 72 | let res: Result<(), Error> = pool.install(|| { 73 | (0..args.calibration_count).into_par_iter().try_for_each(|i| { 74 | if !shutdown.load(Ordering::Relaxed) { 75 | File::create(test_path.join(i.to_string())) 76 | .context("Unable to create test file")?; 77 | } 78 | 79 | Ok(()) 80 | }) 81 | }); 82 | 83 | pb.finish_with_message("Done."); 84 | 85 | // Check for calibration errors 86 | if let Err(e) = res { 87 | println!("Fatal program error, exiting: {e}"); 88 | 89 | // TempDir cleanup will most likely fail as well 90 | _ = ensure_removed(test_path); 91 | 92 | process::exit(ERROR_EXIT); 93 | } 94 | 95 | // Terminate on received interrupt signal 96 | if shutdown.load(Ordering::Relaxed) { 97 | println!( 98 | "Requested program exit, stopping and deleting temporary files...", 99 | ); 100 | ensure_removed(test_path).expect( 101 | "Unable to completely delete calibration directory, exiting", 102 | ); 103 | 104 | process::exit(ERROR_EXIT); 105 | } 106 | 107 | let size_inode_ratio = fs::metadata(test_path) 108 | .context("Unable to retrieve calibration directory metadata")? 109 | .size() 110 | / args.calibration_count; 111 | println!( 112 | "Calibration done.
Calculated size-to-inode ratio: {size_inode_ratio}" 113 | ); 114 | 115 | Ok(size_inode_ratio) 116 | } 117 | --------------------------------------------------------------------------------
/src/args.rs: -------------------------------------------------------------------------------- 1 | use std::path::{Path, PathBuf}; 2 | use std::thread; 3 | 4 | use anstyle::AnsiColor; 5 | use anyhow::{Error, anyhow}; 6 | use clap::Parser; 7 | use clap::ValueHint; 8 | use clap::builder::{ValueParser, styling::Styles}; 9 | use normpath::PathExt; 10 | 11 | const STYLES: Styles = Styles::styled() 12 | .header(AnsiColor::Yellow.on_default()) 13 | .usage(AnsiColor::Green.on_default()) 14 | .literal(AnsiColor::Green.on_default()) 15 | .placeholder(AnsiColor::Green.on_default()); 16 | 17 | #[derive(Parser, Default, Debug, Clone)] 18 | #[clap(author, version, about, long_about = None, styles=STYLES)] 19 | pub struct Args { 20 | /// Follow symlinks 21 | #[clap(short = 'f', long, action = clap::ArgAction::Set, default_value_t = false)] 22 | pub follow_symlinks: bool, 23 | 24 | /// Perform accurate directory entry counting 25 | #[clap(short = 'a', long, action = clap::ArgAction::Set, default_value_t = false)] 26 | pub accurate: bool, 27 | 28 | /// Do not cross mount points 29 | #[clap(short = 'o', long, action = clap::ArgAction::Set, default_value_t = true)] 30 | pub one_filesystem: bool, 31 | 32 | /// Calibration directory file count 33 | #[clap(short = 'c', long, value_parser, default_value_t = crate::calibrate::DEFAULT_TEST_COUNT)] 34 | pub calibration_count: u64, 35 | 36 | /// Alert threshold count (print the estimate) 37 | #[clap(short = 'A', long, value_parser, default_value_t = crate::walk::ALERT_COUNT)] 38 | pub alert_threshold: u64, 39 | 40 | /// Blacklist threshold count (print the estimate and stop deeper scan) 41 | #[clap(short = 'B', long, value_parser, default_value_t = crate::walk::BLACKLIST_COUNT)] 42 | pub blacklist_threshold: u64, 43 | 44 | /// Number of threads to use when calibrating and scanning 45 | #[clap(short = 'x', long, value_parser = ValueParser::new(parse_threads), default_value_t = thread::available_parallelism().map(|n| n.get()).unwrap_or(2) 46 | )] 47 | pub threads: usize, 48 | 49 | /// Seconds between status updates, set to 0 to disable 50 | #[clap(short = 'p', long, value_parser, default_value_t = crate::walk::STATUS_SECONDS)] 51 | pub updates: u64, 52 | 53 | /// Skip calibration and provide directory entry to inode size ratio (typically ~21-32) 54 | #[clap(short = 'i', long, value_parser, default_value_t = 0u64)] 55 | pub size_inode_ratio: u64, 56 | 57 | /// Custom calibration directory path 58 | #[clap(short = 't', long, value_parser, value_hint = ValueHint::AnyPath)] 59 | pub calibration_path: Option<PathBuf>, 60 | 61 | /// Directories to exclude from scanning 62 | #[clap(short = 's', long, value_parser, value_hint = ValueHint::AnyPath)] 63 | pub skip_path: Vec<PathBuf>, 64 | 65 | /// Paths to check for large directories 66 | #[clap(required = true, value_parser = ValueParser::new(parse_paths), value_hint = ValueHint::AnyPath 67 | )] 68 | pub path: Vec<PathBuf>, 69 | } 70 | 71 | /// Parse and validate threads option 72 | fn parse_threads(x: &str) -> Result<usize, Error> { 73 | match x.parse::<usize>() { 74 | Ok(v) => match v { 75 | v if !(2..=65535).contains(&v) => { 76 | Err(anyhow!("threads should be in (2..65536) range")) 77 | } 78 | v => Ok(v), 79 | }, 80 | Err(e) => Err(Error::from(e)), 81 | } 82 | } 83 | 84 | /// Parses a string into a `PathBuf`, checking if the path is a directory and exists.
85 | /// 86 | /// # Arguments 87 | /// 88 | /// * `x` - A string slice to be parsed into a `PathBuf`. 89 | /// 90 | /// # Returns 91 | /// 92 | /// * `Result<PathBuf, Error>` - An `Ok` variant containing a normalized `PathBuf` if the path is an existing directory, 93 | /// or an `Err` variant with an error message if the path does not exist or is not a directory. 94 | fn parse_paths(x: &str) -> Result<PathBuf, Error> { 95 | let p = Path::new(x); 96 | 97 | if directory_exists(p) { 98 | Ok(p.normalize()?.into_path_buf()) 99 | } else { 100 | Err(anyhow!("'{x}' is not an existing directory")) 101 | } 102 | } 103 | 104 | /// Checks if the given path is a directory and exists. 105 | /// 106 | /// # Arguments 107 | /// 108 | /// * `x` - A reference to the path to check. 109 | /// 110 | /// # Returns 111 | /// 112 | /// * `bool` - `true` if the path is an existing directory, `false` otherwise. 113 | #[inline] 114 | fn directory_exists(x: &Path) -> bool { 115 | x.is_dir() && x.normalize().is_ok() 116 | } 117 | --------------------------------------------------------------------------------
/src/main.rs: -------------------------------------------------------------------------------- 1 | #![warn(clippy::all, clippy::pedantic)] 2 | 3 | use std::os::unix::fs::MetadataExt; 4 | use std::sync::Arc; 5 | use std::sync::atomic::AtomicBool; 6 | use std::time::Instant; 7 | 8 | use ahash::AHashSet; 9 | use anyhow::{Context, Error, Result}; 10 | use clap::Parser; 11 | use fdlimit::{Outcome, raise_fd_limit}; 12 | use fs_err as fs; 13 | use indicatif::HumanDuration; 14 | use tempfile::TempDir; 15 | 16 | mod args; 17 | mod calibrate; 18 | mod interrupt; 19 | mod progress; 20 | mod walk; 21 | 22 | use mimalloc::MiMalloc; 23 | 24 | #[global_allocator] 25 | static GLOBAL: MiMalloc = MiMalloc; 26 | 27 | /// Entry point for the filesystem scanning application. 28 | /// 29 | /// This function sets up necessary configurations and initiates the parallel filesystem scan 30 | /// by calling `parallel_search`. It handles command-line arguments and sets up the environment 31 | /// for the application to run. 32 | /// 33 | /// # Behavior: 34 | /// - Parses command-line arguments to configure the scanning process. 35 | /// - Sets up signal handling for graceful shutdowns. 36 | /// - Initiates the filesystem scan by calling `parallel_search` with appropriate parameters. 37 | /// - Handles any errors returned by `parallel_search` and exits with an appropriate status code. 38 | /// 39 | /// # Returns: 40 | /// - Returns `Ok(())` on success; on a received termination signal or a fatal calibration error the program exits early via `std::process::exit`. 41 | fn main() -> Result<(), Error> { 42 | let args = Arc::new(args::Args::parse()); 43 | 44 | // Setup termination signal (SIGINT, SIGTERM and SIGQUIT) handlers that will cause the program to stop 45 | let shutdown = Arc::new(AtomicBool::new(false)); 46 | let shutdown_walk = shutdown.clone(); 47 | interrupt::setup_interrupt_handler(&shutdown)?; 48 | 49 | println!("Using {} threads for calibration and scanning", args.threads); 50 | 51 | // Attempt to raise FD limit 52 | if let Ok(Outcome::LimitRaised { to: x, ..
}) = raise_fd_limit() { 53 | println!("Maximum number of file descriptors available: {x}"); 54 | } 55 | 56 | // Search only unique paths 57 | let mut visited_paths = AHashSet::with_capacity(args.path.len()); 58 | 59 | for path in args.path.clone() { 60 | // Keep order of provided path arguments, but skip already visited paths 61 | match visited_paths.get(&path) { 62 | None => visited_paths.insert(path.clone()), 63 | _ => continue, 64 | }; 65 | 66 | println!("Started analysis for path {}", path.display()); 67 | 68 | // Retrieve Unix metadata for top search path 69 | let path_metadata = fs::metadata(&path) 70 | .context("Unable to retrieve top search directory metadata")?; 71 | 72 | // Directory inode size to number of entries ratio is either manually provided in 73 | // `args.size_inode_ratio` or determined from manually provided calibration path 74 | // `args.calibration_path` or determined from calibration directory created in search root 75 | // `TempDir::new_in(path.as_path())` 76 | let size_inode_ratio = if args.size_inode_ratio > 0 { 77 | args.size_inode_ratio 78 | } else if let Some(ref user_path) = args.calibration_path { 79 | // User has specified his calibration directory so attempt to check if it resides on 80 | // the same device 81 | if fs::metadata(user_path.as_path()).context( 82 | "Unable to retrieve user-specified calibration directory metadata", 83 | )?.dev() != path_metadata.dev() 84 | { 85 | println!( 86 | "Oops, test directory resides on a different device than path {}, results are possibly unreliable!", 87 | path.display() 88 | ); 89 | } 90 | 91 | // Prepare temporary calibration directory in user path 92 | let tmp_dir = 93 | Arc::new(TempDir::new_in(user_path.as_path()).context( 94 | "Unable to setup/create calibration test directory", 95 | )?); 96 | 97 | calibrate::get_inode_ratio(tmp_dir.path(), &shutdown_walk, &args) 98 | .context("Unable to calibrate inode to size ratio")? 99 | } else { 100 | // Prepare temporary calibration directory in root of the search path 101 | let tmp_dir = Arc::new(TempDir::new_in(path.as_path()).context( 102 | "Unable to setup/create calibration test directory", 103 | )?); 104 | 105 | calibrate::get_inode_ratio(tmp_dir.path(), &shutdown_walk, &args) 106 | .context("Unable to calibrate inode to size ratio")? 107 | }; 108 | 109 | let start = Instant::now(); 110 | let pb = progress::new_spinner(format!( 111 | "Scanning path {} in progress...", 112 | path.display() 113 | )); 114 | 115 | let dir_count = walk::parallel_search( 116 | &path, 117 | &path_metadata, 118 | size_inode_ratio, 119 | &shutdown_walk, 120 | &args, 121 | ); 122 | 123 | pb.finish_with_message("Done."); 124 | 125 | println!( 126 | "Scanning path {} completed. 
Directories scanned: {}, Time elapsed: {}", 127 | path.display(), 128 | dir_count, 129 | HumanDuration(start.elapsed()) 130 | ); 131 | } 132 | 133 | Ok(()) 134 | } 135 | --------------------------------------------------------------------------------
/README.md: -------------------------------------------------------------------------------- 1 | # findlargedir 2 | 3 | [![GitHub license](https://img.shields.io/github/license/dkorunic/findlargedir.svg)](https://github.com/dkorunic/findlargedir/blob/master/LICENSE) 4 | [![GitHub release](https://img.shields.io/github/release/dkorunic/findlargedir.svg)](https://github.com/dkorunic/findlargedir/releases/latest) 5 | [![release](https://github.com/dkorunic/findlargedir/actions/workflows/release.yml/badge.svg)](https://github.com/dkorunic/findlargedir/actions/workflows/release.yml) 6 | 7 | ![Ferris the Detective](ferris.png) 8 | 9 | (Ferris the Detective by [Esther Arzola](https://www.redbubble.com/people/earzola/shop), original design by [Karen Rustad Tölva](https://www.rustacean.net)) 10 | 11 | ## About 12 | 13 | Findlargedir is a tool specifically written to help **quickly** identify "black hole" directories on any filesystem having more than 100k entries in a single flat structure. When a directory has **many entries** (directories or files), getting a directory listing gets slower and slower, impacting the performance of all processes attempting to list it (for instance to delete some files and/or to find some specific files). Processes reading large directory inodes get frozen while doing so and end up in **uninterruptible sleep** ("D" state) for longer and longer periods of time. Depending on the filesystem, this might start to become visible at 100k entries and becomes a very noticeable performance impact at 1M+ entries. 14 | 15 | Such directories mostly **cannot shrink back** even if the content gets cleaned up, since most Linux and Un\*x filesystems do not support directory inode shrinking (for instance the very common ext3/ext4). This often happens with forgotten Web session directories (a PHP sessions folder whose GC interval was configured to several days), various cache folders (CMS compiled templates and caches), POSIX filesystems emulating object storage, etc. 16 | 17 | The program will attempt to identify any number of such events and report on them based on **calibration**, i.e. on how many directory entries are assumed to be packed into each directory inode on a given filesystem. While doing so, it will determine the ratio of directory inode size to the number of entries/inodes and will use that ratio to quickly scan the filesystem, avoiding expensive/slow directory lookups. While there are many tools that scan the filesystem (`find`, `du`, `ncdu`, etc.), none of them use heuristics to avoid expensive lookups, since they are designed to be **fully accurate**, while this tool is meant to use heuristics and alert on issues **without getting stuck** on problematic folders. 18 | 19 | The program will **not follow symlinks** by default and **requires r/w permissions** to create a calibration directory, so it can calculate the directory inode size to number of entries ratio and estimate the number of entries in a directory without actually counting them. While this method is just an approximation of the actual number of entries in a directory, it is good enough to quickly scan for offending directories.
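To make the heuristic concrete, the following sketch condenses the core of `src/walk.rs` into a few lines. The inode size and ratio values here are illustrative assumptions, while the two thresholds match the `-A`/`-B` defaults:

```rust
// Sketch of the core estimate from src/walk.rs, with example values.
fn main() {
    let size: u64 = 4_194_304; // directory inode size in bytes (4 MiB, assumed)
    let size_inode_ratio: u64 = 32; // calibrated bytes per entry (assumed)
    let alert_threshold: u64 = 10_000; // default -A
    let blacklist_threshold: u64 = 100_000; // default -B

    // Estimate the entry count without ever listing the directory
    let approx_files = size / size_inode_ratio; // 4_194_304 / 32 = 131_072
    if approx_files > blacklist_threshold {
        println!("report this directory and stop descending into it");
    } else if approx_files > alert_threshold {
        println!("report this directory, but continue the scan");
    }
}
```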
20 | 21 | ![Demo](demo.gif) 22 | 23 | ## Caveats 24 | 25 | - requires r/w privileges for each filesystem being tested; it will also create a temporary directory with a lot of temporary files, which are cleaned up afterwards 26 | - accurate mode (`-a`) can cause excessive I/O and excessive memory use; only use when appropriate 27 | 28 | ## Usage 29 | 30 | ```shell 31 | Usage: findlargedir [OPTIONS] <PATH>... 32 | 33 | Arguments: 34 | <PATH>... Paths to check for large directories 35 | 36 | Options: 37 | -a, --accurate <ACCURATE> 38 | Perform accurate directory entry counting [default: false] [possible values: true, false] 39 | -o, --one-filesystem <ONE_FILESYSTEM> 40 | Do not cross mount points [default: true] [possible values: true, false] 41 | -c, --calibration-count <CALIBRATION_COUNT> 42 | Calibration directory file count [default: 100000] 43 | -A, --alert-threshold <ALERT_THRESHOLD> 44 | Alert threshold count (print the estimate) [default: 10000] 45 | -B, --blacklist-threshold <BLACKLIST_THRESHOLD> 46 | Blacklist threshold count (print the estimate and stop deeper scan) [default: 100000] 47 | -x, --threads <THREADS> 48 | Number of threads to use when calibrating and scanning [default: 24] 49 | -p, --updates <UPDATES> 50 | Seconds between status updates, set to 0 to disable [default: 20] 51 | -i, --size-inode-ratio <SIZE_INODE_RATIO> 52 | Skip calibration and provide directory entry to inode size ratio (typically ~21-32) [default: 0] 53 | -t, --calibration-path <CALIBRATION_PATH> 54 | Custom calibration directory path 55 | -s, --skip-path <SKIP_PATH> 56 | Directories to exclude from scanning 57 | -h, --help 58 | Print help information 59 | -V, --version 60 | Print version information 61 | ``` 62 | 63 | When using **accurate mode** (`-a` parameter), be aware that large directory lookups will stall the process completely for extended periods of time. What this mode does is basically a secondary, fully accurate pass on a possibly offending directory, calculating the exact number of entries. 64 | 65 | To avoid descending into mounted filesystems (as with the find `-xdev` option), **one-filesystem mode** (`-o` parameter) is enabled by default, but it can be disabled if necessary. 66 | 67 | It is possible to completely skip the calibration phase by manually providing the directory inode size to number of entries ratio with the `-i` parameter. This makes sense only when you already know the ratio, for example from previous runs. 68 | 69 | Setting the `-p` parameter to 0 will stop the program from giving occasional status updates.
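For example, a hypothetical scan of the root filesystem that reuses a previously measured ratio (so no calibration files are created) and skips common virtual filesystems could look like this (the ratio value and skip paths are illustrative):

```shell
findlargedir -i 32 -s /proc -s /sys -s /dev /
```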
70 | 71 | ## Benchmarks 72 | 73 | ### Findlargedir vs GNU find 74 | 75 | #### Mid-range server / mechanical storage 76 | 77 | Hardware: 8-core Xeon E5-1630 with 4-drive SATA RAID-10 78 | 79 | Benchmark setup: 80 | 81 | ```shell 82 | $ cat bench1.sh 83 | #!/bin/dash 84 | exec /usr/bin/find / -xdev -type d -size +200000c 85 | 86 | $ cat bench2.sh 87 | #!/bin/dash 88 | exec /usr/local/sbin/findlargedir / 89 | ``` 90 | 91 | Actual results measured with [hyperfine](https://github.com/sharkdp/hyperfine): 92 | 93 | ```shell 94 | $ hyperfine --prepare 'echo 3 | tee /proc/sys/vm/drop_caches' \ 95 | ./bench1.sh ./bench2.sh 96 | 97 | Benchmark 1: ./bench1.sh 98 | Time (mean ± σ): 357.040 s ± 7.176 s [User: 2.324 s, System: 13.881 s] 99 | Range (min … max): 349.639 s … 367.636 s 10 runs 100 | 101 | Benchmark 2: ./bench2.sh 102 | Time (mean ± σ): 199.751 s ± 4.431 s [User: 75.163 s, System: 141.271 s] 103 | Range (min … max): 190.136 s … 203.432 s 10 runs 104 | 105 | Summary 106 | './bench2.sh' ran 107 | 1.79 ± 0.05 times faster than './bench1.sh' 108 | ``` 109 | 110 | #### High-end server / SSD storage 111 | 112 | Hardware: 48-core Xeon Silver 4214, 7-drive SM883 SATA HW RAID-5 array, 2 TB of content (a dozen containers with small files) 113 | 114 | Same benchmark setup. Results: 115 | 116 | ```shell 117 | $ hyperfine --prepare 'echo 3 | tee /proc/sys/vm/drop_caches' \ 118 | ./bench1.sh ./bench2.sh 119 | 120 | Benchmark 1: ./bench1.sh 121 | Time (mean ± σ): 392.433 s ± 1.952 s [User: 16.056 s, System: 81.994 s] 122 | Range (min … max): 390.284 s … 395.732 s 10 runs 123 | 124 | Benchmark 2: ./bench2.sh 125 | Time (mean ± σ): 34.650 s ± 0.469 s [User: 79.441 s, System: 528.939 s] 126 | Range (min … max): 34.049 s … 35.388 s 10 runs 127 | 128 | Summary 129 | './bench2.sh' ran 130 | 11.33 ± 0.16 times faster than './bench1.sh' 131 | ``` 132 | 133 | ## Star history 134 | 135 | [![Star History Chart](https://api.star-history.com/svg?repos=dkorunic/findlargedir&type=Date)](https://star-history.com/#dkorunic/findlargedir&Date) 136 | --------------------------------------------------------------------------------
/src/walk.rs: -------------------------------------------------------------------------------- 1 | use std::fs::Metadata; 2 | use std::fs::read_dir; 3 | use std::os::unix::fs::MetadataExt; 4 | use std::path::Path; 5 | use std::path::PathBuf; 6 | use std::process; 7 | use std::sync::Arc; 8 | use std::sync::atomic::{AtomicBool, AtomicU64, Ordering}; 9 | use std::thread::sleep; 10 | use std::time::Duration; 11 | 12 | use crate::args::Args; 13 | use ahash::AHashSet; 14 | use ansi_term::Colour::{Green, Red, Yellow}; 15 | use fs_err as fs; 16 | use human_format::Formatter; 17 | use ignore::{DirEntry, Error, WalkBuilder, WalkState}; 18 | use indicatif::HumanBytes; 19 | 20 | /// Default number of files in a folder to cause alert 21 | pub const ALERT_COUNT: u64 = 10_000; 22 | 23 | /// Default number of files in a folder to cause red alert and further blacklist from the deeper 24 | /// scan 25 | pub const BLACKLIST_COUNT: u64 = 100_000; 26 | 27 | /// Default exit error code in case of premature termination 28 | const ERROR_EXIT: i32 = 1; 29 | 30 | /// Default status update period in seconds 31 | pub const STATUS_SECONDS: u64 = 20; 32 | 33 | /// Perform a parallel filesystem search based on specified criteria and arguments. 34 | /// 35 | /// # Arguments 36 | /// * `path` - A reference to the starting path for the filesystem search. 37 | /// * `path_metadata` - A reference to the metadata of the starting path.
38 | /// * `size_inode_ratio` - The ratio used to calculate the approximate number of files in a directory. 39 | /// * `shutdown_walk` - A shared reference to a boolean flag indicating if the search should be terminated. 40 | /// * `args` - A shared reference to the command-line arguments provided. 41 | /// 42 | /// # Returns 43 | /// The total count of processed directories during the filesystem search. 44 | /// 45 | /// # Behaviors 46 | /// - Creates a hash set of paths to be excluded from scanning. 47 | /// - Initializes a thread pool for status reporting and filesystem traversal. 48 | /// - Updates the processed directory count based on the status update interval. 49 | /// - Initiates the parallel filesystem walk using specified parameters. 50 | /// - Terminates the search if a shutdown signal is received. 51 | /// - Processes each directory entry encountered during the search. 52 | /// 53 | /// # Types 54 | /// * `path` - `&PathBuf` 55 | /// * `path_metadata` - `&Metadata` 56 | /// * `size_inode_ratio` - `u64` 57 | /// * `shutdown_walk` - `&Arc<AtomicBool>` 58 | /// * `args` - `&Arc<Args>` 59 | /// * Return Type - `u64` 60 | pub fn parallel_search( 61 | path: &PathBuf, 62 | path_metadata: &Metadata, 63 | size_inode_ratio: u64, 64 | shutdown_walk: &Arc<AtomicBool>, 65 | args: &Arc<Args>, 66 | ) -> u64 { 67 | // Create hash set for path exclusions 68 | let skip_path = &args.skip_path.iter().cloned().collect::<AHashSet<PathBuf>>(); 69 | 70 | // Thread pool for status reporting and filesystem walk 71 | let pool = Arc::new( 72 | rayon::ThreadPoolBuilder::new() 73 | .num_threads(1) 74 | .build() 75 | .expect("Unable to spawn reporting thread pool"), 76 | ); 77 | 78 | // Processed directory count 79 | let dir_count = &Arc::new(AtomicU64::new(0)); 80 | 81 | // Status update thread 82 | if args.updates > 0 { 83 | let dir_count = dir_count.clone(); 84 | let sleep_delay = args.updates; 85 | 86 | pool.spawn(move || loop { 87 | sleep(Duration::from_secs(sleep_delay)); 88 | 89 | let count = dir_count.load(Ordering::Acquire); 90 | println!( 91 | "Processed {} directories so far, next update in {} seconds", 92 | Green.paint(count.to_string()), 93 | sleep_delay 94 | ); 95 | }); 96 | } 97 | 98 | // Perform target filesystem walking 99 | WalkBuilder::new(path) 100 | .hidden(false) 101 | .standard_filters(false) 102 | .follow_links(args.follow_symlinks) 103 | .threads(args.threads) 104 | .build_parallel() 105 | .run(|| { 106 | Box::new({ 107 | move |dir_entry_result| { 108 | // Terminate on received interrupt signal 109 | if shutdown_walk.load(Ordering::Relaxed) { 110 | println!("Requested program exit, stopping scan..."); 111 | 112 | process::exit(ERROR_EXIT); 113 | } 114 | 115 | process_dir_entry( 116 | path_metadata, 117 | size_inode_ratio, 118 | &dir_entry_result, 119 | skip_path, 120 | args, 121 | dir_count, 122 | ) 123 | } 124 | }) 125 | }); 126 | 127 | dir_count.load(Ordering::Acquire) 128 | } 129 | 130 | /// Processes a directory entry based on specified criteria and arguments. 131 | /// 132 | /// # Arguments 133 | /// * `path_metadata` - A reference to the metadata of the current directory. 134 | /// * `size_inode_ratio` - The ratio used to calculate the approximate number of files in the directory. 135 | /// * `dir_entry_result` - The result of attempting to read a directory entry. 136 | /// * `skip_path` - A set of paths to be excluded from scanning. 137 | /// * `args` - A shared reference to the command-line arguments provided. 138 | /// * `dir_count` - A shared reference to the atomic counter for visited directories.
139 | /// 140 | /// # Returns 141 | /// The state of the directory processing, indicating whether to continue, skip, or stop scanning. 142 | /// 143 | /// # Behaviors 144 | /// - Checks if the directory entry is a directory; if not, continues to the next entry. 145 | /// - Increments the visited directory count. 146 | /// - Skips scanning if the directory is in the skip path list. 147 | /// - Skips scanning if the directory is on a different filesystem and the `one_filesystem` flag is set. 148 | /// - Calculates the size and approximate file count of the directory entry. 149 | /// - Prints warnings and potentially marks the directory as an offender based on file count thresholds. 150 | /// - Returns the appropriate state for further scanning based on the calculated conditions. 151 | /// 152 | /// # Types 153 | /// * `path_metadata` - `&Metadata` 154 | /// * `size_inode_ratio` - `u64` 155 | /// * `dir_entry_result` - `&Result<DirEntry, Error>` 156 | /// * `skip_path` - `&AHashSet<PathBuf>` 157 | /// * `args` - `&Arc<Args>` 158 | /// * `dir_count` - `&Arc<AtomicU64>` 159 | /// * Return Type - `WalkState` 160 | fn process_dir_entry( 161 | path_metadata: &Metadata, 162 | size_inode_ratio: u64, 163 | dir_entry_result: &Result<DirEntry, Error>, 164 | skip_path: &AHashSet<PathBuf>, 165 | args: &Arc<Args>, 166 | dir_count: &Arc<AtomicU64>, 167 | ) -> WalkState { 168 | if let Ok(dir_entry) = dir_entry_result 169 | && let Some(dir_entry_type) = dir_entry.file_type() 170 | { 171 | if !dir_entry_type.is_dir() { 172 | return WalkState::Continue; 173 | } 174 | 175 | let full_path = dir_entry.path(); 176 | 177 | // Visited directory count 178 | dir_count.fetch_add(1, Ordering::AcqRel); 179 | 180 | // Ignore skip paths, typically being virtual filesystems (/proc, /dev, /sys, /run) 181 | if !skip_path.is_empty() 182 | && skip_path.contains(&full_path.to_path_buf()) 183 | { 184 | println!( 185 | "Skipping further scan at {} as requested", 186 | full_path.display() 187 | ); 188 | 189 | return WalkState::Skip; 190 | } 191 | 192 | // Retrieve Unix metadata for a given directory 193 | if let Ok(dir_entry_metadata) = fs::metadata(full_path) { 194 | // If `one_filesystem` flag has been set and if directory is not residing 195 | // on the same device as top search path, print warning and abort deeper 196 | // scanning 197 | if args.one_filesystem 198 | && (dir_entry_metadata.dev() != path_metadata.dev()) 199 | { 200 | println!( 201 | "Identified filesystem boundary at {}, skipping...", 202 | full_path.display() 203 | ); 204 | 205 | return WalkState::Skip; 206 | } 207 | 208 | // Identify size and calculate approximate directory entry count 209 | let size = dir_entry_metadata.size(); 210 | let approx_files = size / size_inode_ratio; 211 | 212 | // Print count warnings if necessary 213 | if approx_files > args.blacklist_threshold { 214 | print_offender( 215 | full_path, 216 | size, 217 | approx_files, 218 | args.accurate, 219 | true, 220 | ); 221 | 222 | return WalkState::Skip; 223 | } else if approx_files > args.alert_threshold { 224 | print_offender( 225 | full_path, 226 | size, 227 | approx_files, 228 | args.accurate, 229 | false, 230 | ); 231 | 232 | return WalkState::Continue; 233 | } 234 | } 235 | } 236 | 237 | WalkState::Continue 238 | } 239 | 240 | #[allow(clippy::cast_precision_loss)] 241 | /// Prints information about directories that exceed specified thresholds. 242 | /// 243 | /// This function is called when the estimated number of files in a directory exceeds either the alert or blacklist thresholds.
244 | /// It outputs details about the directory, its inode size and its file count, highlighting blacklisted directories in red. 245 | /// 246 | /// # Arguments 247 | /// * `full_path` - The path of the directory being evaluated. 248 | /// * `size` - The size of the directory inode in bytes. 249 | /// * `approx_files` - The estimated number of files in the directory. 250 | /// * `accurate` - A boolean flag indicating whether to count the exact number of entries instead of printing the estimate. 251 | /// * `red_alert` - A boolean flag indicating whether the directory exceeds the blacklist threshold. 252 | fn print_offender( 253 | full_path: &Path, 254 | size: u64, 255 | approx_files: u64, 256 | accurate: bool, 257 | red_alert: bool, 258 | ) { 259 | // Pretty print either the accurate directory count or the approximation 260 | let human_files = if accurate { 261 | let exact_files = match read_dir(full_path) { 262 | Ok(r) => r.count() as u64, 263 | Err(_) => approx_files, 264 | }; 265 | Formatter::new().format(exact_files as f64) 266 | } else { 267 | Formatter::new().format(approx_files as f64) 268 | }; 269 | 270 | println!( 271 | "Found directory {} with inode size {} and {}{} files", 272 | full_path.display(), 273 | HumanBytes(size), 274 | if accurate { "" } else { "approx " }, 275 | if red_alert { 276 | Red.paint(human_files) 277 | } else { 278 | Yellow.paint(human_files) 279 | } 280 | ); 281 | } 282 | --------------------------------------------------------------------------------
/.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | # This file was autogenerated by dist: https://axodotdev.github.io/cargo-dist 2 | # 3 | # Copyright 2022-2024, axodotdev 4 | # SPDX-License-Identifier: MIT or Apache-2.0 5 | # 6 | # CI that: 7 | # 8 | # * checks for a Git Tag that looks like a release 9 | # * builds artifacts with dist (archives, installers, hashes) 10 | # * uploads those artifacts to temporary workflow zip 11 | # * on success, uploads the artifacts to a GitHub Release 12 | # 13 | # Note that the GitHub Release will be created with a generated 14 | # title/body based on your changelogs. 15 | 16 | name: Release 17 | permissions: 18 | "contents": "write" 19 | 20 | # This task will run whenever you push a git tag that looks like a version 21 | # like "1.0.0", "v0.1.0-prerelease.1", "my-app/0.1.0", "releases/v1.0.0", etc. 22 | # Various formats will be parsed into a VERSION and an optional PACKAGE_NAME, where 23 | # PACKAGE_NAME must be the name of a Cargo package in your workspace, and VERSION 24 | # must be a Cargo-style SemVer Version (must have at least major.minor.patch). 25 | # 26 | # If PACKAGE_NAME is specified, then the announcement will be for that 27 | # package (erroring out if it doesn't have the given version or isn't dist-able). 28 | # 29 | # If PACKAGE_NAME isn't specified, then the announcement will be for all 30 | # (dist-able) packages in the workspace with that version (this mode is 31 | # intended for workspaces with only one dist-able package, or with all dist-able 32 | # packages versioned/released in lockstep). 33 | # 34 | # If you push multiple tags at once, separate instances of this workflow will 35 | # spin up, creating an independent announcement for each one. However, GitHub 36 | # will hard limit this to 3 tags per commit, as it will assume more tags is a 37 | # mistake. 38 | # 39 | # If there's a prerelease-style suffix to the version, then the release(s) 40 | # will be marked as a prerelease.
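#
# For example (illustrative tags using this package's name and version):
#   "v0.10.3" or "0.10.3" announce every dist-able package at that version,
#   "findlargedir/0.10.3" announces only the findlargedir package, and
#   "v0.11.0-prerelease.1" would be marked as a prerelease.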
41 | on: 42 | pull_request: 43 | push: 44 | tags: 45 | - '**[0-9]+.[0-9]+.[0-9]+*' 46 | 47 | jobs: 48 | # Run 'dist plan' (or host) to determine what tasks we need to do 49 | plan: 50 | runs-on: "ubuntu-22.04" 51 | outputs: 52 | val: ${{ steps.plan.outputs.manifest }} 53 | tag: ${{ !github.event.pull_request && github.ref_name || '' }} 54 | tag-flag: ${{ !github.event.pull_request && format('--tag={0}', github.ref_name) || '' }} 55 | publishing: ${{ !github.event.pull_request }} 56 | env: 57 | GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} 58 | steps: 59 | - uses: actions/checkout@v4 60 | with: 61 | persist-credentials: false 62 | submodules: recursive 63 | - name: Install dist 64 | # we specify bash to get pipefail; it guards against the `curl` command 65 | # failing. otherwise `sh` won't catch that `curl` returned non-0 66 | shell: bash 67 | run: "curl --proto '=https' --tlsv1.2 -LsSf https://github.com/axodotdev/cargo-dist/releases/download/v0.30.2/cargo-dist-installer.sh | sh" 68 | - name: Cache dist 69 | uses: actions/upload-artifact@v4 70 | with: 71 | name: cargo-dist-cache 72 | path: ~/.cargo/bin/dist 73 | # sure would be cool if github gave us proper conditionals... 74 | # so here's a doubly-nested ternary-via-truthiness to try to provide the best possible 75 | # functionality based on whether this is a pull_request, and whether it's from a fork. 76 | # (PRs run on the *source* but secrets are usually on the *target* -- that's *good* 77 | # but also really annoying to build CI around when it needs secrets to work right.) 78 | - id: plan 79 | run: | 80 | dist ${{ (!github.event.pull_request && format('host --steps=create --tag={0}', github.ref_name)) || 'plan' }} --output-format=json > plan-dist-manifest.json 81 | echo "dist ran successfully" 82 | cat plan-dist-manifest.json 83 | echo "manifest=$(jq -c "." plan-dist-manifest.json)" >> "$GITHUB_OUTPUT" 84 | - name: "Upload dist-manifest.json" 85 | uses: actions/upload-artifact@v4 86 | with: 87 | name: artifacts-plan-dist-manifest 88 | path: plan-dist-manifest.json 89 | 90 | # Build and packages all the platform-specific things 91 | build-local-artifacts: 92 | name: build-local-artifacts (${{ join(matrix.targets, ', ') }}) 93 | # Let the initial task tell us to not run (currently very blunt) 94 | needs: 95 | - plan 96 | if: ${{ fromJson(needs.plan.outputs.val).ci.github.artifacts_matrix.include != null && (needs.plan.outputs.publishing == 'true' || fromJson(needs.plan.outputs.val).ci.github.pr_run_mode == 'upload') }} 97 | strategy: 98 | fail-fast: false 99 | # Target platforms/runners are computed by dist in create-release. 
100 | # Each member of the matrix has the following arguments: 101 | # 102 | # - runner: the github runner 103 | # - dist-args: cli flags to pass to dist 104 | # - install-dist: expression to run to install dist on the runner 105 | # 106 | # Typically there will be: 107 | # - 1 "global" task that builds universal installers 108 | # - N "local" tasks that build each platform's binaries and platform-specific installers 109 | matrix: ${{ fromJson(needs.plan.outputs.val).ci.github.artifacts_matrix }} 110 | runs-on: ${{ matrix.runner }} 111 | container: ${{ matrix.container && matrix.container.image || null }} 112 | env: 113 | GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} 114 | BUILD_MANIFEST_NAME: target/distrib/${{ join(matrix.targets, '-') }}-dist-manifest.json 115 | steps: 116 | - name: enable windows longpaths 117 | run: | 118 | git config --global core.longpaths true 119 | - uses: actions/checkout@v4 120 | with: 121 | persist-credentials: false 122 | submodules: recursive 123 | - name: Install Rust non-interactively if not already installed 124 | if: ${{ matrix.container }} 125 | run: | 126 | if ! command -v cargo > /dev/null 2>&1; then 127 | curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y 128 | echo "$HOME/.cargo/bin" >> $GITHUB_PATH 129 | fi 130 | - name: Install dist 131 | run: ${{ matrix.install_dist.run }} 132 | # Get the dist-manifest 133 | - name: Fetch local artifacts 134 | uses: actions/download-artifact@v4 135 | with: 136 | pattern: artifacts-* 137 | path: target/distrib/ 138 | merge-multiple: true 139 | - name: Install dependencies 140 | run: | 141 | ${{ matrix.packages_install }} 142 | - name: Build artifacts 143 | run: | 144 | # Actually do builds and make zips and whatnot 145 | dist build ${{ needs.plan.outputs.tag-flag }} --print=linkage --output-format=json ${{ matrix.dist_args }} > dist-manifest.json 146 | echo "dist ran successfully" 147 | - id: cargo-dist 148 | name: Post-build 149 | # We force bash here just because github makes it really hard to get values up 150 | # to "real" actions without writing to env-vars, and writing to env-vars has 151 | # inconsistent syntax between shell and powershell. 152 | shell: bash 153 | run: | 154 | # Parse out what we just built and upload it to scratch storage 155 | echo "paths<<EOF" >> "$GITHUB_OUTPUT" 156 | dist print-upload-files-from-manifest --manifest dist-manifest.json >> "$GITHUB_OUTPUT" 157 | echo "EOF" >> "$GITHUB_OUTPUT" 158 | 159 | cp dist-manifest.json "$BUILD_MANIFEST_NAME" 160 | - name: "Upload artifacts" 161 | uses: actions/upload-artifact@v4 162 | with: 163 | name: artifacts-build-local-${{ join(matrix.targets, '_') }} 164 | path: | 165 | ${{ steps.cargo-dist.outputs.paths }} 166 | ${{ env.BUILD_MANIFEST_NAME }} 167 | 168 | # Build and package all the platform-agnostic(ish) things 169 | build-global-artifacts: 170 | needs: 171 | - plan 172 | - build-local-artifacts 173 | runs-on: "ubuntu-22.04" 174 | env: 175 | GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} 176 | BUILD_MANIFEST_NAME: target/distrib/global-dist-manifest.json 177 | steps: 178 | - uses: actions/checkout@v4 179 | with: 180 | persist-credentials: false 181 | submodules: recursive 182 | - name: Install cached dist 183 | uses: actions/download-artifact@v4 184 | with: 185 | name: cargo-dist-cache 186 | path: ~/.cargo/bin/ 187 | - run: chmod +x ~/.cargo/bin/dist 188 | # Get all the local artifacts for the global tasks to use (for e.g.
checksums) 189 | - name: Fetch local artifacts 190 | uses: actions/download-artifact@v4 191 | with: 192 | pattern: artifacts-* 193 | path: target/distrib/ 194 | merge-multiple: true 195 | - id: cargo-dist 196 | shell: bash 197 | run: | 198 | dist build ${{ needs.plan.outputs.tag-flag }} --output-format=json "--artifacts=global" > dist-manifest.json 199 | echo "dist ran successfully" 200 | 201 | # Parse out what we just built and upload it to scratch storage 202 | echo "paths<<EOF" >> "$GITHUB_OUTPUT" 203 | jq --raw-output ".upload_files[]" dist-manifest.json >> "$GITHUB_OUTPUT" 204 | echo "EOF" >> "$GITHUB_OUTPUT" 205 | 206 | cp dist-manifest.json "$BUILD_MANIFEST_NAME" 207 | - name: "Upload artifacts" 208 | uses: actions/upload-artifact@v4 209 | with: 210 | name: artifacts-build-global 211 | path: | 212 | ${{ steps.cargo-dist.outputs.paths }} 213 | ${{ env.BUILD_MANIFEST_NAME }} 214 | # Determines if we should publish/announce 215 | host: 216 | needs: 217 | - plan 218 | - build-local-artifacts 219 | - build-global-artifacts 220 | # Only run if we're "publishing", and only if plan, local and global didn't fail (skipped is fine) 221 | if: ${{ always() && needs.plan.result == 'success' && needs.plan.outputs.publishing == 'true' && (needs.build-global-artifacts.result == 'skipped' || needs.build-global-artifacts.result == 'success') && (needs.build-local-artifacts.result == 'skipped' || needs.build-local-artifacts.result == 'success') }} 222 | env: 223 | GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} 224 | runs-on: "ubuntu-22.04" 225 | outputs: 226 | val: ${{ steps.host.outputs.manifest }} 227 | steps: 228 | - uses: actions/checkout@v4 229 | with: 230 | persist-credentials: false 231 | submodules: recursive 232 | - name: Install cached dist 233 | uses: actions/download-artifact@v4 234 | with: 235 | name: cargo-dist-cache 236 | path: ~/.cargo/bin/ 237 | - run: chmod +x ~/.cargo/bin/dist 238 | # Fetch artifacts from scratch-storage 239 | - name: Fetch artifacts 240 | uses: actions/download-artifact@v4 241 | with: 242 | pattern: artifacts-* 243 | path: target/distrib/ 244 | merge-multiple: true 245 | - id: host 246 | shell: bash 247 | run: | 248 | dist host ${{ needs.plan.outputs.tag-flag }} --steps=upload --steps=release --output-format=json > dist-manifest.json 249 | echo "artifacts uploaded and released successfully" 250 | cat dist-manifest.json 251 | echo "manifest=$(jq -c "."
dist-manifest.json)" >> "$GITHUB_OUTPUT" 252 | - name: "Upload dist-manifest.json" 253 | uses: actions/upload-artifact@v4 254 | with: 255 | # Overwrite the previous copy 256 | name: artifacts-dist-manifest 257 | path: dist-manifest.json 258 | # Create a GitHub Release while uploading all files to it 259 | - name: "Download GitHub Artifacts" 260 | uses: actions/download-artifact@v4 261 | with: 262 | pattern: artifacts-* 263 | path: artifacts 264 | merge-multiple: true 265 | - name: Cleanup 266 | run: | 267 | # Remove the granular manifests 268 | rm -f artifacts/*-dist-manifest.json 269 | - name: Create GitHub Release 270 | env: 271 | PRERELEASE_FLAG: "${{ fromJson(steps.host.outputs.manifest).announcement_is_prerelease && '--prerelease' || '' }}" 272 | ANNOUNCEMENT_TITLE: "${{ fromJson(steps.host.outputs.manifest).announcement_title }}" 273 | ANNOUNCEMENT_BODY: "${{ fromJson(steps.host.outputs.manifest).announcement_github_body }}" 274 | RELEASE_COMMIT: "${{ github.sha }}" 275 | run: | 276 | # Write and read notes from a file to avoid quoting breaking things 277 | echo "$ANNOUNCEMENT_BODY" > $RUNNER_TEMP/notes.txt 278 | 279 | gh release create "${{ needs.plan.outputs.tag }}" --target "$RELEASE_COMMIT" $PRERELEASE_FLAG --title "$ANNOUNCEMENT_TITLE" --notes-file "$RUNNER_TEMP/notes.txt" artifacts/* 280 | 281 | announce: 282 | needs: 283 | - plan 284 | - host 285 | # use "always() && ..." to allow us to wait for all publish jobs while 286 | # still allowing individual publish jobs to skip themselves (for prereleases). 287 | # "host" however must run to completion, no skipping allowed! 288 | if: ${{ always() && needs.host.result == 'success' }} 289 | runs-on: "ubuntu-22.04" 290 | env: 291 | GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} 292 | steps: 293 | - uses: actions/checkout@v4 294 | with: 295 | persist-credentials: false 296 | submodules: recursive 297 | --------------------------------------------------------------------------------