├── rust-toolchain.toml ├── rustfmt.toml ├── demo.gif ├── ferris.png ├── .github ├── dependabot.yml └── workflows │ └── release.yml ├── ci └── cargo-out-dir ├── .gitignore ├── dist-workspace.toml ├── src ├── progress.rs ├── interrupt.rs ├── calibrate.rs ├── args.rs ├── main.rs └── walk.rs ├── LICENSE ├── Cargo.toml └── README.md
/rust-toolchain.toml: -------------------------------------------------------------------------------- 1 | [toolchain] 2 | channel = "1.90" 3 | --------------------------------------------------------------------------------
/rustfmt.toml: -------------------------------------------------------------------------------- 1 | max_width = 79 2 | use_small_heuristics = "max" 3 | --------------------------------------------------------------------------------
/demo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dkorunic/findlargedir/HEAD/demo.gif --------------------------------------------------------------------------------
/ferris.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dkorunic/findlargedir/HEAD/ferris.png --------------------------------------------------------------------------------
/.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: "github-actions" 4 | directory: "/" 5 | schedule: 6 | interval: "weekly" 7 | day: "sunday" 8 | time: "22:00" 9 | open-pull-requests-limit: 10 10 | --------------------------------------------------------------------------------
/ci/cargo-out-dir: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Finds Cargo's `OUT_DIR` directory from the most recent build. 4 | # 5 | # This requires one parameter corresponding to the target directory 6 | # to search for the build output. 7 | 8 | if [ $# != 1 ]; then 9 | echo "Usage: $(basename "$0") <target-dir>" >&2 10 | exit 2 11 | fi 12 | 13 | # This works by finding the most recently modified findlargedir binary 14 | # under the given target directory and printing its parent directory.
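# Example (hypothetical invocation and output, for illustration only):
#   $ ./ci/cargo-out-dir ./target
#   ./target/release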
15 | target_dir="$1" 16 | find "$target_dir" -type f -name findlargedir -print0 \ 17 | | xargs -0 ls -t \ 18 | | head -n1 \ 19 | | xargs dirname 20 | --------------------------------------------------------------------------------
/.gitignore: -------------------------------------------------------------------------------- 1 | ### Rust ### 2 | # Generated by Cargo 3 | # will have compiled files and executables 4 | debug/ 5 | target/ 6 | 7 | # Remove Cargo.lock from gitignore if creating an executable, leave it for libraries 8 | # More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html 9 | Cargo.lock 10 | 11 | # These are backup files generated by rustfmt 12 | **/*.rs.bk 13 | 14 | # MSVC Windows builds of rustc generate these, which store debugging information 15 | *.pdb 16 | 17 | # IntelliJ tools 18 | .idea/ 19 | 20 | # End of https://www.toptal.com/developers/gitignore/api/rust 21 | --------------------------------------------------------------------------------
/dist-workspace.toml: -------------------------------------------------------------------------------- 1 | [workspace] 2 | members = ["cargo:."] 3 | 4 | # Config for 'dist' 5 | [dist] 6 | # The preferred dist version to use in CI (Cargo.toml SemVer syntax) 7 | cargo-dist-version = "0.30.2" 8 | # CI backends to support 9 | ci = "github" 10 | # The installers to generate for each app 11 | installers = ["shell"] 12 | # Target platforms to build apps for (Rust target-triple syntax) 13 | targets = ["aarch64-apple-darwin", "aarch64-unknown-linux-gnu", "x86_64-apple-darwin", "x86_64-unknown-linux-gnu", "x86_64-unknown-linux-musl"] 14 | # Path that installers should place binaries in 15 | install-path = "CARGO_HOME" 16 | # Whether to install an updater program 17 | install-updater = false 18 | --------------------------------------------------------------------------------
/src/progress.rs: -------------------------------------------------------------------------------- 1 | use std::borrow::Cow; 2 | use std::time::Duration; 3 | 4 | use indicatif::{ProgressBar, ProgressStyle}; 5 | 6 | /// Default tick chars 7 | const PROGRESS_CHARS: &str = "⠋⠙⠹⠸⠼⠴⠦⠧⠇⠏"; 8 | 9 | /// Default tick in milliseconds 10 | const PROGRESS_TICK: u64 = 80; 11 | 12 | /// Initializes a new `ProgressBar` with a spinner style. 13 | /// 14 | /// # Arguments 15 | /// * `msg` - A message of generic type `S` that implements `Into<Cow<'static, str>>`, which will be displayed on the spinner. 16 | /// 17 | /// # Returns 18 | /// Returns a `ProgressBar` object configured with a steady tick and custom spinner style.
19 | /// 20 | /// # Examples 21 | /// ``` 22 | /// let spinner = new_spinner("Loading..."); 23 | /// ``` 24 | pub fn new_spinner<S>(msg: S) -> ProgressBar 25 | where 26 | S: Into<Cow<'static, str>>, 27 | { 28 | let pb = ProgressBar::new_spinner(); 29 | pb.enable_steady_tick(Duration::from_millis(PROGRESS_TICK)); 30 | pb.set_style(ProgressStyle::default_spinner().tick_chars(PROGRESS_CHARS)); 31 | pb.set_message(msg.into()); 32 | 33 | pb 34 | } 35 | --------------------------------------------------------------------------------
/LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2022 Dinko Korunic 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | --------------------------------------------------------------------------------
/src/interrupt.rs: -------------------------------------------------------------------------------- 1 | use std::sync::Arc; 2 | use std::sync::atomic::AtomicBool; 3 | 4 | use anyhow::{Context, Error}; 5 | use signal_hook::consts::TERM_SIGNALS; 6 | use signal_hook::flag::register; 7 | 8 | /// Sets up a handler for process interruption signals (each signal in `TERM_SIGNALS`). 9 | /// This function configures a handler that will set a shared atomic boolean to `true` 10 | /// whenever an interruption signal is received, indicating that the process should shut down. 11 | /// 12 | /// # Arguments 13 | /// * `shutdown` - An `&Arc<AtomicBool>` shared among threads, used to signal shutdown when set to `true`. 14 | /// 15 | /// # Returns 16 | /// Returns `Ok(())` if the handler is successfully set, or an `Error` if any issues occur during setup. 17 | /// 18 | /// # Errors 19 | /// Returns an error if the signal handler cannot be set, encapsulated in an `anyhow::Error`.
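///
/// # Examples
/// An illustrative usage sketch (added for clarity; it mirrors how `main.rs` invokes this function):
/// ```
/// let shutdown = Arc::new(AtomicBool::new(false));
/// setup_interrupt_handler(&shutdown)?;
/// ```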
20 | pub fn setup_interrupt_handler( 21 | shutdown: &Arc<AtomicBool>, 22 | ) -> Result<(), Error> { 23 | for sig in TERM_SIGNALS { 24 | let name = 25 | signal_hook::low_level::signal_name(*sig).unwrap_or_default(); 26 | register(*sig, shutdown.clone()).with_context(|| { 27 | format!("Unable to register signal handler for {name}/{sig}") 28 | })?; 29 | } 30 | 31 | Ok(()) 32 | } 33 | --------------------------------------------------------------------------------
/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "findlargedir" 3 | version = "0.10.3" 4 | authors = ["Dinko Korunic <dinko.korunic@gmail.com>"] 5 | categories = ["command-line-utilities"] 6 | description = "find all blackhole directories with a huge amount of filesystem entries in a flat structure" 7 | repository = "https://github.com/dkorunic/findlargedir" 8 | homepage = "https://github.com/dkorunic/findlargedir" 9 | readme = "README.md" 10 | license = "MIT" 11 | exclude = [".gitignore"] 12 | edition = "2024" 13 | rust-version = "1.88.0" 14 | 15 | [dependencies] 16 | mimalloc = "0.1.48" 17 | rayon = "1.11.0" 18 | tempfile = "3.23.0" 19 | anyhow = "1.0.100" 20 | human_format = "1.1.0" 21 | clap = { version = "4.5.51", features = ["derive", "unicode", "wrap_help"] } 22 | rm_rf = "0.6.2" 23 | ansi_term = "0.12.1" 24 | fs-err = "3.1.3" 25 | indicatif = { version = "0.18.2", features = ["rayon"] } 26 | fdlimit = "0.3.0" 27 | ahash = "0.8.12" 28 | anstyle = "1.0.13" 29 | signal-hook = "0.3.18" 30 | ignore = "0.4.25" 31 | normpath = "1.5.0" 32 | 33 | [profile.release] 34 | opt-level = 3 35 | debug = "none" 36 | strip = "symbols" 37 | debug-assertions = false 38 | overflow-checks = true 39 | lto = "fat" 40 | panic = "abort" 41 | codegen-units = 1 42 | 43 | # The profile that 'dist' will build with 44 | [profile.dist] 45 | inherits = "release" 46 | lto = "fat" 47 | --------------------------------------------------------------------------------
/src/calibrate.rs: -------------------------------------------------------------------------------- 1 | use std::fs::File; 2 | use std::os::unix::fs::MetadataExt; 3 | use std::path::Path; 4 | use std::process; 5 | use std::sync::Arc; 6 | use std::sync::atomic::{AtomicBool, Ordering}; 7 | 8 | use anyhow::{Context, Error}; 9 | use fs_err as fs; 10 | use rayon::prelude::*; 11 | use rm_rf::ensure_removed; 12 | 13 | use crate::{args, progress}; 14 | 15 | /// Default number of files to create in the calibration directory 16 | pub const DEFAULT_TEST_COUNT: u64 = 100_000; 17 | 18 | /// Default exit error code in case of premature termination 19 | const ERROR_EXIT: i32 = 1; 20 | 21 | /// Calculates the size-to-inode ratio for a given directory. 22 | /// 23 | /// This function initiates a calibration process by creating a specified number of files 24 | /// within the `test_path` directory to determine the average file size to inode ratio. 25 | /// It uses a multi-threaded approach to create files and monitors for a shutdown signal 26 | /// to safely terminate and clean up if necessary. 27 | /// 28 | /// # Arguments 29 | /// * `test_path` - A reference to the path where test files will be created. 30 | /// * `shutdown` - A shared atomic boolean to signal shutdown and cleanup. 31 | /// * `args` - A shared structure containing runtime arguments such as the number of threads 32 | /// and the number of files to create for calibration.
33 | /// 34 | /// # Returns 35 | /// Returns a `Result<u64, Error>` which is the calculated size-to-inode ratio if successful, 36 | /// or an error if the operation fails at any step. 37 | /// 38 | /// # Errors 39 | /// This function can return an error if it fails to create the thread pool, create files, 40 | /// delete the directory, or retrieve metadata from the test directory. 41 | /// 42 | /// # Examples 43 | /// ``` 44 | /// let test_path = Path::new("/tmp/test_dir"); 45 | /// let shutdown = Arc::new(AtomicBool::new(false)); 46 | /// let args = Arc::new(args::Args { 47 | /// threads: 4, 48 | /// calibration_count: 1000, 49 | /// }); 50 | /// let ratio = get_inode_ratio(&test_path, &shutdown, &args); 51 | /// match ratio { 52 | /// Ok(ratio) => println!("Size-to-inode ratio: {}", ratio), 53 | /// Err(e) => println!("Failed to calculate size-to-inode ratio: {}", e), 54 | /// } 55 | /// ``` 56 | pub fn get_inode_ratio( 57 | test_path: &Path, 58 | shutdown: &Arc<AtomicBool>, 59 | args: &Arc<args::Args>, 60 | ) -> Result<u64, Error> { 61 | println!("Starting test directory calibration in {}", test_path.display(),); 62 | 63 | // Thread pool for mass file creation 64 | let pool = rayon::ThreadPoolBuilder::new() 65 | .num_threads(args.threads) 66 | .build() 67 | .context("Unable to spawn calibration thread pool")?; 68 | 69 | let pb = progress::new_spinner("Creating test files in progress..."); 70 | 71 | // Mass create files; filenames are short to get minimal size to inode ratio 72 | let res: Result<(), Error> = pool.install(|| { 73 | (0..args.calibration_count).into_par_iter().try_for_each(|i| { 74 | if !shutdown.load(Ordering::Relaxed) { 75 | File::create(test_path.join(i.to_string())) 76 | .context("Unable to create test file")?; 77 | } 78 | 79 | Ok(()) 80 | }) 81 | }); 82 | 83 | pb.finish_with_message("Done."); 84 | 85 | // Check for calibration errors 86 | if let Err(e) = res { 87 | println!("Fatal program error, exiting: {e}"); 88 | 89 | // TempDir cleanup will most likely fail as well 90 | _ = ensure_removed(test_path); 91 | 92 | process::exit(ERROR_EXIT); 93 | } 94 | 95 | // Terminate on received interrupt signal 96 | if shutdown.load(Ordering::Relaxed) { 97 | println!( 98 | "Requested program exit, stopping and deleting temporary files...", 99 | ); 100 | ensure_removed(test_path).expect( 101 | "Unable to completely delete calibration directory, exiting", 102 | ); 103 | 104 | process::exit(ERROR_EXIT); 105 | } 106 | 107 | let size_inode_ratio = fs::metadata(test_path) 108 | .context("Unable to retrieve calibration directory metadata")? 109 | .size() 110 | / args.calibration_count; 111 | println!( 112 | "Calibration done.
Calculated size-to-inode ratio: {size_inode_ratio}" 113 | ); 114 | 115 | Ok(size_inode_ratio) 116 | } 117 | --------------------------------------------------------------------------------
/src/args.rs: -------------------------------------------------------------------------------- 1 | use std::path::{Path, PathBuf}; 2 | use std::thread; 3 | 4 | use anstyle::AnsiColor; 5 | use anyhow::{Error, anyhow}; 6 | use clap::Parser; 7 | use clap::ValueHint; 8 | use clap::builder::{ValueParser, styling::Styles}; 9 | use normpath::PathExt; 10 | 11 | const STYLES: Styles = Styles::styled() 12 | .header(AnsiColor::Yellow.on_default()) 13 | .usage(AnsiColor::Green.on_default()) 14 | .literal(AnsiColor::Green.on_default()) 15 | .placeholder(AnsiColor::Green.on_default()); 16 | 17 | #[derive(Parser, Default, Debug, Clone)] 18 | #[clap(author, version, about, long_about = None, styles=STYLES)] 19 | pub struct Args { 20 | /// Follow symlinks 21 | #[clap(short = 'f', long, action = clap::ArgAction::Set, default_value_t = false)] 22 | pub follow_symlinks: bool, 23 | 24 | /// Perform accurate directory entry counting 25 | #[clap(short = 'a', long, action = clap::ArgAction::Set, default_value_t = false)] 26 | pub accurate: bool, 27 | 28 | /// Do not cross mount points 29 | #[clap(short = 'o', long, action = clap::ArgAction::Set, default_value_t = true)] 30 | pub one_filesystem: bool, 31 | 32 | /// Calibration directory file count 33 | #[clap(short = 'c', long, value_parser, default_value_t = crate::calibrate::DEFAULT_TEST_COUNT)] 34 | pub calibration_count: u64, 35 | 36 | /// Alert threshold count (print the estimate) 37 | #[clap(short = 'A', long, value_parser, default_value_t = crate::walk::ALERT_COUNT)] 38 | pub alert_threshold: u64, 39 | 40 | /// Blacklist threshold count (print the estimate and stop deeper scan) 41 | #[clap(short = 'B', long, value_parser, default_value_t = crate::walk::BLACKLIST_COUNT)] 42 | pub blacklist_threshold: u64, 43 | 44 | /// Number of threads to use when calibrating and scanning 45 | #[clap(short = 'x', long, value_parser = ValueParser::new(parse_threads), default_value_t = thread::available_parallelism().map(|n| n.get()).unwrap_or(2) 46 | )] 47 | pub threads: usize, 48 | 49 | /// Seconds between status updates, set to 0 to disable 50 | #[clap(short = 'p', long, value_parser, default_value_t = crate::walk::STATUS_SECONDS)] 51 | pub updates: u64, 52 | 53 | /// Skip calibration and provide directory entry to inode size ratio (typically ~21-32) 54 | #[clap(short = 'i', long, value_parser, default_value_t = 0u64)] 55 | pub size_inode_ratio: u64, 56 | 57 | /// Custom calibration directory path 58 | #[clap(short = 't', long, value_parser, value_hint = ValueHint::AnyPath)] 59 | pub calibration_path: Option<PathBuf>, 60 | 61 | /// Directories to exclude from scanning 62 | #[clap(short = 's', long, value_parser, value_hint = ValueHint::AnyPath)] 63 | pub skip_path: Vec<PathBuf>, 64 | 65 | /// Paths to check for large directories 66 | #[clap(required = true, value_parser = ValueParser::new(parse_paths), value_hint = ValueHint::AnyPath 67 | )] 68 | pub path: Vec<PathBuf>, 69 | } 70 | 71 | /// Parse and validate threads option 72 | fn parse_threads(x: &str) -> Result<usize, Error> { 73 | match x.parse::<usize>() { 74 | Ok(v) => match v { 75 | v if !(2..=65535).contains(&v) => { 76 | Err(anyhow!("threads should be in (2..65536) range")) 77 | } 78 | v => Ok(v), 79 | }, 80 | Err(e) => Err(Error::from(e)), 81 | } 82 | } 83 | 84 | /// Parses a string into a `PathBuf`, checking if the path is a directory and exists.
85 | /// 86 | /// # Arguments 87 | /// 88 | /// * `x` - A string slice to be parsed into a `PathBuf`. 89 | /// 90 | /// # Returns 91 | /// 92 | /// * `Result<PathBuf, Error>` - An `Ok` variant containing a normalized `PathBuf` if the path is an existing directory, 93 | /// or an `Err` variant with an error message if the path does not exist or is not a directory. 94 | fn parse_paths(x: &str) -> Result<PathBuf, Error> { 95 | let p = Path::new(x); 96 | 97 | if directory_exists(p) { 98 | Ok(p.normalize()?.into_path_buf()) 99 | } else { 100 | Err(anyhow!("'{x}' is not an existing directory")) 101 | } 102 | } 103 | 104 | /// Checks if the given path is a directory and exists. 105 | /// 106 | /// # Arguments 107 | /// 108 | /// * `x` - A reference to the path to check. 109 | /// 110 | /// # Returns 111 | /// 112 | /// * `bool` - `true` if the path is an existing directory, `false` otherwise. 113 | #[inline] 114 | fn directory_exists(x: &Path) -> bool { 115 | x.is_dir() && x.normalize().is_ok() 116 | } 117 | --------------------------------------------------------------------------------
/src/main.rs: -------------------------------------------------------------------------------- 1 | #![warn(clippy::all, clippy::pedantic)] 2 | 3 | use std::os::unix::fs::MetadataExt; 4 | use std::sync::Arc; 5 | use std::sync::atomic::AtomicBool; 6 | use std::time::Instant; 7 | 8 | use ahash::AHashSet; 9 | use anyhow::{Context, Error, Result}; 10 | use clap::Parser; 11 | use fdlimit::{Outcome, raise_fd_limit}; 12 | use fs_err as fs; 13 | use indicatif::HumanDuration; 14 | use tempfile::TempDir; 15 | 16 | mod args; 17 | mod calibrate; 18 | mod interrupt; 19 | mod progress; 20 | mod walk; 21 | 22 | use mimalloc::MiMalloc; 23 | 24 | #[global_allocator] 25 | static GLOBAL: MiMalloc = MiMalloc; 26 | 27 | /// Entry point for the filesystem scanning application. 28 | /// 29 | /// This function sets up necessary configurations and initiates the parallel filesystem scan 30 | /// by calling `parallel_search`. It handles command-line arguments and sets up the environment 31 | /// for the application to run. 32 | /// 33 | /// # Behavior: 34 | /// - Parses command-line arguments to configure the scanning process. 35 | /// - Sets up signal handling for graceful shutdowns. 36 | /// - Initiates the filesystem scan by calling `parallel_search` with appropriate parameters. 37 | /// - Handles any errors returned by `parallel_search` and exits with an appropriate status code. 38 | /// 39 | /// # Returns: 40 | /// - Returns `Ok(())` on success; on a received termination signal or a fatal calibration error the program exits early via `std::process::exit`. 41 | fn main() -> Result<(), Error> { 42 | let args = Arc::new(args::Args::parse()); 43 | 44 | // Setup termination signal (SIGINT, SIGTERM and SIGQUIT) handlers that will cause the program to stop 45 | let shutdown = Arc::new(AtomicBool::new(false)); 46 | let shutdown_walk = shutdown.clone(); 47 | interrupt::setup_interrupt_handler(&shutdown)?; 48 | 49 | println!("Using {} threads for calibration and scanning", args.threads); 50 | 51 | // Attempt to raise FD limit 52 | if let Ok(Outcome::LimitRaised { to: x, ..
}) = raise_fd_limit() { 53 | println!("Maximum number of file descriptors available: {x}"); 54 | } 55 | 56 | // Search only unique paths 57 | let mut visited_paths = AHashSet::with_capacity(args.path.len()); 58 | 59 | for path in args.path.clone() { 60 | // Keep order of provided path arguments, but skip already visited paths 61 | match visited_paths.get(&path) { 62 | None => visited_paths.insert(path.clone()), 63 | _ => continue, 64 | }; 65 | 66 | println!("Started analysis for path {}", path.display()); 67 | 68 | // Retrieve Unix metadata for top search path 69 | let path_metadata = fs::metadata(&path) 70 | .context("Unable to retrieve top search directory metadata")?; 71 | 72 | // Directory inode size to number of entries ratio is either manually provided in 73 | // `args.size_inode_ratio` or determined from manually provided calibration path 74 | // `args.calibration_path` or determined from calibration directory created in search root 75 | // `TempDir::new_in(path.as_path())` 76 | let size_inode_ratio = if args.size_inode_ratio > 0 { 77 | args.size_inode_ratio 78 | } else if let Some(ref user_path) = args.calibration_path { 79 | // User has specified his calibration directory so attempt to check if it resides on 80 | // the same device 81 | if fs::metadata(user_path.as_path()).context( 82 | "Unable to retrieve user-specified calibration directory metadata", 83 | )?.dev() != path_metadata.dev() 84 | { 85 | println!( 86 | "Oops, test directory resides on a different device than path {}, results are possibly unreliable!", 87 | path.display() 88 | ); 89 | } 90 | 91 | // Prepare temporary calibration directory in user path 92 | let tmp_dir = 93 | Arc::new(TempDir::new_in(user_path.as_path()).context( 94 | "Unable to setup/create calibration test directory", 95 | )?); 96 | 97 | calibrate::get_inode_ratio(tmp_dir.path(), &shutdown_walk, &args) 98 | .context("Unable to calibrate inode to size ratio")? 99 | } else { 100 | // Prepare temporary calibration directory in root of the search path 101 | let tmp_dir = Arc::new(TempDir::new_in(path.as_path()).context( 102 | "Unable to setup/create calibration test directory", 103 | )?); 104 | 105 | calibrate::get_inode_ratio(tmp_dir.path(), &shutdown_walk, &args) 106 | .context("Unable to calibrate inode to size ratio")? 107 | }; 108 | 109 | let start = Instant::now(); 110 | let pb = progress::new_spinner(format!( 111 | "Scanning path {} in progress...", 112 | path.display() 113 | )); 114 | 115 | let dir_count = walk::parallel_search( 116 | &path, 117 | &path_metadata, 118 | size_inode_ratio, 119 | &shutdown_walk, 120 | &args, 121 | ); 122 | 123 | pb.finish_with_message("Done."); 124 | 125 | println!( 126 | "Scanning path {} completed. 
Directories scanned: {}, Time elapsed: {}", 127 | path.display(), 128 | dir_count, 129 | HumanDuration(start.elapsed()) 130 | ); 131 | } 132 | 133 | Ok(()) 134 | } 135 | --------------------------------------------------------------------------------
/README.md: -------------------------------------------------------------------------------- 1 | # findlargedir 2 | 3 | [![GitHub license](https://img.shields.io/github/license/dkorunic/findlargedir.svg)](https://github.com/dkorunic/findlargedir/blob/master/LICENSE) 4 | [![GitHub release](https://img.shields.io/github/release/dkorunic/findlargedir.svg)](https://github.com/dkorunic/findlargedir/releases/latest) 5 | [![release](https://github.com/dkorunic/findlargedir/actions/workflows/release.yml/badge.svg)](https://github.com/dkorunic/findlargedir/actions/workflows/release.yml) 6 | 7 | ![Ferris the Detective](ferris.png) 8 | 9 | (Ferris the Detective by [Esther Arzola](https://www.redbubble.com/people/earzola/shop), original design by [Karen Rustad Tölva](https://www.rustacean.net)) 10 | 11 | ## About 12 | 13 | Findlargedir is a tool specifically written to help **quickly** identify "black hole" directories on any filesystem having more than 100k entries in a single flat structure. When a directory has **many entries** (directories or files), getting a directory listing gets slower and slower, impacting the performance of all processes attempting to list it (for instance to delete some files and/or to find some specific files). Processes reading large directory inodes get frozen while doing so and end up in **uninterruptible sleep** ("D" state) for longer and longer periods of time. Depending on the filesystem, this might start to become visible at 100k entries and becomes a very noticeable performance impact at 1M+ entries. 14 | 15 | Such directories mostly **cannot shrink back** even if the content gets cleaned up, since most Linux and Un\*x filesystems do not support directory inode shrinking (for instance the very common ext3/ext4). This often happens with forgotten Web session directories (a PHP sessions folder whose GC interval was configured to several days), various cache folders (CMS compiled templates and caches), POSIX filesystems emulating object storage, etc. 16 | 17 | The program will attempt to identify any number of such events and report on them based on **calibration**, i.e. on how many directory entries are assumed to be packed into each directory inode on a given filesystem. While doing so, it will determine the ratio of directory inode size to the number of entries/inodes and will use that ratio to quickly scan the filesystem, avoiding expensive/slow directory lookups. While there are many tools that scan the filesystem (`find`, `du`, `ncdu`, etc.), none of them use heuristics to avoid expensive lookups, since they are designed to be **fully accurate**, while this tool is meant to use heuristics and alert on issues **without getting stuck** on problematic folders. 18 | 19 | The program will **not follow symlinks** by default and **requires r/w permissions** to create a calibration directory, so it can calculate the directory inode size to number of entries ratio and estimate the number of entries in a directory without actually counting them. While this method is just an approximation of the actual number of entries in a directory, it is good enough to quickly scan for offending directories.
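To make the heuristic concrete, the following sketch condenses the core of `src/walk.rs` into a few lines. The inode size and ratio values here are illustrative assumptions, while the two thresholds match the `-A`/`-B` defaults:

```rust
// Sketch of the core estimate from src/walk.rs, with example values.
fn main() {
    let size: u64 = 4_194_304; // directory inode size in bytes (4 MiB, assumed)
    let size_inode_ratio: u64 = 32; // calibrated bytes per entry (assumed)
    let alert_threshold: u64 = 10_000; // default -A
    let blacklist_threshold: u64 = 100_000; // default -B

    // Estimate the entry count without ever listing the directory
    let approx_files = size / size_inode_ratio; // 4_194_304 / 32 = 131_072
    if approx_files > blacklist_threshold {
        println!("report this directory and stop descending into it");
    } else if approx_files > alert_threshold {
        println!("report this directory, but continue the scan");
    }
}
```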
20 | 21 | ![Demo](demo.gif) 22 | 23 | ## Caveats 24 | 25 | - requires r/w privileges for each filesystem being tested; it will also create a temporary directory with a lot of temporary files, which are cleaned up afterwards 26 | - accurate mode (`-a`) can cause excessive I/O and excessive memory use; only use when appropriate 27 | 28 | ## Usage 29 | 30 | ```shell 31 | Usage: findlargedir [OPTIONS] <PATH>... 32 | 33 | Arguments: 34 | <PATH>... Paths to check for large directories 35 | 36 | Options: 37 | -a, --accurate <ACCURATE> 38 | Perform accurate directory entry counting [default: false] [possible values: true, false] 39 | -o, --one-filesystem <ONE_FILESYSTEM> 40 | Do not cross mount points [default: true] [possible values: true, false] 41 | -c, --calibration-count <CALIBRATION_COUNT> 42 | Calibration directory file count [default: 100000] 43 | -A, --alert-threshold <ALERT_THRESHOLD> 44 | Alert threshold count (print the estimate) [default: 10000] 45 | -B, --blacklist-threshold <BLACKLIST_THRESHOLD> 46 | Blacklist threshold count (print the estimate and stop deeper scan) [default: 100000] 47 | -x, --threads <THREADS> 48 | Number of threads to use when calibrating and scanning [default: 24] 49 | -p, --updates <UPDATES> 50 | Seconds between status updates, set to 0 to disable [default: 20] 51 | -i, --size-inode-ratio <SIZE_INODE_RATIO> 52 | Skip calibration and provide directory entry to inode size ratio (typically ~21-32) [default: 0] 53 | -t, --calibration-path <CALIBRATION_PATH> 54 | Custom calibration directory path 55 | -s, --skip-path <SKIP_PATH> 56 | Directories to exclude from scanning 57 | -h, --help 58 | Print help information 59 | -V, --version 60 | Print version information 61 | ``` 62 | 63 | When using **accurate mode** (`-a` parameter), be aware that large directory lookups will stall the process completely for extended periods of time. What this mode does is basically a secondary, fully accurate pass on a possibly offending directory, calculating the exact number of entries. 64 | 65 | To avoid descending into mounted filesystems (as with the find `-xdev` option), **one-filesystem mode** (`-o` parameter) is enabled by default, but it can be disabled if necessary. 66 | 67 | It is possible to completely skip the calibration phase by manually providing the directory inode size to number of entries ratio with the `-i` parameter. This makes sense only when you already know the ratio, for example from previous runs. 68 | 69 | Setting the `-p` parameter to 0 will stop the program from giving occasional status updates.
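For example, a hypothetical scan of the root filesystem that reuses a previously measured ratio (so no calibration files are created) and skips common virtual filesystems could look like this (the ratio value and skip paths are illustrative):

```shell
findlargedir -i 32 -s /proc -s /sys -s /dev /
```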
70 | 71 | ## Benchmarks 72 | 73 | ### Findlargedir vs GNU find 74 | 75 | #### Mid-range server / mechanical storage 76 | 77 | Hardware: 8-core Xeon E5-1630 with 4-drive SATA RAID-10 78 | 79 | Benchmark setup: 80 | 81 | ```shell 82 | $ cat bench1.sh 83 | #!/bin/dash 84 | exec /usr/bin/find / -xdev -type d -size +200000c 85 | 86 | $ cat bench2.sh 87 | #!/bin/dash 88 | exec /usr/local/sbin/findlargedir / 89 | ``` 90 | 91 | Actual results measured with [hyperfine](https://github.com/sharkdp/hyperfine): 92 | 93 | ```shell 94 | $ hyperfine --prepare 'echo 3 | tee /proc/sys/vm/drop_caches' \ 95 | ./bench1.sh ./bench2.sh 96 | 97 | Benchmark 1: ./bench1.sh 98 | Time (mean ± σ): 357.040 s ± 7.176 s [User: 2.324 s, System: 13.881 s] 99 | Range (min … max): 349.639 s … 367.636 s 10 runs 100 | 101 | Benchmark 2: ./bench2.sh 102 | Time (mean ± σ): 199.751 s ± 4.431 s [User: 75.163 s, System: 141.271 s] 103 | Range (min … max): 190.136 s … 203.432 s 10 runs 104 | 105 | Summary 106 | './bench2.sh' ran 107 | 1.79 ± 0.05 times faster than './bench1.sh' 108 | ``` 109 | 110 | #### High-end server / SSD storage 111 | 112 | Hardware: 48-core Xeon Silver 4214, 7-drive SM883 SATA HW RAID-5 array, 2 TB of content (a dozen containers with small files) 113 | 114 | Same benchmark setup. Results: 115 | 116 | ```shell 117 | $ hyperfine --prepare 'echo 3 | tee /proc/sys/vm/drop_caches' \ 118 | ./bench1.sh ./bench2.sh 119 | 120 | Benchmark 1: ./bench1.sh 121 | Time (mean ± σ): 392.433 s ± 1.952 s [User: 16.056 s, System: 81.994 s] 122 | Range (min … max): 390.284 s … 395.732 s 10 runs 123 | 124 | Benchmark 2: ./bench2.sh 125 | Time (mean ± σ): 34.650 s ± 0.469 s [User: 79.441 s, System: 528.939 s] 126 | Range (min … max): 34.049 s … 35.388 s 10 runs 127 | 128 | Summary 129 | './bench2.sh' ran 130 | 11.33 ± 0.16 times faster than './bench1.sh' 131 | ``` 132 | 133 | ## Star history 134 | 135 | [![Star History Chart](https://api.star-history.com/svg?repos=dkorunic/findlargedir&type=Date)](https://star-history.com/#dkorunic/findlargedir&Date) 136 | --------------------------------------------------------------------------------
/src/walk.rs: -------------------------------------------------------------------------------- 1 | use std::fs::Metadata; 2 | use std::fs::read_dir; 3 | use std::os::unix::fs::MetadataExt; 4 | use std::path::Path; 5 | use std::path::PathBuf; 6 | use std::process; 7 | use std::sync::Arc; 8 | use std::sync::atomic::{AtomicBool, AtomicU64, Ordering}; 9 | use std::thread::sleep; 10 | use std::time::Duration; 11 | 12 | use crate::args::Args; 13 | use ahash::AHashSet; 14 | use ansi_term::Colour::{Green, Red, Yellow}; 15 | use fs_err as fs; 16 | use human_format::Formatter; 17 | use ignore::{DirEntry, Error, WalkBuilder, WalkState}; 18 | use indicatif::HumanBytes; 19 | 20 | /// Default number of files in a folder to cause alert 21 | pub const ALERT_COUNT: u64 = 10_000; 22 | 23 | /// Default number of files in a folder to cause red alert and further blacklist from the deeper 24 | /// scan 25 | pub const BLACKLIST_COUNT: u64 = 100_000; 26 | 27 | /// Default exit error code in case of premature termination 28 | const ERROR_EXIT: i32 = 1; 29 | 30 | /// Default status update period in seconds 31 | pub const STATUS_SECONDS: u64 = 20; 32 | 33 | /// Perform a parallel filesystem search based on specified criteria and arguments. 34 | /// 35 | /// # Arguments 36 | /// * `path` - A reference to the starting path for the filesystem search. 37 | /// * `path_metadata` - A reference to the metadata of the starting path.
38 | /// * `size_inode_ratio` - The ratio used to calculate the approximate number of files in a directory. 39 | /// * `shutdown_walk` - A shared reference to a boolean flag indicating if the search should be terminated. 40 | /// * `args` - A shared reference to the command-line arguments provided. 41 | /// 42 | /// # Returns 43 | /// The total count of processed directories during the filesystem search. 44 | /// 45 | /// # Behaviors 46 | /// - Creates a hash set of paths to be excluded from scanning. 47 | /// - Initializes a thread pool for status reporting and filesystem traversal. 48 | /// - Updates the processed directory count based on the status update interval. 49 | /// - Initiates the parallel filesystem walk using specified parameters. 50 | /// - Terminates the search if a shutdown signal is received. 51 | /// - Processes each directory entry encountered during the search. 52 | /// 53 | /// # Types 54 | /// * `path` - `&PathBuf` 55 | /// * `path_metadata` - `&Metadata` 56 | /// * `size_inode_ratio` - `u64` 57 | /// * `shutdown_walk` - `&Arc<AtomicBool>` 58 | /// * `args` - `&Arc<Args>` 59 | /// * Return Type - `u64` 60 | pub fn parallel_search( 61 | path: &PathBuf, 62 | path_metadata: &Metadata, 63 | size_inode_ratio: u64, 64 | shutdown_walk: &Arc<AtomicBool>, 65 | args: &Arc<Args>, 66 | ) -> u64 { 67 | // Create hash set for path exclusions 68 | let skip_path = &args.skip_path.iter().cloned().collect::<AHashSet<PathBuf>>(); 69 | 70 | // Thread pool for status reporting and filesystem walk 71 | let pool = Arc::new( 72 | rayon::ThreadPoolBuilder::new() 73 | .num_threads(1) 74 | .build() 75 | .expect("Unable to spawn reporting thread pool"), 76 | ); 77 | 78 | // Processed directory count 79 | let dir_count = &Arc::new(AtomicU64::new(0)); 80 | 81 | // Status update thread 82 | if args.updates > 0 { 83 | let dir_count = dir_count.clone(); 84 | let sleep_delay = args.updates; 85 | 86 | pool.spawn(move || loop { 87 | sleep(Duration::from_secs(sleep_delay)); 88 | 89 | let count = dir_count.load(Ordering::Acquire); 90 | println!( 91 | "Processed {} directories so far, next update in {} seconds", 92 | Green.paint(count.to_string()), 93 | sleep_delay 94 | ); 95 | }); 96 | } 97 | 98 | // Perform target filesystem walking 99 | WalkBuilder::new(path) 100 | .hidden(false) 101 | .standard_filters(false) 102 | .follow_links(args.follow_symlinks) 103 | .threads(args.threads) 104 | .build_parallel() 105 | .run(|| { 106 | Box::new({ 107 | move |dir_entry_result| { 108 | // Terminate on received interrupt signal 109 | if shutdown_walk.load(Ordering::Relaxed) { 110 | println!("Requested program exit, stopping scan..."); 111 | 112 | process::exit(ERROR_EXIT); 113 | } 114 | 115 | process_dir_entry( 116 | path_metadata, 117 | size_inode_ratio, 118 | &dir_entry_result, 119 | skip_path, 120 | args, 121 | dir_count, 122 | ) 123 | } 124 | }) 125 | }); 126 | 127 | dir_count.load(Ordering::Acquire) 128 | } 129 | 130 | /// Processes a directory entry based on specified criteria and arguments. 131 | /// 132 | /// # Arguments 133 | /// * `path_metadata` - A reference to the metadata of the current directory. 134 | /// * `size_inode_ratio` - The ratio used to calculate the approximate number of files in the directory. 135 | /// * `dir_entry_result` - The result of attempting to read a directory entry. 136 | /// * `skip_path` - A set of paths to be excluded from scanning. 137 | /// * `args` - A shared reference to the command-line arguments provided. 138 | /// * `dir_count` - A shared reference to the atomic counter for visited directories.
139 | /// 140 | /// # Returns 141 | /// The state of the directory processing, indicating whether to continue, skip, or stop scanning. 142 | /// 143 | /// # Behaviors 144 | /// - Checks if the directory entry is a directory; if not, continues to the next entry. 145 | /// - Increments the visited directory count. 146 | /// - Skips scanning if the directory is in the skip path list. 147 | /// - Skips scanning if the directory is on a different filesystem and the `one_filesystem` flag is set. 148 | /// - Calculates the size and approximate file count of the directory entry. 149 | /// - Prints warnings and potentially marks the directory as an offender based on file count thresholds. 150 | /// - Returns the appropriate state for further scanning based on the calculated conditions. 151 | /// 152 | /// # Types 153 | /// * `path_metadata` - `&Metadata` 154 | /// * `size_inode_ratio` - `u64` 155 | /// * `dir_entry_result` - `&Result<DirEntry, Error>` 156 | /// * `skip_path` - `&AHashSet<PathBuf>` 157 | /// * `args` - `&Arc<Args>` 158 | /// * `dir_count` - `&Arc<AtomicU64>` 159 | /// * Return Type - `WalkState` 160 | fn process_dir_entry( 161 | path_metadata: &Metadata, 162 | size_inode_ratio: u64, 163 | dir_entry_result: &Result<DirEntry, Error>, 164 | skip_path: &AHashSet<PathBuf>, 165 | args: &Arc<Args>, 166 | dir_count: &Arc<AtomicU64>, 167 | ) -> WalkState { 168 | if let Ok(dir_entry) = dir_entry_result 169 | && let Some(dir_entry_type) = dir_entry.file_type() 170 | { 171 | if !dir_entry_type.is_dir() { 172 | return WalkState::Continue; 173 | } 174 | 175 | let full_path = dir_entry.path(); 176 | 177 | // Visited directory count 178 | dir_count.fetch_add(1, Ordering::AcqRel); 179 | 180 | // Ignore skip paths, typically being virtual filesystems (/proc, /dev, /sys, /run) 181 | if !skip_path.is_empty() 182 | && skip_path.contains(&full_path.to_path_buf()) 183 | { 184 | println!( 185 | "Skipping further scan at {} as requested", 186 | full_path.display() 187 | ); 188 | 189 | return WalkState::Skip; 190 | } 191 | 192 | // Retrieve Unix metadata for a given directory 193 | if let Ok(dir_entry_metadata) = fs::metadata(full_path) { 194 | // If `one_filesystem` flag has been set and if directory is not residing 195 | // on the same device as top search path, print warning and abort deeper 196 | // scanning 197 | if args.one_filesystem 198 | && (dir_entry_metadata.dev() != path_metadata.dev()) 199 | { 200 | println!( 201 | "Identified filesystem boundary at {}, skipping...", 202 | full_path.display() 203 | ); 204 | 205 | return WalkState::Skip; 206 | } 207 | 208 | // Identify size and calculate approximate directory entry count 209 | let size = dir_entry_metadata.size(); 210 | let approx_files = size / size_inode_ratio; 211 | 212 | // Print count warnings if necessary 213 | if approx_files > args.blacklist_threshold { 214 | print_offender( 215 | full_path, 216 | size, 217 | approx_files, 218 | args.accurate, 219 | true, 220 | ); 221 | 222 | return WalkState::Skip; 223 | } else if approx_files > args.alert_threshold { 224 | print_offender( 225 | full_path, 226 | size, 227 | approx_files, 228 | args.accurate, 229 | false, 230 | ); 231 | 232 | return WalkState::Continue; 233 | } 234 | } 235 | } 236 | 237 | WalkState::Continue 238 | } 239 | 240 | #[allow(clippy::cast_precision_loss)] 241 | /// Prints information about directories that exceed specified thresholds. 242 | /// 243 | /// This function is called when the estimated number of files in a directory exceeds either the alert or blacklist thresholds.
244 | /// It outputs details about the directory, its inode size and its file count, highlighting blacklisted directories in red. 245 | /// 246 | /// # Arguments 247 | /// * `full_path` - The path of the directory being evaluated. 248 | /// * `size` - The size of the directory inode in bytes. 249 | /// * `approx_files` - The estimated number of files in the directory. 250 | /// * `accurate` - A boolean flag indicating whether to count the exact number of entries instead of printing the estimate. 251 | /// * `red_alert` - A boolean flag indicating whether the directory exceeds the blacklist threshold. 252 | fn print_offender( 253 | full_path: &Path, 254 | size: u64, 255 | approx_files: u64, 256 | accurate: bool, 257 | red_alert: bool, 258 | ) { 259 | // Pretty print either the accurate directory count or the approximation 260 | let human_files = if accurate { 261 | let exact_files = match read_dir(full_path) { 262 | Ok(r) => r.count() as u64, 263 | Err(_) => approx_files, 264 | }; 265 | Formatter::new().format(exact_files as f64) 266 | } else { 267 | Formatter::new().format(approx_files as f64) 268 | }; 269 | 270 | println!( 271 | "Found directory {} with inode size {} and {}{} files", 272 | full_path.display(), 273 | HumanBytes(size), 274 | if accurate { "" } else { "approx " }, 275 | if red_alert { 276 | Red.paint(human_files) 277 | } else { 278 | Yellow.paint(human_files) 279 | } 280 | ); 281 | } 282 | --------------------------------------------------------------------------------
/.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | # This file was autogenerated by dist: https://axodotdev.github.io/cargo-dist 2 | # 3 | # Copyright 2022-2024, axodotdev 4 | # SPDX-License-Identifier: MIT or Apache-2.0 5 | # 6 | # CI that: 7 | # 8 | # * checks for a Git Tag that looks like a release 9 | # * builds artifacts with dist (archives, installers, hashes) 10 | # * uploads those artifacts to temporary workflow zip 11 | # * on success, uploads the artifacts to a GitHub Release 12 | # 13 | # Note that the GitHub Release will be created with a generated 14 | # title/body based on your changelogs. 15 | 16 | name: Release 17 | permissions: 18 | "contents": "write" 19 | 20 | # This task will run whenever you push a git tag that looks like a version 21 | # like "1.0.0", "v0.1.0-prerelease.1", "my-app/0.1.0", "releases/v1.0.0", etc. 22 | # Various formats will be parsed into a VERSION and an optional PACKAGE_NAME, where 23 | # PACKAGE_NAME must be the name of a Cargo package in your workspace, and VERSION 24 | # must be a Cargo-style SemVer Version (must have at least major.minor.patch). 25 | # 26 | # If PACKAGE_NAME is specified, then the announcement will be for that 27 | # package (erroring out if it doesn't have the given version or isn't dist-able). 28 | # 29 | # If PACKAGE_NAME isn't specified, then the announcement will be for all 30 | # (dist-able) packages in the workspace with that version (this mode is 31 | # intended for workspaces with only one dist-able package, or with all dist-able 32 | # packages versioned/released in lockstep). 33 | # 34 | # If you push multiple tags at once, separate instances of this workflow will 35 | # spin up, creating an independent announcement for each one. However, GitHub 36 | # will hard limit this to 3 tags per commit, as it will assume more tags is a 37 | # mistake. 38 | # 39 | # If there's a prerelease-style suffix to the version, then the release(s) 40 | # will be marked as a prerelease.
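#
# For example (illustrative tags using this package's name and version):
#   "v0.10.3" or "0.10.3" announce every dist-able package at that version,
#   "findlargedir/0.10.3" announces only the findlargedir package, and
#   "v0.11.0-prerelease.1" would be marked as a prerelease.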
41 | on: 42 | pull_request: 43 | push: 44 | tags: 45 | - '**[0-9]+.[0-9]+.[0-9]+*' 46 | 47 | jobs: 48 | # Run 'dist plan' (or host) to determine what tasks we need to do 49 | plan: 50 | runs-on: "ubuntu-22.04" 51 | outputs: 52 | val: ${{ steps.plan.outputs.manifest }} 53 | tag: ${{ !github.event.pull_request && github.ref_name || '' }} 54 | tag-flag: ${{ !github.event.pull_request && format('--tag={0}', github.ref_name) || '' }} 55 | publishing: ${{ !github.event.pull_request }} 56 | env: 57 | GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} 58 | steps: 59 | - uses: actions/checkout@v4 60 | with: 61 | persist-credentials: false 62 | submodules: recursive 63 | - name: Install dist 64 | # we specify bash to get pipefail; it guards against the `curl` command 65 | # failing. otherwise `sh` won't catch that `curl` returned non-0 66 | shell: bash 67 | run: "curl --proto '=https' --tlsv1.2 -LsSf https://github.com/axodotdev/cargo-dist/releases/download/v0.30.2/cargo-dist-installer.sh | sh" 68 | - name: Cache dist 69 | uses: actions/upload-artifact@v4 70 | with: 71 | name: cargo-dist-cache 72 | path: ~/.cargo/bin/dist 73 | # sure would be cool if github gave us proper conditionals... 74 | # so here's a doubly-nested ternary-via-truthiness to try to provide the best possible 75 | # functionality based on whether this is a pull_request, and whether it's from a fork. 76 | # (PRs run on the *source* but secrets are usually on the *target* -- that's *good* 77 | # but also really annoying to build CI around when it needs secrets to work right.) 78 | - id: plan 79 | run: | 80 | dist ${{ (!github.event.pull_request && format('host --steps=create --tag={0}', github.ref_name)) || 'plan' }} --output-format=json > plan-dist-manifest.json 81 | echo "dist ran successfully" 82 | cat plan-dist-manifest.json 83 | echo "manifest=$(jq -c "." plan-dist-manifest.json)" >> "$GITHUB_OUTPUT" 84 | - name: "Upload dist-manifest.json" 85 | uses: actions/upload-artifact@v4 86 | with: 87 | name: artifacts-plan-dist-manifest 88 | path: plan-dist-manifest.json 89 | 90 | # Build and packages all the platform-specific things 91 | build-local-artifacts: 92 | name: build-local-artifacts (${{ join(matrix.targets, ', ') }}) 93 | # Let the initial task tell us to not run (currently very blunt) 94 | needs: 95 | - plan 96 | if: ${{ fromJson(needs.plan.outputs.val).ci.github.artifacts_matrix.include != null && (needs.plan.outputs.publishing == 'true' || fromJson(needs.plan.outputs.val).ci.github.pr_run_mode == 'upload') }} 97 | strategy: 98 | fail-fast: false 99 | # Target platforms/runners are computed by dist in create-release. 
100 | # Each member of the matrix has the following arguments: 101 | # 102 | # - runner: the github runner 103 | # - dist-args: cli flags to pass to dist 104 | # - install-dist: expression to run to install dist on the runner 105 | # 106 | # Typically there will be: 107 | # - 1 "global" task that builds universal installers 108 | # - N "local" tasks that build each platform's binaries and platform-specific installers 109 | matrix: ${{ fromJson(needs.plan.outputs.val).ci.github.artifacts_matrix }} 110 | runs-on: ${{ matrix.runner }} 111 | container: ${{ matrix.container && matrix.container.image || null }} 112 | env: 113 | GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} 114 | BUILD_MANIFEST_NAME: target/distrib/${{ join(matrix.targets, '-') }}-dist-manifest.json 115 | steps: 116 | - name: enable windows longpaths 117 | run: | 118 | git config --global core.longpaths true 119 | - uses: actions/checkout@v4 120 | with: 121 | persist-credentials: false 122 | submodules: recursive 123 | - name: Install Rust non-interactively if not already installed 124 | if: ${{ matrix.container }} 125 | run: | 126 | if ! command -v cargo > /dev/null 2>&1; then 127 | curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y 128 | echo "$HOME/.cargo/bin" >> $GITHUB_PATH 129 | fi 130 | - name: Install dist 131 | run: ${{ matrix.install_dist.run }} 132 | # Get the dist-manifest 133 | - name: Fetch local artifacts 134 | uses: actions/download-artifact@v4 135 | with: 136 | pattern: artifacts-* 137 | path: target/distrib/ 138 | merge-multiple: true 139 | - name: Install dependencies 140 | run: | 141 | ${{ matrix.packages_install }} 142 | - name: Build artifacts 143 | run: | 144 | # Actually do builds and make zips and whatnot 145 | dist build ${{ needs.plan.outputs.tag-flag }} --print=linkage --output-format=json ${{ matrix.dist_args }} > dist-manifest.json 146 | echo "dist ran successfully" 147 | - id: cargo-dist 148 | name: Post-build 149 | # We force bash here just because github makes it really hard to get values up 150 | # to "real" actions without writing to env-vars, and writing to env-vars has 151 | # inconsistent syntax between shell and powershell. 152 | shell: bash 153 | run: | 154 | # Parse out what we just built and upload it to scratch storage 155 | echo "paths<<EOF" >> "$GITHUB_OUTPUT" 156 | dist print-upload-files-from-manifest --manifest dist-manifest.json >> "$GITHUB_OUTPUT" 157 | echo "EOF" >> "$GITHUB_OUTPUT" 158 | 159 | cp dist-manifest.json "$BUILD_MANIFEST_NAME" 160 | - name: "Upload artifacts" 161 | uses: actions/upload-artifact@v4 162 | with: 163 | name: artifacts-build-local-${{ join(matrix.targets, '_') }} 164 | path: | 165 | ${{ steps.cargo-dist.outputs.paths }} 166 | ${{ env.BUILD_MANIFEST_NAME }} 167 | 168 | # Build and package all the platform-agnostic(ish) things 169 | build-global-artifacts: 170 | needs: 171 | - plan 172 | - build-local-artifacts 173 | runs-on: "ubuntu-22.04" 174 | env: 175 | GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} 176 | BUILD_MANIFEST_NAME: target/distrib/global-dist-manifest.json 177 | steps: 178 | - uses: actions/checkout@v4 179 | with: 180 | persist-credentials: false 181 | submodules: recursive 182 | - name: Install cached dist 183 | uses: actions/download-artifact@v4 184 | with: 185 | name: cargo-dist-cache 186 | path: ~/.cargo/bin/ 187 | - run: chmod +x ~/.cargo/bin/dist 188 | # Get all the local artifacts for the global tasks to use (for e.g.
checksums) 189 | - name: Fetch local artifacts 190 | uses: actions/download-artifact@v4 191 | with: 192 | pattern: artifacts-* 193 | path: target/distrib/ 194 | merge-multiple: true 195 | - id: cargo-dist 196 | shell: bash 197 | run: | 198 | dist build ${{ needs.plan.outputs.tag-flag }} --output-format=json "--artifacts=global" > dist-manifest.json 199 | echo "dist ran successfully" 200 | 201 | # Parse out what we just built and upload it to scratch storage 202 | echo "paths<<EOF" >> "$GITHUB_OUTPUT" 203 | jq --raw-output ".upload_files[]" dist-manifest.json >> "$GITHUB_OUTPUT" 204 | echo "EOF" >> "$GITHUB_OUTPUT" 205 | 206 | cp dist-manifest.json "$BUILD_MANIFEST_NAME" 207 | - name: "Upload artifacts" 208 | uses: actions/upload-artifact@v4 209 | with: 210 | name: artifacts-build-global 211 | path: | 212 | ${{ steps.cargo-dist.outputs.paths }} 213 | ${{ env.BUILD_MANIFEST_NAME }} 214 | # Determines if we should publish/announce 215 | host: 216 | needs: 217 | - plan 218 | - build-local-artifacts 219 | - build-global-artifacts 220 | # Only run if we're "publishing", and only if plan, local and global didn't fail (skipped is fine) 221 | if: ${{ always() && needs.plan.result == 'success' && needs.plan.outputs.publishing == 'true' && (needs.build-global-artifacts.result == 'skipped' || needs.build-global-artifacts.result == 'success') && (needs.build-local-artifacts.result == 'skipped' || needs.build-local-artifacts.result == 'success') }} 222 | env: 223 | GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} 224 | runs-on: "ubuntu-22.04" 225 | outputs: 226 | val: ${{ steps.host.outputs.manifest }} 227 | steps: 228 | - uses: actions/checkout@v4 229 | with: 230 | persist-credentials: false 231 | submodules: recursive 232 | - name: Install cached dist 233 | uses: actions/download-artifact@v4 234 | with: 235 | name: cargo-dist-cache 236 | path: ~/.cargo/bin/ 237 | - run: chmod +x ~/.cargo/bin/dist 238 | # Fetch artifacts from scratch-storage 239 | - name: Fetch artifacts 240 | uses: actions/download-artifact@v4 241 | with: 242 | pattern: artifacts-* 243 | path: target/distrib/ 244 | merge-multiple: true 245 | - id: host 246 | shell: bash 247 | run: | 248 | dist host ${{ needs.plan.outputs.tag-flag }} --steps=upload --steps=release --output-format=json > dist-manifest.json 249 | echo "artifacts uploaded and released successfully" 250 | cat dist-manifest.json 251 | echo "manifest=$(jq -c "."
dist-manifest.json)" >> "$GITHUB_OUTPUT" 252 | - name: "Upload dist-manifest.json" 253 | uses: actions/upload-artifact@v4 254 | with: 255 | # Overwrite the previous copy 256 | name: artifacts-dist-manifest 257 | path: dist-manifest.json 258 | # Create a GitHub Release while uploading all files to it 259 | - name: "Download GitHub Artifacts" 260 | uses: actions/download-artifact@v4 261 | with: 262 | pattern: artifacts-* 263 | path: artifacts 264 | merge-multiple: true 265 | - name: Cleanup 266 | run: | 267 | # Remove the granular manifests 268 | rm -f artifacts/*-dist-manifest.json 269 | - name: Create GitHub Release 270 | env: 271 | PRERELEASE_FLAG: "${{ fromJson(steps.host.outputs.manifest).announcement_is_prerelease && '--prerelease' || '' }}" 272 | ANNOUNCEMENT_TITLE: "${{ fromJson(steps.host.outputs.manifest).announcement_title }}" 273 | ANNOUNCEMENT_BODY: "${{ fromJson(steps.host.outputs.manifest).announcement_github_body }}" 274 | RELEASE_COMMIT: "${{ github.sha }}" 275 | run: | 276 | # Write and read notes from a file to avoid quoting breaking things 277 | echo "$ANNOUNCEMENT_BODY" > $RUNNER_TEMP/notes.txt 278 | 279 | gh release create "${{ needs.plan.outputs.tag }}" --target "$RELEASE_COMMIT" $PRERELEASE_FLAG --title "$ANNOUNCEMENT_TITLE" --notes-file "$RUNNER_TEMP/notes.txt" artifacts/* 280 | 281 | announce: 282 | needs: 283 | - plan 284 | - host 285 | # use "always() && ..." to allow us to wait for all publish jobs while 286 | # still allowing individual publish jobs to skip themselves (for prereleases). 287 | # "host" however must run to completion, no skipping allowed! 288 | if: ${{ always() && needs.host.result == 'success' }} 289 | runs-on: "ubuntu-22.04" 290 | env: 291 | GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} 292 | steps: 293 | - uses: actions/checkout@v4 294 | with: 295 | persist-credentials: false 296 | submodules: recursive 297 | --------------------------------------------------------------------------------