├── .gitignore ├── src ├── lib.rs ├── lazyfile.rs ├── metadata.rs ├── json.rs ├── file.rs ├── bin.rs ├── reflink.rs ├── ui.rs ├── hasher.rs └── scanner.rs ├── LICENSE ├── Cargo.toml ├── tests ├── scantest.rs └── filetest.rs ├── README.md └── Cargo.lock /.gitignore: -------------------------------------------------------------------------------- 1 | target/ 2 | **/*.rs.bk 3 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | mod file; 2 | mod hasher; 3 | #[cfg(feature = "json")] 4 | mod json; 5 | mod lazyfile; 6 | mod metadata; 7 | mod reflink; 8 | mod scanner; 9 | mod ui; 10 | 11 | pub use crate::file::FileContent; 12 | #[cfg(feature = "json")] 13 | pub use crate::json::JsonOutput; 14 | pub use crate::reflink::{LinkType, reflink, reflink_or_hardlink}; 15 | pub use crate::scanner::RunMode; 16 | pub use crate::scanner::Scanner; 17 | pub use crate::ui::UI as TextUserInterface; 18 | -------------------------------------------------------------------------------- /src/lazyfile.rs: -------------------------------------------------------------------------------- 1 | use std::path::Path; 2 | use std::{fs, io}; 3 | 4 | /// Open the file only if necessary. 5 | /// The file will be closed automatically when this object goes out of scope. 6 | pub struct LazyFile<'a> { 7 | path: &'a Path, 8 | file: Option<fs::File>, 9 | } 10 | 11 | impl<'a> LazyFile<'a> { 12 | pub fn new(path: &'a Path) -> Self { 13 | LazyFile { path, file: None } 14 | } 15 | 16 | /// Open the file (or reuse already-opened handle) 17 | pub fn fd(&mut self) -> Result<&mut fs::File, io::Error> { 18 | if let Some(ref mut fd) = self.file { 19 | Ok(fd) 20 | } else { 21 | self.file = Some(fs::File::open(self.path)?); 22 | if let Some(ref mut fd) = self.file { 23 | Ok(fd) 24 | } else { 25 | unreachable!(); 26 | } 27 | } 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | © Kornel Lesiński 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
8 | -------------------------------------------------------------------------------- /src/metadata.rs: -------------------------------------------------------------------------------- 1 | #[cfg(unix)] 2 | use std::os::unix::fs::MetadataExt; 3 | use std::path::Path; 4 | use std::{fs, io}; 5 | 6 | #[derive(Copy, Clone, Hash, Ord, PartialOrd, PartialEq, Eq, Debug, Default)] 7 | pub struct Metadata { 8 | pub dev: u64, 9 | pub size: u64, 10 | } 11 | 12 | impl Metadata { 13 | pub fn from_path(path: impl AsRef<Path>) -> Result<Self, io::Error> { 14 | let m = fs::symlink_metadata(path)?; 15 | Ok(Self::new(&m)) 16 | } 17 | 18 | pub fn new(m: &fs::Metadata) -> Self { 19 | Self { dev: get_device_id(m), size: get_size(m) } 20 | } 21 | } 22 | 23 | #[cfg(unix)] 24 | fn get_device_id(m: &fs::Metadata) -> u64 { 25 | m.dev() 26 | } 27 | 28 | #[cfg(windows)] 29 | fn get_device_id(_m: &fs::Metadata) -> u64 { 30 | // On Windows, we'll use a simple constant for device identification 31 | // This means hardlinking across different drives won't work properly, 32 | // but that's expected behavior and matches filesystem limitations 33 | // TODO: In the future, we could use Windows-specific APIs to get proper device IDs 34 | 0 35 | } 36 | 37 | #[cfg(unix)] 38 | fn get_size(m: &fs::Metadata) -> u64 { 39 | m.size() 40 | } 41 | 42 | #[cfg(windows)] 43 | fn get_size(m: &fs::Metadata) -> u64 { 44 | // Windows polyfill: round up to the next 4KB block to account for block overhead 45 | let len = m.len(); 46 | const BLOCK_SIZE: u64 = 4096; 47 | ((len + BLOCK_SIZE - 1) / BLOCK_SIZE) * BLOCK_SIZE 48 | } 49 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | authors = ["Kornel Lesiński"] 3 | description = "An incremental file deduplicator which minimizes the amount of data read. Replaces files that have identical content with hardlinks."
4 | documentation = "https://github.com/kornelski/dupe-krill#readme" 5 | homepage = "https://github.com/kornelski/dupe-krill" 6 | keywords = ["dupe", "deduplication", "dedupe", "disk-space"] 7 | categories = ["filesystem", "command-line-utilities"] 8 | license = "MIT" 9 | name = "dupe-krill" 10 | readme = "README.md" 11 | repository = "https://github.com/kornelski/dupe-krill" 12 | version = "1.5.0" 13 | edition = "2024" 14 | rust-version = "1.85" 15 | include = ["src/*.rs", "LICENSE", "README.md", "Cargo.toml"] 16 | 17 | [[bin]] 18 | name = "dupe-krill" 19 | path = "src/bin.rs" 20 | 21 | [profile.release] 22 | lto = true 23 | opt-level = 3 24 | panic = "abort" 25 | strip = true 26 | 27 | [dependencies] 28 | getopts = "0.2.21" 29 | ctrlc = "3.2.5" 30 | blake3 = { version = "1.3.3", features = ["pure"] } 31 | smallvec = "1.10.0" 32 | libc = "0.2" 33 | 34 | [dependencies.serde] 35 | optional = true 36 | version = "1.0.160" 37 | 38 | [dependencies.serde_derive] 39 | optional = true 40 | version = "1.0.160" 41 | 42 | [dependencies.serde_json] 43 | optional = true 44 | version = "1.0.95" 45 | 46 | [dev-dependencies] 47 | tempdir = "0.3.7" 48 | 49 | [features] 50 | default = ["json"] 51 | json = ["serde", "serde_derive", "serde_json"] 52 | 53 | [package.metadata.docs.rs] 54 | targets = ["x86_64-unknown-linux-gnu"] 55 | rustdoc-args = ["--generate-link-to-definition"] 56 | -------------------------------------------------------------------------------- /tests/scantest.rs: -------------------------------------------------------------------------------- 1 | use dupe_krill::*; 2 | use std::fs; 3 | 4 | use tempdir::TempDir; 5 | 6 | #[test] 7 | fn scan() { 8 | let mut d = Scanner::new(); 9 | d.scan("tests").unwrap(); 10 | } 11 | 12 | #[test] 13 | fn test_exclude() { 14 | let dir = TempDir::new("excludetest").unwrap(); 15 | let a_path = dir.path().join("a"); 16 | let b_path = dir.path().join("b"); 17 | fs::write(a_path, "foo").unwrap(); 18 | fs::write(b_path, "foo").unwrap(); 19 | 20 | let mut d = Scanner::new(); 21 | d.settings.ignore_small = false; 22 | d.settings.run_mode = RunMode::DryRunNoMerging; 23 | d.exclude(vec!["b".to_string()]); 24 | 25 | d.scan(dir.path()).unwrap(); 26 | let dupes = d.dupes(); 27 | assert_eq!(dupes.len(), 1); 28 | assert_eq!(dupes[0].len(), 1); 29 | assert_eq!(dupes[0][0].paths.len(), 1); 30 | } 31 | 32 | #[test] 33 | fn scan_hardlink() { 34 | let dir = TempDir::new("hardlinktest2").unwrap(); 35 | let a_path = dir.path().join("a"); 36 | let b_path = dir.path().join("b"); 37 | 38 | fs::write(&a_path, b"dupe").unwrap(); 39 | 40 | fs::hard_link(&a_path, &b_path).unwrap(); 41 | 42 | let mut d = Scanner::new(); 43 | d.settings.ignore_small = false; 44 | d.settings.run_mode = RunMode::DryRun; 45 | d.scan(dir.path()).unwrap(); 46 | let dupes = d.dupes(); 47 | assert_eq!(dupes.len(), 1); 48 | assert_eq!(dupes[0][0].paths.len(), 2); 49 | 50 | let mut d = Scanner::new(); 51 | d.settings.ignore_small = false; 52 | d.scan(dir.path()).unwrap(); 53 | let dupes = d.dupes(); 54 | assert_eq!(dupes.len(), 1); 55 | assert_eq!(dupes[0][0].paths.len(), 2); 56 | } 57 | -------------------------------------------------------------------------------- /src/json.rs: -------------------------------------------------------------------------------- 1 | use crate::scanner::{ScanListener, Scanner, Stats}; 2 | use serde_derive::Serialize; 3 | use std::path::Path; 4 | use std::time::Duration; 5 | 6 | #[derive(Debug)] 7 | pub struct JsonOutput; 8 | 9 | impl JsonOutput { 10 | #[must_use] 11 | pub fn new() -> 
Self { 12 | Self 13 | } 14 | } 15 | 16 | impl ScanListener for JsonOutput { 17 | fn file_scanned(&mut self, _: &Path, _: &Stats) { 18 | // output only at scan_over 19 | } 20 | 21 | fn scan_over(&self, scanner: &Scanner, stats: &Stats, scan_duration: Duration) { 22 | let data = JsonSerializable::new(scanner, stats, scan_duration); 23 | let json_string = serde_json::to_string_pretty(&data).unwrap(); 24 | println!("{json_string}"); 25 | } 26 | 27 | fn hardlinked(&mut self, _: &Path, _: &Path) { 28 | // output only at scan_over 29 | } 30 | 31 | fn reflinked(&mut self, _: &Path, _: &Path) { 32 | // output only at scan_over 33 | } 34 | 35 | fn duplicate_found(&mut self, _: &Path, _: &Path) { 36 | // output only at scan_over 37 | } 38 | } 39 | 40 | #[derive(Serialize)] 41 | #[serde(rename_all = "camelCase")] 42 | struct JsonSerializable { 43 | creator: String, 44 | dupes: Vec<Vec<Vec<Box<Path>>>>, 45 | stats: Stats, 46 | scan_duration: Duration, 47 | } 48 | 49 | impl JsonSerializable { 50 | pub fn new(scanner: &Scanner, stats: &Stats, scan_duration: Duration) -> Self { 51 | Self { 52 | creator: format!("duplicate-kriller {}", env!("CARGO_PKG_VERSION")), 53 | dupes: scanner 54 | .dupes() 55 | .into_iter() 56 | .map(|sets| { 57 | sets.into_iter() 58 | .filter(|set| !set.paths.is_empty()) 59 | .map(|set| set.paths.into_vec()) 60 | .collect::<Vec<_>>() 61 | }) 62 | .filter(|sets| sets.len() > 1 || sets.iter().any(|set| set.len() > 1)) 63 | .collect(), 64 | stats: *stats, 65 | scan_duration, 66 | } 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /tests/filetest.rs: -------------------------------------------------------------------------------- 1 | use dupe_krill::*; 2 | use std::fs; 3 | use tempdir::TempDir; 4 | 5 | #[test] 6 | fn hardlink_of_same_file() { 7 | let dir = TempDir::new("hardlinktest").unwrap(); 8 | let a_path = dir.path().join("a").into_boxed_path(); 9 | let b_path = dir.path().join("b").into_boxed_path(); 10 | 11 | fs::write(&a_path, "hello").unwrap(); 12 | 13 | fs::hard_link(&a_path, &b_path).unwrap(); 14 | 15 | let a = FileContent::from_path(a_path).unwrap(); 16 | let b = FileContent::from_path(b_path).unwrap(); 17 | assert_eq!(a, b); 18 | assert_eq!(b, b); 19 | } 20 | 21 | #[test] 22 | fn different_files() { 23 | let dir = TempDir::new("basictest").unwrap(); 24 | let a_path = dir.path().join("a").into_boxed_path(); 25 | let b_path = dir.path().join("b").into_boxed_path(); 26 | 27 | fs::write(&a_path, "hello").unwrap(); 28 | fs::write(&b_path, "world").unwrap(); 29 | 30 | let a = FileContent::from_path(a_path).unwrap(); 31 | let b = FileContent::from_path(b_path).unwrap(); 32 | assert_eq!(a, a); 33 | assert_eq!(b, b); 34 | assert_ne!(a, b); 35 | } 36 | 37 | #[test] 38 | fn different_files_big() { 39 | let dir = TempDir::new("difftest").unwrap(); 40 | let a_path = dir.path().join("a_big").into_boxed_path(); 41 | let b_path = dir.path().join("b_big").into_boxed_path(); 42 | 43 | let mut content = vec![0xffu8; 1_400_000]; 44 | 45 | fs::write(&a_path, &content).unwrap(); 46 | content[1_388_888] = 1; 47 | fs::write(&b_path, content).unwrap(); 48 | 49 | let a = FileContent::from_path(a_path).unwrap(); 50 | let b = FileContent::from_path(b_path).unwrap(); 51 | assert_ne!(a, b); 52 | assert_eq!(a, a); 53 | assert_eq!(b, b); 54 | } 55 | 56 | #[test] 57 | fn same_content() { 58 | let dir = TempDir::new("sametest").unwrap(); 59 | let a_path = dir.path().join("a").into_boxed_path(); 60 | let b_path = dir.path().join("b").into_boxed_path(); 61 | 62 | fs::write(&a_path,
"hello").unwrap(); 63 | fs::write(&b_path, "hello").unwrap(); 64 | 65 | let a = FileContent::from_path(a_path).unwrap(); 66 | let b = FileContent::from_path(b_path).unwrap(); 67 | assert_eq!(a, a); 68 | assert_eq!(b, b); 69 | assert_eq!(a, b); 70 | } 71 | 72 | #[test] 73 | #[cfg(unix)] // Symlinks work differently on Windows 74 | fn symlink() { 75 | let dir = TempDir::new("sametest").unwrap(); 76 | let a_path = dir.path().join("a").into_boxed_path(); 77 | let b_path = dir.path().join("b").into_boxed_path(); 78 | fs::write(&a_path, "hello").unwrap(); 79 | 80 | ::std::os::unix::fs::symlink(&a_path, &b_path).unwrap(); 81 | 82 | let a = FileContent::from_path(a_path).unwrap(); 83 | let b = FileContent::from_path(b_path).unwrap(); 84 | 85 | assert_ne!(a, b); 86 | assert_eq!(b, b); 87 | } 88 | -------------------------------------------------------------------------------- /src/file.rs: -------------------------------------------------------------------------------- 1 | use crate::hasher::Hasher; 2 | use crate::metadata::Metadata; 3 | use smallvec::SmallVec; 4 | use std::cell::RefCell; 5 | use std::cmp::{max, Ordering}; 6 | use std::io; 7 | use std::path::Path; 8 | 9 | #[derive(Debug, Clone)] 10 | pub struct FileSet { 11 | /// Tracks number of hardlinks from stat to also count unseen links outside scanned dirs 12 | pub max_hardlinks: u64, 13 | pub paths: SmallVec<[Box; 1]>, 14 | } 15 | 16 | impl FileSet { 17 | pub fn new(path: Box, max_hardlinks: u64) -> Self { 18 | let mut paths = SmallVec::new(); 19 | paths.push(path); 20 | Self { max_hardlinks, paths } 21 | } 22 | 23 | pub fn push(&mut self, path: Box) { 24 | self.paths.push(path); 25 | } 26 | 27 | /// Number of known hardlinks to this file content 28 | pub fn links(&self) -> u64 { 29 | max(self.max_hardlinks, self.paths.len() as u64) 30 | } 31 | } 32 | 33 | #[derive(Debug)] 34 | /// File content is efficiently compared using this struct's `PartialOrd` implementation 35 | pub struct FileContent { 36 | path: Box, 37 | metadata: Metadata, 38 | /// Hashes of content, calculated incrementally 39 | hashes: RefCell, 40 | } 41 | 42 | impl FileContent { 43 | pub fn from_path(path: Box) -> Result { 44 | let m = Metadata::from_path(&path)?; 45 | Ok(Self::new(path, m)) 46 | } 47 | 48 | #[must_use] 49 | pub fn new(path: Box, metadata: Metadata) -> Self { 50 | Self { 51 | path, 52 | metadata, 53 | hashes: RefCell::new(Hasher::new()), 54 | } 55 | } 56 | } 57 | 58 | impl Eq for FileContent {} 59 | 60 | impl PartialEq for FileContent { 61 | fn eq(&self, other: &Self) -> bool { 62 | self.partial_cmp(other) == Some(Ordering::Equal) 63 | } 64 | } 65 | 66 | impl Ord for FileContent { 67 | fn cmp(&self, other: &Self) -> Ordering { 68 | self.compare(other).unwrap_or(Ordering::Greater) 69 | } 70 | } 71 | 72 | /// That does the bulk of hasing and comparisons 73 | impl PartialOrd for FileContent { 74 | fn partial_cmp(&self, other: &Self) -> Option { 75 | self.compare(other).ok() 76 | } 77 | } 78 | 79 | impl FileContent { 80 | fn compare(&self, other: &Self) -> io::Result { 81 | // Fast pointer comparison 82 | if std::ptr::eq(self, other) { 83 | return Ok(Ordering::Equal); 84 | } 85 | 86 | // Different file sizes mean they're obviously different. 87 | // Also different devices mean they're not the same as far as we're concerned 88 | // (since search is intended for hardlinking and hardlinking only works within the same device). 
89 | let cmp = self.metadata.cmp(&other.metadata); 90 | if cmp != Ordering::Equal { 91 | return Ok(cmp); 92 | } 93 | 94 | let mut hashes1 = self.hashes.borrow_mut(); 95 | let mut hashes2 = other.hashes.borrow_mut(); 96 | 97 | hashes1.compare(&mut hashes2, self.metadata.size, &self.path, &other.path) 98 | } 99 | } 100 | -------------------------------------------------------------------------------- /src/bin.rs: -------------------------------------------------------------------------------- 1 | #[cfg(feature = "json")] use dupe_krill::JsonOutput; use dupe_krill::{RunMode, Scanner, TextUserInterface}; 2 | use getopts::Options; 3 | use std::io::Write; 4 | use std::path::PathBuf; 5 | use std::sync::atomic::{AtomicU32, Ordering}; 6 | use std::{env, io}; 7 | 8 | enum OutputMode { 9 | Quiet, 10 | Text, 11 | Json, 12 | } 13 | static CTRL_C_BREAKS: AtomicU32 = AtomicU32::new(0); 14 | 15 | fn main() { 16 | let mut opts = Options::new(); 17 | opts.optflag("d", "dry-run", "Do not change anything on disk. Only print dupes found"); 18 | opts.optflag("s", "small", "Also dedupe small files (smaller than a disk block)"); 19 | opts.optflag("q", "quiet", "Hide regular progress output"); 20 | opts.optmulti("e", "exclude", "Don't scan directories or files with that filename (wildcards are not supported)", ""); 21 | opts.optflag("", "json", "Display results as JSON"); 22 | opts.optflag("C", "reflink", "Strict reflinking (copy-on-write) instead of hardlinking - WILL FAIL IF unsupported"); 23 | opts.optflag("c", "reflink-or-hardlink", "Try reflinks first, fallback to hardlinks if reflinks are not supported"); 24 | opts.optflag("h", "help", "This help text"); 25 | 26 | let mut args = env::args(); 27 | let program = args.next(); 28 | let program = program.as_deref().unwrap_or(env!("CARGO_PKG_NAME")); 29 | 30 | let matches = opts.parse(args).unwrap(); 31 | let output_mode = if matches.opt_present("json") { 32 | OutputMode::Json 33 | } else if matches.opt_present("quiet") { 34 | OutputMode::Quiet 35 | } else { 36 | OutputMode::Text 37 | }; 38 | 39 | if matches.opt_present("h") || matches.free.is_empty() { 40 | println!( 41 | "Hardlink or reflink files with duplicate content (v{}).\n{}\n\n{}", 42 | env!("CARGO_PKG_VERSION"), 43 | env!("CARGO_PKG_HOMEPAGE"), 44 | opts.usage(&(opts.short_usage(program) + " ")) 45 | ); 46 | return; 47 | } 48 | 49 | ctrlc::set_handler(move || { 50 | CTRL_C_BREAKS.fetch_add(1, Ordering::SeqCst); 51 | }) 52 | .ok(); 53 | 54 | let mut s = Scanner::new(); 55 | s.settings.break_on = Some(&CTRL_C_BREAKS); 56 | 57 | // Determines run mode based on command line options 58 | s.settings.run_mode = if matches.opt_present("dry-run") { 59 | RunMode::DryRun 60 | } else if matches.opt_present("reflink") { 61 | RunMode::Reflink 62 | } else if matches.opt_present("reflink-or-hardlink") { 63 | RunMode::ReflinkOrHardlink 64 | } else { 65 | RunMode::Hardlink 66 | }; 67 | 68 | s.settings.ignore_small = !matches.opt_present("small"); 69 | match output_mode { 70 | OutputMode::Quiet => { 71 | // Noop-output is already set by default. 72 | }, 73 | OutputMode::Text => { 74 | // TODO this print statement belongs into the TextUserInterface. 75 | match s.settings.run_mode { 76 | RunMode::DryRun => println!("Dry run.
No files will be changed."), 77 | RunMode::Reflink => println!("Using reflinks (copy-on-write) for deduplication."), 78 | RunMode::ReflinkOrHardlink => println!("Using reflinks when possible, falling back to hardlinks."), 79 | _ => {}, // Defaults to hardlink mode, no message needed 80 | } 81 | s.set_listener(Box::new(TextUserInterface::new())); 82 | }, 83 | OutputMode::Json => { 84 | if s.settings.run_mode == RunMode::DryRun { 85 | s.settings.run_mode = RunMode::DryRunNoMerging; 86 | } 87 | if cfg!(feature = "json") { 88 | #[cfg(feature = "json")] 89 | s.set_listener(Box::new(JsonOutput::new())); 90 | } else { 91 | writeln!(&mut std::io::stderr(), "This binary was compiled without JSON support.").unwrap(); 92 | std::process::exit(2) 93 | } 94 | }, 95 | } 96 | 97 | s.exclude(matches.opt_strs("exclude")); 98 | 99 | match inner_main(s, matches.free) { 100 | Ok(()) => {}, 101 | Err(err) => { 102 | writeln!(&mut std::io::stderr(), "Error: {err}").unwrap(); 103 | std::process::exit(1); 104 | }, 105 | } 106 | } 107 | 108 | fn inner_main(mut s: Scanner, args: Vec<String>) -> io::Result<()> { 109 | for arg in args { 110 | let path: PathBuf = arg.into(); 111 | s.enqueue(path)?; 112 | } 113 | s.flush() 114 | } 115 | -------------------------------------------------------------------------------- /src/reflink.rs: -------------------------------------------------------------------------------- 1 | use std::path::Path; 2 | use std::{fs, io}; 3 | 4 | #[derive(Debug, Copy, Clone, Eq, PartialEq)] 5 | pub enum LinkType { 6 | Hardlink, 7 | Reflink, 8 | } 9 | 10 | /// Create a reflink (copy-on-write link) between two files 11 | /// Falls back to hardlinking if reflinking is not supported 12 | pub fn reflink_or_hardlink(src: &Path, dst: &Path) -> io::Result<LinkType> { 13 | // Try reflink first 14 | match reflink(src, dst) { 15 | Ok(()) => Ok(LinkType::Reflink), 16 | Err(_) => { 17 | // Fall back to hardlink 18 | fs::hard_link(src, dst)?; 19 | Ok(LinkType::Hardlink) 20 | } 21 | } 22 | } 23 | 24 | /// Create a reflink (copy-on-write link) between two files 25 | pub fn reflink(src: &Path, dst: &Path) -> io::Result<()> { 26 | #[cfg(target_os = "linux")] 27 | { 28 | reflink_linux(src, dst) 29 | } 30 | #[cfg(target_os = "macos")] 31 | { 32 | reflink_macos(src, dst) 33 | } 34 | #[cfg(target_os = "windows")] 35 | { 36 | reflink_windows(src, dst) 37 | } 38 | #[cfg(not(any(target_os = "linux", target_os = "macos", target_os = "windows")))] 39 | { 40 | Err(io::Error::new( 41 | io::ErrorKind::Unsupported, 42 | "Reflinks are not supported on this platform", 43 | )) 44 | } 45 | } 46 | 47 | #[cfg(target_os = "linux")] 48 | fn reflink_linux(src: &Path, dst: &Path) -> io::Result<()> { 49 | use std::ffi::CString; 50 | use std::os::unix::ffi::OsStrExt; 51 | 52 | let src_c = CString::new(src.as_os_str().as_bytes())?; 53 | let dst_c = CString::new(dst.as_os_str().as_bytes())?; 54 | 55 | unsafe { 56 | // First try ioctl FICLONE (Btrfs, XFS) 57 | let src_fd = libc::open(src_c.as_ptr(), libc::O_RDONLY); 58 | if src_fd == -1 { 59 | return Err(io::Error::last_os_error()); 60 | } 61 | 62 | let dst_fd = libc::open(dst_c.as_ptr(), libc::O_WRONLY | libc::O_CREAT | libc::O_EXCL, 0o644); 63 | if dst_fd == -1 { 64 | libc::close(src_fd); 65 | return Err(io::Error::last_os_error()); 66 | } 67 | 68 | // FICLONE ioctl constant - this creates a reflink 69 | const FICLONE: libc::c_ulong = 0x40049409; 70 | let result = libc::ioctl(dst_fd, FICLONE, src_fd); 71 | let err = io::Error::last_os_error(); // capture errno now, before close()/unlink() can overwrite it 72 | libc::close(src_fd); 73 | libc::close(dst_fd); 74 | 75 | if result == 0 { 76 | Ok(()) 77 | } else {
78 | // Clean up the created file on failure 79 | let _ = libc::unlink(dst_c.as_ptr()); 80 | Err(err) 81 | } 82 | } 83 | } 84 | 85 | #[cfg(target_os = "macos")] 86 | fn reflink_macos(src: &Path, dst: &Path) -> io::Result<()> { 87 | use std::ffi::CString; 88 | use std::os::unix::ffi::OsStrExt; 89 | 90 | let src_c = CString::new(src.as_os_str().as_bytes())?; 91 | let dst_c = CString::new(dst.as_os_str().as_bytes())?; 92 | 93 | // Use clonefile() on macOS 94 | unsafe extern "C" { 95 | fn clonefile(src: *const libc::c_char, dst: *const libc::c_char, flags: u32) -> libc::c_int; 96 | } 97 | unsafe { 98 | let result = clonefile(src_c.as_ptr(), dst_c.as_ptr(), 0); 99 | if result == 0 { 100 | Ok(()) 101 | } else { 102 | Err(io::Error::last_os_error()) 103 | } 104 | } 105 | } 106 | 107 | #[cfg(target_os = "windows")] 108 | fn reflink_windows(src: &Path, dst: &Path) -> io::Result<()> { 109 | use std::os::windows::ffi::OsStrExt; 110 | use std::ptr; 111 | 112 | // Convert paths to wide strings for Windows API 113 | let src_wide: Vec<u16> = src.as_os_str().encode_wide().chain(Some(0)).collect(); 114 | let dst_wide: Vec<u16> = dst.as_os_str().encode_wide().chain(Some(0)).collect(); 115 | 116 | unsafe { 117 | // Windows doesn't have a direct equivalent to FICLONE, but it's possible to 118 | // use the CopyFileEx API with the COPY_FILE_CLONE_FORCE flag. 119 | // This requires Windows 10 version 1903 or later with a ReFS filesystem 120 | 121 | unsafe extern "system" { 122 | fn CopyFileExW( 123 | lpExistingFileName: *const u16, 124 | lpNewFileName: *const u16, 125 | lpProgressRoutine: *const u8, 126 | lpData: *const u8, 127 | pbCancel: *const i32, 128 | dwCopyFlags: u32, 129 | ) -> i32; 130 | } 131 | 132 | // COPY_FILE_CLONE_FORCE = 0x00800000 - Force a clone (reflink) 133 | const COPY_FILE_CLONE_FORCE: u32 = 0x00800000; 134 | 135 | let result = CopyFileExW( 136 | src_wide.as_ptr(), 137 | dst_wide.as_ptr(), 138 | ptr::null(), 139 | ptr::null(), 140 | ptr::null(), 141 | COPY_FILE_CLONE_FORCE, 142 | ); 143 | 144 | if result != 0 { 145 | Ok(()) 146 | } else { 147 | Err(io::Error::last_os_error()) 148 | } 149 | } 150 | } 151 | -------------------------------------------------------------------------------- /src/ui.rs: -------------------------------------------------------------------------------- 1 | use crate::scanner::{ScanListener, Scanner, Stats}; 2 | use std::path::Path; 3 | use std::time::{Duration, Instant}; 4 | 5 | #[derive(Debug)] 6 | struct Timing { 7 | // Time in seconds, used to throttle console output 8 | next_update: u64, 9 | start_time: Instant, 10 | } 11 | 12 | #[derive(Debug)] 13 | pub struct UI { 14 | timing: Timing, 15 | } 16 | 17 | impl UI { 18 | #[must_use] 19 | pub fn new() -> Self { 20 | Self { 21 | timing: Timing { 22 | next_update: 0, 23 | start_time: Instant::now(), 24 | }, 25 | } 26 | } 27 | } 28 | 29 | impl ScanListener for UI { 30 | fn file_scanned(&mut self, path: &Path, stats: &Stats) { 31 | let elapsed = self.timing.start_time.elapsed().as_secs(); 32 | if elapsed > self.timing.next_update { 33 | self.timing.next_update = elapsed+1; 34 | println!("{}+{}+{} dupes ({} saved). {}+{} files scanned.
{}/…", 35 | stats.dupes, stats.hardlinks, stats.reflinks, human_size(stats.bytes_deduplicated), stats.added, stats.skipped, 36 | path.parent().unwrap_or(path).display()); 37 | } 38 | } 39 | 40 | #[allow(overlapping_range_endpoints)] 41 | fn scan_over(&self, _: &Scanner, stats: &Stats, scan_duration: Duration) { 42 | let nice_duration = match scan_duration.as_secs() { 43 | x @ 0..=5 => format!("{:.1}s", (x * 1_000_000_000 + u64::from(scan_duration.subsec_nanos())) as f64 / 1_000_000_000f64), 44 | x @ 5..=59 => format!("{x}s"), 45 | x => format!("{}m{}s", x / 60, x % 60), 46 | }; 47 | println!("Dupes found: {}, wasting {}. Existing hardlinks: {}, saving {}. Reflinks created: {}, saving {}. Scanned: {}. Skipped {}. Total scan duration: {}", 48 | stats.dupes, human_size(stats.bytes_deduplicated), stats.hardlinks, human_size(stats.bytes_saved_by_hardlinks), 49 | stats.reflinks, human_size(stats.bytes_saved_by_reflinks), stats.added, stats.skipped, nice_duration); 50 | } 51 | 52 | fn hardlinked(&mut self, src: &Path, dst: &Path) { 53 | println!("Hardlinked {}", combined_paths(src, dst)); 54 | } 55 | 56 | fn reflinked(&mut self, src: &Path, dst: &Path) { 57 | println!("Reflinked {}", combined_paths(src, dst)); 58 | } 59 | 60 | fn duplicate_found(&mut self, src: &Path, dst: &Path) { 61 | println!("Found dupe {}", combined_paths(src, dst)); 62 | } 63 | } 64 | 65 | const POWERS_OF_TWO: [&str; 7] = ["", "k", "M", "G", "T", "P", "E"]; 66 | fn human_size(size: usize) -> String { 67 | let power_threshold = 1024.; 68 | 69 | let mut current_power = 0; 70 | let mut current_power_size = size as f64; 71 | 72 | while current_power_size >= power_threshold { 73 | current_power_size /= 1000_f64; 74 | current_power += 1; 75 | } 76 | 77 | format!("{:.2}{}B", current_power_size, POWERS_OF_TWO[current_power]) 78 | } 79 | 80 | fn combined_paths(base: &Path, relativize: &Path) -> String { 81 | let base: Vec<_> = base.iter().collect(); 82 | let relativize: Vec<_> = relativize.iter().collect(); 83 | 84 | let mut out = String::with_capacity(80); 85 | let mut prefix_len = 0; 86 | for (comp, _) in base.iter().zip(relativize.iter()).take_while(|&(a, b)| a == b) { 87 | prefix_len += 1; 88 | let comp = comp.to_string_lossy(); 89 | out += &comp; 90 | if comp != "/" { 91 | out.push('/'); 92 | } 93 | } 94 | 95 | let suffix: Vec<_> = base.iter().skip(prefix_len).rev().zip(relativize.iter().skip(prefix_len).rev()) 96 | .take_while(|&(a,b)| a==b).map(|(_,b)|b.to_string_lossy()).collect(); 97 | 98 | let base_unique: Vec<_> = base[prefix_len..base.len() - suffix.len()].iter().map(|b| b.to_string_lossy()).collect(); 99 | 100 | out.push('{'); 101 | if base_unique.is_empty() { 102 | out.push('.'); 103 | } else { 104 | out += &base_unique.join("/"); 105 | } 106 | out += " => "; 107 | 108 | let rel_unique: Vec<_> = relativize[prefix_len..relativize.len() - suffix.len()] 109 | .iter() 110 | .map(|b| b.to_string_lossy()) 111 | .collect(); 112 | if rel_unique.is_empty() { 113 | out.push('.'); 114 | } else { 115 | out += &rel_unique.join("/"); 116 | } 117 | out.push('}'); 118 | 119 | for comp in suffix.into_iter().rev() { 120 | out.push('/'); 121 | out += &comp; 122 | } 123 | out 124 | } 125 | 126 | #[test] 127 | fn combined_test() { 128 | use std::path::PathBuf; 129 | let a: PathBuf = "foo/bar/baz/a.txt".into(); 130 | let b: PathBuf = "foo/baz/quz/zzz/a.txt".into(); 131 | let c: PathBuf = "foo/baz/quz/zzz/b.txt".into(); 132 | let d: PathBuf = "b.txt".into(); 133 | let e: PathBuf = "e.txt".into(); 134 | let f: PathBuf = "/foo/bar/baz/a.txt".into(); 135
| let g: PathBuf = "/foo/baz/quz/zzz/a.txt".into(); 136 | let h: PathBuf = "/foo/b/quz/zzz/a.txt".into(); 137 | 138 | assert_eq!(&combined_paths(&a, &b), "foo/{bar/baz => baz/quz/zzz}/a.txt"); 139 | assert_eq!(&combined_paths(&c, &b), "foo/baz/quz/zzz/{b.txt => a.txt}"); 140 | assert_eq!(&combined_paths(&c, &d), "{foo/baz/quz/zzz => .}/b.txt"); 141 | assert_eq!(&combined_paths(&d, &c), "{. => foo/baz/quz/zzz}/b.txt"); 142 | assert_eq!(&combined_paths(&d, &e), "{b.txt => e.txt}"); 143 | assert_eq!(&combined_paths(&f, &g), "/foo/{bar/baz => baz/quz/zzz}/a.txt"); 144 | assert_eq!(&combined_paths(&h, &g), "/foo/{b => baz}/quz/zzz/a.txt"); 145 | } 146 | 147 | #[test] 148 | fn human_size_test() { 149 | assert_eq!(human_size(15632), "15.63kB"); 150 | assert_eq!(human_size(1563244), "1.56MB"); 151 | assert_eq!(human_size(1563244174), "1.56GB"); 152 | assert_eq!(human_size(1563244928194), "1.56TB"); 153 | } 154 | -------------------------------------------------------------------------------- /src/hasher.rs: -------------------------------------------------------------------------------- 1 | use crate::lazyfile::LazyFile; 2 | use smallvec::SmallVec; 3 | use std::cmp::{min, Ordering}; 4 | use std::convert::TryInto; 5 | use std::io; 6 | use std::io::{Read, Seek, SeekFrom}; 7 | use std::path::Path; 8 | 9 | /// A hashed chunk of data of arbitrary size. Files are compared bit by bit. 10 | #[derive(Debug, PartialOrd, Eq, PartialEq, Ord)] 11 | struct HashedRange { 12 | size: u64, 13 | hash: [u8; 20], 14 | } 15 | 16 | impl HashedRange { 17 | pub fn from_file(file: &mut LazyFile<'_>, start: u64, size: u64) -> Result<Self, io::Error> { 18 | let fd = file.fd()?; 19 | fd.seek(SeekFrom::Start(start))?; 20 | let mut hasher = blake3::Hasher::new(); 21 | let mut to_read = size as usize; 22 | let mut data = vec![0; to_read]; 23 | loop { 24 | match fd.read(&mut data[0..to_read]) { 25 | Ok(0) => break, 26 | Ok(n) => { 27 | debug_assert!(n <= to_read); 28 | hasher.update(&data[0..n]); 29 | 30 | to_read -= n; 31 | if to_read == 0 { 32 | break; 33 | } 34 | }, 35 | Err(ref e) if e.kind() == io::ErrorKind::Interrupted => {}, 36 | Err(e) => return Err(e), 37 | } 38 | } 39 | Ok(Self { 40 | hash: hasher.finalize().as_bytes()[0..20].try_into().unwrap(), 41 | size, 42 | }) 43 | } 44 | } 45 | 46 | #[derive(Debug)] 47 | pub struct Hasher { 48 | ranges: SmallVec<[Option<HashedRange>; 1]>, 49 | } 50 | 51 | /// Compares two files using hashes by hashing incrementally until the first difference is found 52 | struct HashIter<'a> { 53 | pub index: usize, 54 | pub start_offset: u64, 55 | pub end_offset: u64, 56 | next_buffer_size: u64, 57 | a_file: LazyFile<'a>, 58 | b_file: LazyFile<'a>, 59 | } 60 | 61 | impl<'h> HashIter<'h> { 62 | pub fn new(size: u64, a_path: &'h Path, b_path: &'h Path) -> Self { 63 | HashIter { 64 | index: 0, 65 | start_offset: 0, 66 | end_offset: size, 67 | next_buffer_size: 2048, 68 | a_file: LazyFile::new(a_path), 69 | b_file: LazyFile::new(b_path), 70 | } 71 | } 72 | 73 | /// Compare (and compute if needed) the next two hashes 74 | pub fn next<'a,'b>(&mut self, a_hash: &'a mut Hasher, b_hash: &'b mut Hasher) -> Result<Option<(&'a HashedRange, &'b HashedRange)>, io::Error> { 75 | if self.start_offset >= self.end_offset { 76 | return Ok(None); 77 | } 78 | 79 | let i = self.index; 80 | let (a_none, b_none, size) = { 81 | let a = a_hash.ranges.get(i); 82 | let b = b_hash.ranges.get(i); 83 | 84 | let failed = a.map_or(false, |a| a.is_none()) || b.map_or(false, |b| b.is_none()); 85 | if failed { 86 | return Err(io::Error::other("cmp i/o")); 87 | } 88 | 89 | // If there is an existing
hashed chunk, the chunk size used for comparison must match it. 90 | let size = a 91 | .and_then(|a| a.as_ref().map(|a| a.size)) 92 | .or(b.and_then(|b| b.as_ref().map(|b| b.size))) 93 | .unwrap_or(min(self.end_offset - self.start_offset, self.next_buffer_size)); 94 | (a.is_none(), b.is_none(), size) 95 | }; 96 | 97 | // If any of the ranges is missing, compute it 98 | if a_none { 99 | a_hash.push(HashedRange::from_file(&mut self.a_file, self.start_offset, size)); 100 | } 101 | if b_none { 102 | b_hash.push(HashedRange::from_file(&mut self.b_file, self.start_offset, size)); 103 | } 104 | 105 | self.index += 1; 106 | self.start_offset += size; 107 | // The buffer size is a trade-off between finding a difference quickly 108 | // and reading files one by one without thrashing. 109 | // Exponential increase is meant to be a compromise that allows finding 110 | // the difference in the first few KB, but grows quickly to read identical files faster. 111 | self.next_buffer_size = min(size * 16, 128 * 1024 * 1024); 112 | 113 | match (a_hash.ranges.get(i), b_hash.ranges.get(i)) { 114 | (Some(Some(a)), Some(Some(b))) => Ok(Some((a, b))), 115 | _ => Err(io::Error::other("cmp i/o")), 116 | } 117 | } 118 | } 119 | 120 | impl Hasher { 121 | #[inline] 122 | pub fn new() -> Self { 123 | Self { ranges: SmallVec::new() } 124 | } 125 | 126 | #[inline] 127 | fn push(&mut self, range: Result<HashedRange, io::Error>) { 128 | let r = match range { 129 | Ok(r) => Some(r), 130 | Err(err) => { 131 | eprintln!("Can't compare files: {err}"); 132 | None 133 | }, 134 | }; 135 | self.ranges.push(r); 136 | } 137 | 138 | /// Incremental comparison reading files lazily 139 | #[inline] 140 | pub fn compare(&mut self, other: &mut Self, size: u64, self_path: &Path, other_path: &Path) -> Result<Ordering, io::Error> { 141 | let mut iter = HashIter::new(size, self_path, other_path); 142 | 143 | while let Some((a, b)) = iter.next(self, other)? { 144 | let ord = a.cmp(b); 145 | if ord != Ordering::Equal { 146 | return Ok(ord); 147 | } 148 | } 149 | Ok(Ordering::Equal) 150 | } 151 | } 152 | 153 | #[cfg(test)] 154 | mod test { 155 | use super::*; 156 | use std::fs; 157 | 158 | #[test] 159 | fn range_hash() { 160 | let tmp = tempdir::TempDir::new("hashtest").expect("tmp"); 161 | let path = &tmp.path().join("a"); 162 | fs::write(path, "aaa\n").expect("write"); 163 | let mut file = LazyFile::new(path); 164 | let hashed = HashedRange::from_file(&mut file, 0, 4).expect("hash"); 165 | 166 | assert_eq!(4, hashed.size); 167 | assert_eq!([22, 179, 164, 66, 194, 34, 185, 88, 69, 62, 115, 203, 129, 138, 81, 160, 96, 190, 209, 11], hashed.hash); 168 | 169 | let hashed = HashedRange::from_file(&mut file, 1, 2).expect("hash2"); 170 | assert_eq!(2, hashed.size); 171 | } 172 | } 173 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Dupe k*r*ill — a fast file deduplicator 2 | 3 | Replaces files that have identical content with hardlinks or reflinks (copy-on-write links), so that file data of all copies is stored only once, saving disk space. Useful for reducing sizes of multiple backups, messy collections of photos and music, countless copies of `node_modules`, macOS app bundles, and anything else that's usually immutable. 4 | 5 | ## Features 6 | 7 | * It's very fast and reasonably memory-efficient. 8 | * Deduplicates incrementally as soon as duplicates are found. 9 | * Replaces files atomically and it's safe to interrupt at any time.
10 | * Proven to be reliable. Used for years without an issue. 11 | * It's aware of existing hardlinks and supports merging of multiple groups of hardlinks. 12 | * **Supports both hardlinks and reflinks (copy-on-write)** for better compatibility and performance. 13 | * Gracefully handles symlinks and special files. 14 | 15 | ## Usage 16 | 17 | [Download binaries from the releases page](https://github.com/kornelski/dupe-krill/releases). 18 | 19 | Works on macOS, Linux, and Windows (with ReFS filesystem for reflink support). 20 | 21 | If you have the [latest stable Rust](https://www.rust-lang.org/) (1.85+), build the program with either `cargo install dupe-krill` or clone this repo and `cargo build --release`. 22 | 23 | ```sh 24 | dupe-krill -d # find dupes without doing anything 25 | dupe-krill # find and replace with hardlinks 26 | dupe-krill --reflink # use reflinks (copy-on-write) when possible 27 | dupe-krill --reflink-or-hardlink # try reflinks first, fallback to hardlinks 28 | ``` 29 | 30 | See `dupe-krill -h` for details. 31 | 32 | ### Output 33 | 34 | It prints one duplicate per line. It prints *both* paths on the same line, with the difference between them highlighted as `{first => second}`. 35 | 36 | Progress shows: 37 | 38 | > `<dupes>`+`<hardlinks>`+`<reflinks>` dupes (`<bytes saved>` saved). `<added>`+`<skipped>` files scanned. 39 | 40 | Symlinks, special device files, and 0-sized files are always skipped. 41 | 42 | Don't try to parse the program's usual output. Add the `--json` option if you want machine-readable output. You can also use this program as a Rust library for seamless integration. 43 | 44 | ## How does deduplication work? 45 | 46 | Files are deduplicated by making either a hardlink or a reflink, depending on the mode chosen: 47 | 48 | ### Hardlinks 49 | The traditional approach creates hardlinks, where literally the same file will exist in two or more directories at once. Unlike symlinks, hardlinks behave like real files. Deleting one hardlink leaves other hardlinks unchanged. Editing a hardlinked file edits it in all places at once (except in some applications that delete & create a new file instead of overwriting). Hardlinking will make all duplicates of a file have the same file permissions. 50 | 51 | ### Reflinks (Copy-on-Write) 52 | A more modern approach that creates reflinks (copy-on-write links). Like hardlinks, reflinks initially point to the same data on disk, saving space. However, when one copy is modified, the filesystem automatically creates a separate copy of only the modified portions. This provides better isolation between files while still saving space for identical content. 53 | 54 | **Platform Support:** 55 | - **Linux**: Uses `FICLONE` ioctl (supported on Btrfs, XFS, and other modern filesystems) 56 | - **macOS**: Uses `clonefile()` system call (supported on APFS) 57 | - **Windows**: Uses `CopyFileEx` with `COPY_FILE_CLONE_FORCE` (requires Windows 10 v1903+ with ReFS filesystem) 58 | 59 | This program will only deduplicate files larger than a single disk block (4KB, usually), because in many filesystems linking tiny files may not actually save space. You can add the `-s` flag to dedupe small files, too. 60 | 61 | ### Nerding out about the fast deduplication algorithm 62 | 63 | In short: it uses Rust's standard library `BTreeMap` for deduplication, but with a twist that allows it to compare files lazily, reading as little file content as necessary. A minimal sketch of the idea follows.
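To make that concrete, here is a minimal sketch of the idea (an illustration only, not the program's actual code — `LazyKey`, the fixed 4 KB prefix, and `group_dupes` are simplifications invented for this example; the real design is developed step by step below):

```rust
use std::cmp::Ordering;
use std::collections::BTreeMap;
use std::fs::File;
use std::io::Read;
use std::path::PathBuf;

/// A map key that is compared by file *content*, read only on demand.
struct LazyKey {
    path: PathBuf,
    size: u64,
}

impl LazyKey {
    /// Read just the first 4 KB. dupe-krill instead hashes exponentially
    /// growing chunks, so it can stop at the first difference.
    fn prefix(&self) -> Vec<u8> {
        let mut buf = vec![0u8; 4096];
        let n = File::open(&self.path).and_then(|mut f| f.read(&mut buf)).unwrap_or(0);
        buf.truncate(n);
        buf
    }
}

impl PartialEq for LazyKey {
    fn eq(&self, other: &Self) -> bool { self.cmp(other) == Ordering::Equal }
}
impl Eq for LazyKey {}
impl PartialOrd for LazyKey {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> { Some(self.cmp(other)) }
}
impl Ord for LazyKey {
    // The comparison operators do file I/O: cheap metadata first,
    // file contents only when the sizes tie.
    fn cmp(&self, other: &Self) -> Ordering {
        self.size.cmp(&other.size)
            .then_with(|| self.prefix().cmp(&other.prefix()))
    }
}

/// Paths that end up under the same key have identical size and prefix,
/// so each bucket with more than one entry is a set of likely dupes.
fn group_dupes(paths: Vec<PathBuf>) -> BTreeMap<LazyKey, Vec<PathBuf>> {
    let mut groups: BTreeMap<LazyKey, Vec<PathBuf>> = BTreeMap::new();
    for path in paths {
        let size = std::fs::metadata(&path).map(|m| m.len()).unwrap_or(0);
        groups.entry(LazyKey { path: path.clone(), size }).or_default().push(path);
    }
    groups
}
```

The real scanner refines this sketch by hashing exponentially growing ranges instead of one fixed prefix (see `src/hasher.rs` above), so a difference is usually found within the first few kilobytes, while identical files are read in increasingly large sequential blocks.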
64 | 65 | ---- 66 | 67 | Theoretically, you could find all duplicate files by putting them in a giant hash table aggregating file paths and using file content as the key: 68 | 69 | ```rust 70 | HashMap<Vec<u8>, Vec<Path>> 71 | ``` 72 | 73 | but of course that would use ludicrous amounts of memory. You can fix it by using hashes of the content instead of the content itself. 74 | 75 | > BTW, I can't stress enough how mind-bogglingly improbable accidental cryptographic hash collisions are. It's not just "you're probably safe if you're lucky". It's "creating this many files would take more energy than our civilisation has ever produced in all of its history". 76 | 77 | ```rust 78 | HashMap<[u8; 16], Vec<Path>> 79 | ``` 80 | 81 | but that's still pretty slow, since you still read the entire content of all the files. You can save some work by comparing file sizes first: 82 | 83 | ```rust 84 | HashMap<(u64, [u8; 16]), Vec<Path>> 85 | ``` 86 | 87 | but it helps only a little, since files with identical sizes are surprisingly common. You can eliminate a few more near-duplicates by comparing only the beginnings of the files first: 88 | 89 | ```rust 90 | HashMap<(u64, [u8; 16]), HashMap<[u8; 16], Vec<Path>>> 91 | ``` 92 | 93 | and then maybe compare only the ends, and maybe a few more fragments in the middle, etc.: 94 | 95 | ```rust 96 | HashMap<(u64, [u8; 16]), HashMap<[u8; 16], HashMap<[u8; 16], Vec<Path>>>> 97 | HashMap<(u64, [u8; 16]), HashMap<[u8; 16], HashMap<[u8; 16], HashMap<[u8; 16], Vec<Path>>>>> 98 | ``` 99 | 100 | These endlessly nested hashmaps can be generalized. `BTreeMap` doesn't need to see the whole key at once. It only compares keys with each other, and the comparison can be done incrementally — by only reading enough of the file to show that its key is unique, without even knowing the full key. 101 | 102 | ```rust 103 | BTreeMap<LazilyHashing<File>, Vec<Path>> 104 | ``` 105 | 106 | And that's what this program does (and a bit of wrangling with inodes). 107 | 108 | The whole heavy lifting of deduplication is done by Rust's standard library `BTreeMap` and overloaded `<`/`>` operators that incrementally hash the files (yes, operator overloading that does file I/O is a brilliant idea. I couldn't use `<<`, unfortunately). 109 | -------------------------------------------------------------------------------- /Cargo.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Cargo. 2 | # It is not intended for manual editing.
3 | version = 3 4 | 5 | [[package]] 6 | name = "arrayref" 7 | version = "0.3.9" 8 | source = "registry+https://github.com/rust-lang/crates.io-index" 9 | checksum = "76a2e8124351fda1ef8aaaa3bbd7ebbcb486bbcd4225aca0aa0d84bb2db8fecb" 10 | 11 | [[package]] 12 | name = "arrayvec" 13 | version = "0.7.6" 14 | source = "registry+https://github.com/rust-lang/crates.io-index" 15 | checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" 16 | 17 | [[package]] 18 | name = "bitflags" 19 | version = "2.9.4" 20 | source = "registry+https://github.com/rust-lang/crates.io-index" 21 | checksum = "2261d10cca569e4643e526d8dc2e62e433cc8aba21ab764233731f8d369bf394" 22 | 23 | [[package]] 24 | name = "blake3" 25 | version = "1.8.2" 26 | source = "registry+https://github.com/rust-lang/crates.io-index" 27 | checksum = "3888aaa89e4b2a40fca9848e400f6a658a5a3978de7be858e209cafa8be9a4a0" 28 | dependencies = [ 29 | "arrayref", 30 | "arrayvec", 31 | "cc", 32 | "cfg-if", 33 | "constant_time_eq", 34 | ] 35 | 36 | [[package]] 37 | name = "cc" 38 | version = "1.2.40" 39 | source = "registry+https://github.com/rust-lang/crates.io-index" 40 | checksum = "e1d05d92f4b1fd76aad469d46cdd858ca761576082cd37df81416691e50199fb" 41 | dependencies = [ 42 | "find-msvc-tools", 43 | "shlex", 44 | ] 45 | 46 | [[package]] 47 | name = "cfg-if" 48 | version = "1.0.3" 49 | source = "registry+https://github.com/rust-lang/crates.io-index" 50 | checksum = "2fd1289c04a9ea8cb22300a459a72a385d7c73d3259e2ed7dcb2af674838cfa9" 51 | 52 | [[package]] 53 | name = "cfg_aliases" 54 | version = "0.2.1" 55 | source = "registry+https://github.com/rust-lang/crates.io-index" 56 | checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724" 57 | 58 | [[package]] 59 | name = "constant_time_eq" 60 | version = "0.3.1" 61 | source = "registry+https://github.com/rust-lang/crates.io-index" 62 | checksum = "7c74b8349d32d297c9134b8c88677813a227df8f779daa29bfc29c183fe3dca6" 63 | 64 | [[package]] 65 | name = "ctrlc" 66 | version = "3.5.0" 67 | source = "registry+https://github.com/rust-lang/crates.io-index" 68 | checksum = "881c5d0a13b2f1498e2306e82cbada78390e152d4b1378fb28a84f4dcd0dc4f3" 69 | dependencies = [ 70 | "dispatch", 71 | "nix", 72 | "windows-sys", 73 | ] 74 | 75 | [[package]] 76 | name = "dispatch" 77 | version = "0.2.0" 78 | source = "registry+https://github.com/rust-lang/crates.io-index" 79 | checksum = "bd0c93bb4b0c6d9b77f4435b0ae98c24d17f1c45b2ff844c6151a07256ca923b" 80 | 81 | [[package]] 82 | name = "dupe-krill" 83 | version = "1.5.0" 84 | dependencies = [ 85 | "blake3", 86 | "ctrlc", 87 | "getopts", 88 | "libc", 89 | "serde", 90 | "serde_derive", 91 | "serde_json", 92 | "smallvec", 93 | "tempdir", 94 | ] 95 | 96 | [[package]] 97 | name = "find-msvc-tools" 98 | version = "0.1.3" 99 | source = "registry+https://github.com/rust-lang/crates.io-index" 100 | checksum = "0399f9d26e5191ce32c498bebd31e7a3ceabc2745f0ac54af3f335126c3f24b3" 101 | 102 | [[package]] 103 | name = "fuchsia-cprng" 104 | version = "0.1.1" 105 | source = "registry+https://github.com/rust-lang/crates.io-index" 106 | checksum = "a06f77d526c1a601b7c4cdd98f54b5eaabffc14d5f2f0296febdc7f357c6d3ba" 107 | 108 | [[package]] 109 | name = "getopts" 110 | version = "0.2.24" 111 | source = "registry+https://github.com/rust-lang/crates.io-index" 112 | checksum = "cfe4fbac503b8d1f88e6676011885f34b7174f46e59956bba534ba83abded4df" 113 | dependencies = [ 114 | "unicode-width", 115 | ] 116 | 117 | [[package]] 118 | name = "itoa" 119 | version = "1.0.15" 120 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 121 | checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" 122 | 123 | [[package]] 124 | name = "libc" 125 | version = "0.2.176" 126 | source = "registry+https://github.com/rust-lang/crates.io-index" 127 | checksum = "58f929b4d672ea937a23a1ab494143d968337a5f47e56d0815df1e0890ddf174" 128 | 129 | [[package]] 130 | name = "memchr" 131 | version = "2.7.6" 132 | source = "registry+https://github.com/rust-lang/crates.io-index" 133 | checksum = "f52b00d39961fc5b2736ea853c9cc86238e165017a493d1d5c8eac6bdc4cc273" 134 | 135 | [[package]] 136 | name = "nix" 137 | version = "0.30.1" 138 | source = "registry+https://github.com/rust-lang/crates.io-index" 139 | checksum = "74523f3a35e05aba87a1d978330aef40f67b0304ac79c1c00b294c9830543db6" 140 | dependencies = [ 141 | "bitflags", 142 | "cfg-if", 143 | "cfg_aliases", 144 | "libc", 145 | ] 146 | 147 | [[package]] 148 | name = "proc-macro2" 149 | version = "1.0.101" 150 | source = "registry+https://github.com/rust-lang/crates.io-index" 151 | checksum = "89ae43fd86e4158d6db51ad8e2b80f313af9cc74f5c0e03ccb87de09998732de" 152 | dependencies = [ 153 | "unicode-ident", 154 | ] 155 | 156 | [[package]] 157 | name = "quote" 158 | version = "1.0.41" 159 | source = "registry+https://github.com/rust-lang/crates.io-index" 160 | checksum = "ce25767e7b499d1b604768e7cde645d14cc8584231ea6b295e9c9eb22c02e1d1" 161 | dependencies = [ 162 | "proc-macro2", 163 | ] 164 | 165 | [[package]] 166 | name = "rand" 167 | version = "0.4.6" 168 | source = "registry+https://github.com/rust-lang/crates.io-index" 169 | checksum = "552840b97013b1a26992c11eac34bdd778e464601a4c2054b5f0bff7c6761293" 170 | dependencies = [ 171 | "fuchsia-cprng", 172 | "libc", 173 | "rand_core 0.3.1", 174 | "rdrand", 175 | "winapi", 176 | ] 177 | 178 | [[package]] 179 | name = "rand_core" 180 | version = "0.3.1" 181 | source = "registry+https://github.com/rust-lang/crates.io-index" 182 | checksum = "7a6fdeb83b075e8266dcc8762c22776f6877a63111121f5f8c7411e5be7eed4b" 183 | dependencies = [ 184 | "rand_core 0.4.2", 185 | ] 186 | 187 | [[package]] 188 | name = "rand_core" 189 | version = "0.4.2" 190 | source = "registry+https://github.com/rust-lang/crates.io-index" 191 | checksum = "9c33a3c44ca05fa6f1807d8e6743f3824e8509beca625669633be0acbdf509dc" 192 | 193 | [[package]] 194 | name = "rdrand" 195 | version = "0.4.0" 196 | source = "registry+https://github.com/rust-lang/crates.io-index" 197 | checksum = "678054eb77286b51581ba43620cc911abf02758c91f93f479767aed0f90458b2" 198 | dependencies = [ 199 | "rand_core 0.3.1", 200 | ] 201 | 202 | [[package]] 203 | name = "remove_dir_all" 204 | version = "0.5.3" 205 | source = "registry+https://github.com/rust-lang/crates.io-index" 206 | checksum = "3acd125665422973a33ac9d3dd2df85edad0f4ae9b00dafb1a05e43a9f5ef8e7" 207 | dependencies = [ 208 | "winapi", 209 | ] 210 | 211 | [[package]] 212 | name = "ryu" 213 | version = "1.0.20" 214 | source = "registry+https://github.com/rust-lang/crates.io-index" 215 | checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f" 216 | 217 | [[package]] 218 | name = "serde" 219 | version = "1.0.228" 220 | source = "registry+https://github.com/rust-lang/crates.io-index" 221 | checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" 222 | dependencies = [ 223 | "serde_core", 224 | ] 225 | 226 | [[package]] 227 | name = "serde_core" 228 | version = "1.0.228" 229 | source = "registry+https://github.com/rust-lang/crates.io-index" 230 | 
checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" 231 | dependencies = [ 232 | "serde_derive", 233 | ] 234 | 235 | [[package]] 236 | name = "serde_derive" 237 | version = "1.0.228" 238 | source = "registry+https://github.com/rust-lang/crates.io-index" 239 | checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" 240 | dependencies = [ 241 | "proc-macro2", 242 | "quote", 243 | "syn", 244 | ] 245 | 246 | [[package]] 247 | name = "serde_json" 248 | version = "1.0.145" 249 | source = "registry+https://github.com/rust-lang/crates.io-index" 250 | checksum = "402a6f66d8c709116cf22f558eab210f5a50187f702eb4d7e5ef38d9a7f1c79c" 251 | dependencies = [ 252 | "itoa", 253 | "memchr", 254 | "ryu", 255 | "serde", 256 | "serde_core", 257 | ] 258 | 259 | [[package]] 260 | name = "shlex" 261 | version = "1.3.0" 262 | source = "registry+https://github.com/rust-lang/crates.io-index" 263 | checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" 264 | 265 | [[package]] 266 | name = "smallvec" 267 | version = "1.15.1" 268 | source = "registry+https://github.com/rust-lang/crates.io-index" 269 | checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" 270 | 271 | [[package]] 272 | name = "syn" 273 | version = "2.0.106" 274 | source = "registry+https://github.com/rust-lang/crates.io-index" 275 | checksum = "ede7c438028d4436d71104916910f5bb611972c5cfd7f89b8300a8186e6fada6" 276 | dependencies = [ 277 | "proc-macro2", 278 | "quote", 279 | "unicode-ident", 280 | ] 281 | 282 | [[package]] 283 | name = "tempdir" 284 | version = "0.3.7" 285 | source = "registry+https://github.com/rust-lang/crates.io-index" 286 | checksum = "15f2b5fb00ccdf689e0149d1b1b3c03fead81c2b37735d812fa8bddbbf41b6d8" 287 | dependencies = [ 288 | "rand", 289 | "remove_dir_all", 290 | ] 291 | 292 | [[package]] 293 | name = "unicode-ident" 294 | version = "1.0.19" 295 | source = "registry+https://github.com/rust-lang/crates.io-index" 296 | checksum = "f63a545481291138910575129486daeaf8ac54aee4387fe7906919f7830c7d9d" 297 | 298 | [[package]] 299 | name = "unicode-width" 300 | version = "0.2.2" 301 | source = "registry+https://github.com/rust-lang/crates.io-index" 302 | checksum = "b4ac048d71ede7ee76d585517add45da530660ef4390e49b098733c6e897f254" 303 | 304 | [[package]] 305 | name = "winapi" 306 | version = "0.3.9" 307 | source = "registry+https://github.com/rust-lang/crates.io-index" 308 | checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" 309 | dependencies = [ 310 | "winapi-i686-pc-windows-gnu", 311 | "winapi-x86_64-pc-windows-gnu", 312 | ] 313 | 314 | [[package]] 315 | name = "winapi-i686-pc-windows-gnu" 316 | version = "0.4.0" 317 | source = "registry+https://github.com/rust-lang/crates.io-index" 318 | checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" 319 | 320 | [[package]] 321 | name = "winapi-x86_64-pc-windows-gnu" 322 | version = "0.4.0" 323 | source = "registry+https://github.com/rust-lang/crates.io-index" 324 | checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" 325 | 326 | [[package]] 327 | name = "windows-link" 328 | version = "0.2.1" 329 | source = "registry+https://github.com/rust-lang/crates.io-index" 330 | checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" 331 | 332 | [[package]] 333 | name = "windows-sys" 334 | version = "0.61.2" 335 | source = "registry+https://github.com/rust-lang/crates.io-index" 336 | checksum = 
"ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc" 337 | dependencies = [ 338 | "windows-link", 339 | ] 340 | -------------------------------------------------------------------------------- /src/scanner.rs: -------------------------------------------------------------------------------- 1 | use crate::file::{FileContent, FileSet}; 2 | use crate::metadata::Metadata; 3 | use crate::reflink::{LinkType, reflink, reflink_or_hardlink}; 4 | use std::cell::RefCell; 5 | use std::cmp; 6 | use std::collections::btree_map::Entry as BTreeEntry; 7 | use std::collections::hash_map::Entry as HashEntry; 8 | use std::collections::BTreeMap; 9 | use std::collections::BinaryHeap; 10 | use std::collections::HashMap; 11 | use std::collections::HashSet; 12 | use std::ffi::OsString; 13 | use std::fmt::Debug; 14 | use std::fs; 15 | use std::io; 16 | #[cfg(unix)] 17 | use std::os::unix::fs::MetadataExt; 18 | use std::path::Path; 19 | use std::rc::Rc; 20 | use std::sync::atomic::AtomicU32; 21 | use std::sync::atomic::Ordering; 22 | use std::time::{Duration, Instant}; 23 | 24 | // Platform-specific metadata access functions 25 | #[cfg(unix)] 26 | fn get_inode(metadata: &fs::Metadata) -> u64 { 27 | metadata.ino() 28 | } 29 | 30 | #[cfg(windows)] 31 | fn get_inode(metadata: &fs::Metadata) -> u64 { 32 | // Windows doesn't have inodes, but we can create a simple hash-based substitute 33 | // This is a simplified approach - for production use, more sophisticated methods 34 | // might be needed to ensure uniqueness 35 | use std::collections::hash_map::DefaultHasher; 36 | use std::hash::{Hash, Hasher}; 37 | 38 | let mut hasher = DefaultHasher::new(); 39 | metadata.size().hash(&mut hasher); 40 | metadata.modified().unwrap_or(std::time::SystemTime::UNIX_EPOCH).hash(&mut hasher); 41 | hasher.finish() 42 | } 43 | 44 | #[cfg(unix)] 45 | fn get_device(metadata: &fs::Metadata) -> u64 { 46 | metadata.dev() 47 | } 48 | 49 | #[cfg(windows)] 50 | fn get_device(_metadata: &fs::Metadata) -> u64 { 51 | // On Windows, we'll use a simple constant for device identification 52 | // This means hardlinking across different drives won't work properly, 53 | // but that's expected behavior and matches filesystem limitations 54 | 0 55 | } 56 | 57 | // Helper functions to get the proper size (accounting for block overhead) 58 | #[cfg(unix)] 59 | fn get_size(metadata: &fs::Metadata) -> u64 { 60 | metadata.size() 61 | } 62 | 63 | #[cfg(windows)] 64 | fn get_size(metadata: &fs::Metadata) -> u64 { 65 | // Windows polyfill: round up to the next 4KB block to account for block overhead 66 | let len = metadata.size(); 67 | const BLOCK_SIZE: u64 = 4096; 68 | ((len + BLOCK_SIZE - 1) / BLOCK_SIZE) * BLOCK_SIZE 69 | } 70 | 71 | #[derive(Debug, Copy, Clone, Eq, PartialEq)] 72 | pub enum RunMode { 73 | /// Merges paths in memory, but not on disk. Gives realistic UI output. 74 | DryRun, 75 | /// Like dry run, but completely skips deduping, with no UI for dupes. 76 | DryRunNoMerging, 77 | Hardlink, 78 | Reflink, 79 | /// Try reflinking first, fall back to hardlinking if reflinking fails 80 | ReflinkOrHardlink, 81 | } 82 | 83 | #[derive(Debug)] 84 | pub struct Settings { 85 | /// Ignore files smaller than a filesystem block. 86 | /// Deduping of such files is unlikely to save space. 87 | pub ignore_small: bool, 88 | pub run_mode: RunMode, 89 | 90 | // If 1, go to flush. If > 1, abort immediately. 
91 | pub break_on: Option<&'static AtomicU32>, 92 | } 93 | 94 | impl Settings { 95 | pub fn breaks(&self) -> u32 { 96 | if let Some(break_on) = self.break_on { 97 | break_on.load(Ordering::SeqCst) 98 | } else { 99 | 0 100 | } 101 | } 102 | } 103 | 104 | #[derive(Debug, Default, Copy, Clone)] 105 | #[cfg_attr(feature = "json", derive(serde_derive::Serialize))] 106 | pub struct Stats { 107 | pub added: usize, 108 | pub skipped: usize, 109 | pub dupes: usize, 110 | pub bytes_deduplicated: usize, 111 | pub hardlinks: usize, 112 | pub bytes_saved_by_hardlinks: usize, 113 | pub reflinks: usize, 114 | pub bytes_saved_by_reflinks: usize, 115 | } 116 | 117 | pub trait ScanListener: Debug { 118 | fn file_scanned(&mut self, path: &Path, stats: &Stats); 119 | fn scan_over(&self, scanner: &Scanner, stats: &Stats, scan_duration: Duration); 120 | fn hardlinked(&mut self, src: &Path, dst: &Path); 121 | fn reflinked(&mut self, src: &Path, dst: &Path); 122 | fn duplicate_found(&mut self, src: &Path, dst: &Path); 123 | } 124 | 125 | #[derive(Debug)] 126 | struct SilentListener; 127 | impl ScanListener for SilentListener { 128 | fn file_scanned(&mut self, _: &Path, _: &Stats) {} 129 | 130 | fn scan_over(&self, _: &Scanner, _: &Stats, _: Duration) {} 131 | 132 | fn hardlinked(&mut self, _: &Path, _: &Path) {} 133 | 134 | fn reflinked(&mut self, _: &Path, _: &Path) {} 135 | 136 | fn duplicate_found(&mut self, _: &Path, _: &Path) {} 137 | } 138 | 139 | type RcFileSet = Rc<RefCell<FileSet>>; 140 | 141 | #[derive(Debug)] 142 | pub struct Scanner { 143 | /// All hardlinks of the same inode have to be treated as the same file 144 | by_inode: HashMap<(u64, u64), RcFileSet>, 145 | /// See Hasher for explanation 146 | by_content: BTreeMap<FileContent, Vec<RcFileSet>>, 147 | /// Directories left to scan. Sorted by inode number. 148 | /// I'm assuming scanning in this order is faster, since inode is related to file's age, 149 | /// which is related to its physical position on disk, which makes the scan more sequential. 150 | to_scan: BinaryHeap<(u64, Box<Path>)>, 151 | 152 | scan_listener: Box<dyn ScanListener>, 153 | stats: Stats, 154 | exclude: HashSet<OsString>, 155 | pub settings: Settings, 156 | 157 | deferred_count: usize, 158 | next_deferred_count: usize, 159 | } 160 | 161 | impl Scanner { 162 | #[must_use] 163 | pub fn new() -> Self { 164 | Self { 165 | settings: Settings { 166 | ignore_small: true, 167 | run_mode: RunMode::Hardlink, 168 | break_on: None, 169 | }, 170 | by_inode: HashMap::new(), 171 | by_content: BTreeMap::new(), 172 | to_scan: BinaryHeap::new(), 173 | scan_listener: Box::new(SilentListener), 174 | stats: Stats::default(), 175 | exclude: HashSet::new(), 176 | deferred_count: 0, 177 | next_deferred_count: 4096, 178 | } 179 | } 180 | 181 | pub fn exclude(&mut self, exclude: Vec<String>) { 182 | self.exclude = exclude.into_iter().map(From::from).collect(); 183 | } 184 | 185 | /// Set the scan listener. Caution: This overrides previously set listeners! 186 | /// Use a multiplexing listener if multiple listeners are required. 187 | pub fn set_listener(&mut self, listener: Box<dyn ScanListener>) { 188 | self.scan_listener = listener; 189 | } 190 | 191 | /// Scan any file or directory for dupes. 192 | /// Dedupe is done within the path as well as against all previously added paths.
    pub fn scan(&mut self, path: impl AsRef<Path>) -> io::Result<()> {
194 |         self.enqueue(path)?;
195 |         self.flush()?;
196 |         Ok(())
197 |     }
198 | 
199 |     pub fn enqueue(&mut self, path: impl AsRef<Path>) -> io::Result<()> {
200 |         let path = fs::canonicalize(path)?.into_boxed_path();
201 |         let metadata = fs::symlink_metadata(&path)?;
202 |         self.add(path, &metadata)?;
203 |         Ok(())
204 |     }
205 | 
206 |     /// Drains the queue of directories to scan
207 |     pub fn flush(&mut self) -> io::Result<()> {
208 |         let start_time = Instant::now();
209 |         while let Some((_, path)) = self.to_scan.pop() {
210 |             if let Err(err) = self.scan_dir(&path) {
211 |                 eprintln!("Error scanning {}: {}", path.display(), err);
212 |                 self.stats.skipped += 1;
213 |             }
214 |             if self.settings.breaks() > 0 {
215 |                 eprintln!("Stopping scan");
216 |                 break;
217 |             }
218 |         }
219 |         self.flush_deferred();
220 |         let scan_duration = Instant::now().duration_since(start_time);
221 |         self.scan_listener.scan_over(self, &self.stats, scan_duration);
222 |         Ok(())
223 |     }
224 | 
225 |     fn scan_dir(&mut self, path: &Path) -> io::Result<()> {
226 |         // Errors are ignored here, since it's very common to find permission-denied files and unreadable symlinks,
227 |         // and it'd be annoying if that aborted the whole operation.
228 |         // FIXME: store the errors somehow to report them in a controlled manner
229 |         for entry in fs::read_dir(path)?.filter_map(|p| p.ok()) {
230 |             if self.settings.breaks() > 0 {
231 |                 break;
232 |             }
233 | 
234 |             let path = entry.path();
235 |             if let Some(file_name) = path.file_name() {
236 |                 if self.exclude.contains(file_name) {
237 |                     self.stats.skipped += 1;
238 |                     continue;
239 |                 }
240 |             }
241 |             if let Err(err) = self.add(path.into_boxed_path(), &entry.metadata()?) {
242 |                 eprintln!("{}: {}", entry.path().display(), err);
243 |             }
244 |         }
245 |         Ok(())
246 |     }
247 | 
248 |     fn add(&mut self, path: Box<Path>, metadata: &fs::Metadata) -> io::Result<()> {
249 |         self.scan_listener.file_scanned(&path, &self.stats);
250 | 
251 |         let ty = metadata.file_type();
252 |         if ty.is_dir() {
253 |             // The inode number is truncated to group scanning of roughly adjacent inodes together,
254 |             // while still preserving some directory traversal order.
255 |             // Negated so that the highest (presumably newest) inodes are scanned first.
256 |             let order_key = !(get_inode(metadata) >> 8);
257 |             self.to_scan.push((order_key, path));
258 |             return Ok(());
259 |         } else if ty.is_symlink() || !ty.is_file() {
260 |             // Supporting symlink traversal would require preventing loops.
261 |             // Deduping /dev/ would be funny
262 |             self.stats.skipped += 1;
263 |             return Ok(());
264 |         }
265 | 
266 |         // APFS reports 4*MB* block size, hence the 16KB cap
267 |         // On Windows, use a reasonable default block size, since blksize() doesn't exist there
268 |         #[cfg(unix)]
269 |         let small_size = cmp::min(16 * 1024, metadata.blksize());
270 |         #[cfg(windows)]
271 |         let small_size = 4096u64; // assume 4KB blocks on Windows
272 | 
273 |         if get_size(metadata) == 0 || (self.settings.ignore_small && get_size(metadata) < small_size) {
274 |             self.stats.skipped += 1;
275 |             return Ok(());
276 |         }
277 |         self.stats.added += 1;
278 | 
279 |         if let Some(fileset) = self.new_fileset(&path, metadata) {
280 |             self.dedupe_by_content(fileset, path, metadata)?;
281 |         } else {
282 |             self.stats.hardlinks += 1; // this inode was already seen, so the path is a hardlink of a known file
283 |             self.stats.bytes_saved_by_hardlinks += get_size(metadata) as usize;
284 |         }
285 |         Ok(())
286 |     }
287 | 
288 |     /// Creates a new fileset if it's a new file.
289 |     /// Returns None if it's a hardlink of a file already seen.
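    ///
    /// Filesets are keyed by the (device, inode) pair, because inode numbers are only
    /// unique within a single filesystem. An illustrative example with made-up numbers:
    ///
    /// ```text
    /// "a/cat.jpg"      -> (dev 66306, ino 524401)   new key: a fresh fileset is created
    /// "b/cat-copy.jpg" -> (dev 66306, ino 524401)   same key: the path joins the existing set
    /// ```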
    fn new_fileset(&mut self, path: &Path, metadata: &fs::Metadata) -> Option<RcFileSet> {
291 |         let path: Box<Path> = path.into();
292 | 
293 |         // On Windows, skip the by_inode check entirely, since Windows doesn't expose
294 |         // proper inodes and hardlink counts; every path gets its own fresh fileset
295 |         #[cfg(windows)]
296 |         {
297 |             let fileset = Rc::new(RefCell::new(FileSet::new(path, 1u64)));
298 |             Some(fileset)
299 |         }
300 | 
301 |         #[cfg(unix)]
302 |         {
303 |             let device_inode = (get_device(metadata), get_inode(metadata));
304 | 
305 |             match self.by_inode.entry(device_inode) {
306 |                 HashEntry::Vacant(e) => {
307 |                     let links = metadata.nlink();
308 |                     let fileset = Rc::new(RefCell::new(FileSet::new(path, links)));
309 |                     e.insert(Rc::clone(&fileset)); // clone just bumps a refcount here
310 |                     Some(fileset)
311 |                 },
312 |                 HashEntry::Occupied(mut e) => {
313 |                     // This case may require a deferred deduping later,
314 |                     // if the new link belongs to an old fileset that has already been deduped.
315 |                     let mut t = e.get_mut().borrow_mut();
316 |                     t.push(path);
317 |                     None
318 |                 },
319 |             }
320 |         }
321 |     }
322 | 
323 |     /// Here's where all the magic happens
324 |     fn dedupe_by_content(&mut self, fileset: RcFileSet, path: Box<Path>, metadata: &fs::Metadata) -> io::Result<()> {
325 |         let mut deferred = false;
326 |         match self.by_content.entry(FileContent::new(path, Metadata::new(metadata))) {
327 |             BTreeEntry::Vacant(e) => {
328 |                 // Seems unique so far
329 |                 e.insert(vec![fileset]);
330 |             },
331 |             BTreeEntry::Occupied(mut e) => {
332 |                 // Found a dupe!
333 |                 self.stats.dupes += 1;
334 |                 self.stats.bytes_deduplicated += get_size(metadata) as usize;
335 |                 let filesets = e.get_mut();
336 |                 filesets.push(fileset);
337 |                 // Deduping can be done either immediately or later. Immediate is more cache-friendly and interactive,
338 |                 // but for files that already have hardlinks it can cause unnecessary re-linking. So if there are
339 |                 // hardlinks in the set, wait until the end to dedupe, when all hardlinks are known.
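                // Illustrative scenario (not in the original comments): if A and B are
                // content dupes and B already has unscanned hardlinks B1 and B2, linking
                // A into B's group right away could merge into the smaller group.
                // Deferring until B1 and B2 have been seen lets dedupe() merge everything
                // into the largest hardlink group in a single pass.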
                if filesets.iter().all(|set| set.borrow().links() == 1) {
341 |                     Self::dedupe(filesets, self.settings.run_mode, &mut *self.scan_listener, &mut self.stats)?;
342 |                 } else {
343 |                     deferred = true;
344 |                 }
345 |             },
346 |         }
347 | 
348 |         // Periodically flush deferred files to avoid building a huge queue
349 |         // (the growing limit is a compromise between responsiveness
350 |         // and the risk of hitting a pathological case of hardlinking with wrong hardlink groups)
351 |         if deferred {
352 |             self.deferred_count += 1;
353 |             if self.deferred_count >= self.next_deferred_count {
354 |                 self.next_deferred_count *= 2;
355 |                 self.deferred_count = 0;
356 |                 self.flush_deferred();
357 |             }
358 |         }
359 |         Ok(())
360 |     }
361 | 
362 |     fn flush_deferred(&mut self) {
363 |         for filesets in self.by_content.values_mut() {
364 |             if self.settings.breaks() > 1 {
365 |                 eprintln!("Aborting");
366 |                 break;
367 |             }
368 |             if let Err(err) = Self::dedupe(filesets, self.settings.run_mode, &mut *self.scan_listener, &mut self.stats) {
369 |                 eprintln!("{err}");
370 |             }
371 |         }
372 |     }
373 | 
374 |     fn dedupe(filesets: &[RcFileSet], run_mode: RunMode, scan_listener: &mut dyn ScanListener, stats: &mut Stats) -> io::Result<()> {
375 |         if run_mode == RunMode::DryRunNoMerging {
376 |             return Ok(());
377 |         }
378 | 
379 |         // Find the fileset with the most hardlinks, since it's less work to merge a small group into a large one
380 |         let mut largest_idx = 0;
381 |         let mut largest_links = 0;
382 |         let mut nonempty_filesets = 0;
383 |         for (idx, fileset) in filesets.iter().enumerate() {
384 |             let fileset = fileset.borrow();
385 |             if !fileset.paths.is_empty() {
386 |                 // Only actual paths we can merge matter here
387 |                 nonempty_filesets += 1;
388 |             }
389 |             let links = fileset.links();
390 |             if links > largest_links {
391 |                 largest_idx = idx;
392 |                 largest_links = links;
393 |             }
394 |         }
395 | 
396 |         if nonempty_filesets == 0 {
397 |             return Ok(()); // Already merged
398 |         }
399 | 
400 |         // The set is still going to be in use! So everything has to be updated to make sense for the next call.
401 |         let merged_paths = &mut { filesets[largest_idx].borrow_mut() }.paths; // the block-temporary RefMut lives until the end of the function
402 |         let source_path = merged_paths[0].clone();
403 | 
404 |         // Get the file size for statistics tracking
405 |         let file_size = get_size(&fs::symlink_metadata(&source_path)?) as usize;
406 | 
407 |         for (i, set) in filesets.iter().enumerate() {
408 |             // We don't want to merge the set with itself
409 |             if i == largest_idx {
410 |                 continue;
411 |             }
412 | 
413 |             let paths = &mut set.borrow_mut().paths;
414 |             // dest_path will be "lost" on error, but that's fine, since we don't want to dedupe it if it causes errors
415 |             for dest_path in paths.drain(..) {
416 |                 assert_ne!(&source_path, &dest_path);
417 |                 debug_assert_ne!(get_inode(&fs::symlink_metadata(&source_path)?), get_inode(&fs::symlink_metadata(&dest_path)?));
418 | 
419 |                 if run_mode == RunMode::DryRun {
420 |                     scan_listener.duplicate_found(&dest_path, &source_path);
421 |                     merged_paths.push(dest_path);
422 |                     continue;
423 |                 }
424 | 
425 |                 let temp_path = dest_path.with_file_name(".tmp-dupe-e1iIQcBFn5pC4MUSm-xkcd-221"); // link here first, then rename over dest_path, so the destination is never missing
426 |                 debug_assert!(!temp_path.exists());
427 |                 debug_assert!(source_path.exists());
428 |                 debug_assert!(dest_path.exists());
429 | 
430 |                 match run_mode {
431 |                     RunMode::Hardlink => {
432 |                         // Traditional hardlink behavior
433 |                         if let Err(err) = fs::hard_link(&source_path, &temp_path) {
434 |                             eprintln!("unable to hardlink {} to {} due to {}", source_path.display(), temp_path.display(), err);
435 |                             let _ = fs::remove_file(temp_path);
436 |                             return Err(err);
437 |                         }
438 |                         if let Err(err) = fs::rename(&temp_path, &dest_path) {
439 |                             eprintln!("unable to rename {} to {} due to {}", temp_path.display(), dest_path.display(), err);
440 |                             let _ = fs::remove_file(temp_path);
441 |                             return Err(err);
442 |                         }
443 |                         scan_listener.hardlinked(&dest_path, &source_path);
444 |                     },
445 |                     RunMode::Reflink => {
446 |                         // Only try reflink
447 |                         if let Err(err) = reflink(&source_path, &temp_path) {
448 |                             eprintln!("unable to reflink {} to {} due to {}", source_path.display(), temp_path.display(), err);
449 |                             let _ = fs::remove_file(temp_path);
450 |                             return Err(err);
451 |                         }
452 |                         if let Err(err) = fs::rename(&temp_path, &dest_path) {
453 |                             eprintln!("unable to rename {} to {} due to {}", temp_path.display(), dest_path.display(), err);
454 |                             let _ = fs::remove_file(temp_path);
455 |                             return Err(err);
456 |                         }
457 |                         scan_listener.reflinked(&dest_path, &source_path);
458 |                         stats.reflinks += 1;
459 |                         stats.bytes_saved_by_reflinks += file_size;
460 |                     },
461 |                     RunMode::ReflinkOrHardlink => { // try reflink first, fall back to hardlink; the rename step is the same either way
462 |                         let link_type = match reflink_or_hardlink(&source_path, &temp_path) {
463 |                             Ok(link_type) => link_type,
464 |                             Err(err) => {
465 |                                 eprintln!("unable to reflink or hardlink {} to {} due to {}", source_path.display(), temp_path.display(), err);
466 |                                 let _ = fs::remove_file(&temp_path);
467 |                                 return Err(err);
468 |                             },
469 |                         };
470 |                         if let Err(err) = fs::rename(&temp_path, &dest_path) {
471 |                             eprintln!("unable to rename {} to {} due to {}", temp_path.display(), dest_path.display(), err);
472 |                             let _ = fs::remove_file(&temp_path);
473 |                             return Err(err);
474 |                         }
475 |                         match link_type {
476 |                             LinkType::Reflink => {
477 |                                 scan_listener.reflinked(&dest_path, &source_path);
478 |                                 stats.reflinks += 1;
479 |                                 stats.bytes_saved_by_reflinks += file_size;
480 |                             },
481 |                             LinkType::Hardlink => scan_listener.hardlinked(&dest_path, &source_path),
482 |                         }
483 |                     },
484 |                     _ => unreachable!("dry-run modes were handled above"),
485 |                 }
486 | 
487 |                 debug_assert!(!temp_path.exists());
488 |                 debug_assert!(source_path.exists());
489 |                 debug_assert!(dest_path.exists());
490 |                 merged_paths.push(dest_path);
491 |             }
492 |         }
493 |         Ok(())
494 |     }
495 | 
496 |     #[must_use]
497 |     pub fn dupes(&self) -> Vec<Vec<FileSet>> {
498 |         self.by_content.values().map(|filesets| {
499 |             filesets.iter().map(|d| {
500 |                 let tmp = d.borrow();
501 |                 (*tmp).clone()
502 |             }).collect()
503 |         }).collect()
504 |     }
505 | }
506 | 
507 | 
--------------------------------------------------------------------------------